1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
5 #ifndef OPENCV_HAL_INTRIN_AVX_HPP
6 #define OPENCV_HAL_INTRIN_AVX_HPP
#define CV_SIMD256 1
#define CV_SIMD256_64F 1

namespace cv
{
16 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
18 ///////// Utils ////////////
20 inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
21 { return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
23 inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
24 { return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
26 inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
27 { return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
29 inline int _v_cvtsi256_si32(const __m256i& a)
30 { return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
32 inline __m256i _v256_shuffle_odd_64(const __m256i& v)
33 { return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
35 inline __m256d _v256_shuffle_odd_64(const __m256d& v)
36 { return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
template<int imm>
inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
{ return _mm256_permute2x128_si256(a, b, imm); }

template<int imm>
inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
{ return _mm256_permute2f128_ps(a, b, imm); }

template<int imm>
inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
{ return _mm256_permute2f128_pd(a, b, imm); }
50 template<int imm, typename _Tpvec>
51 inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
52 { return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
template<int imm>
inline __m256i _v256_permute4x64(const __m256i& a)
{ return _mm256_permute4x64_epi64(a, imm); }

template<int imm>
inline __m256d _v256_permute4x64(const __m256d& a)
{ return _mm256_permute4x64_pd(a, imm); }
62 template<int imm, typename _Tpvec>
63 inline _Tpvec v256_permute4x64(const _Tpvec& a)
64 { return _Tpvec(_v256_permute4x64<imm>(a.val)); }
66 inline __m128i _v256_extract_high(const __m256i& v)
67 { return _mm256_extracti128_si256(v, 1); }
69 inline __m128 _v256_extract_high(const __m256& v)
70 { return _mm256_extractf128_ps(v, 1); }
72 inline __m128d _v256_extract_high(const __m256d& v)
73 { return _mm256_extractf128_pd(v, 1); }
75 inline __m128i _v256_extract_low(const __m256i& v)
76 { return _mm256_castsi256_si128(v); }
78 inline __m128 _v256_extract_low(const __m256& v)
79 { return _mm256_castps256_ps128(v); }
81 inline __m128d _v256_extract_low(const __m256d& v)
82 { return _mm256_castpd256_pd128(v); }
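// Illustrative usage sketch (not part of the HAL API): the helpers above split a
// 256-bit register into its 128-bit halves and glue two halves back together.
//
//   __m128i lo = _mm_set1_epi32(1), hi = _mm_set1_epi32(2);
//   __m256i v  = _v256_combine(lo, hi);   // lanes: 1,1,1,1, 2,2,2,2
//   __m128i l  = _v256_extract_low(v);    // == lo (just a cast, no instruction)
//   __m128i h  = _v256_extract_high(v);   // == hi (vextracti128)
//   int first  = _v_cvtsi256_si32(v);     // == 1, the lowest 32-bit lane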
84 ///////// Types ////////////
88 typedef uchar lane_type;
92 explicit v_uint8x32(__m256i v) : val(v) {}
93 v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
94 uchar v4, uchar v5, uchar v6, uchar v7,
95 uchar v8, uchar v9, uchar v10, uchar v11,
96 uchar v12, uchar v13, uchar v14, uchar v15,
97 uchar v16, uchar v17, uchar v18, uchar v19,
98 uchar v20, uchar v21, uchar v22, uchar v23,
99 uchar v24, uchar v25, uchar v26, uchar v27,
100 uchar v28, uchar v29, uchar v30, uchar v31)
102 val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
103 (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
104 (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
105 (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
106 (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
107 (char)v28, (char)v29, (char)v30, (char)v31);
109 v_uint8x32() : val(_mm256_setzero_si256()) {}
110 uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
115 typedef schar lane_type;
116 enum { nlanes = 32 };
119 explicit v_int8x32(__m256i v) : val(v) {}
120 v_int8x32(schar v0, schar v1, schar v2, schar v3,
121 schar v4, schar v5, schar v6, schar v7,
122 schar v8, schar v9, schar v10, schar v11,
123 schar v12, schar v13, schar v14, schar v15,
124 schar v16, schar v17, schar v18, schar v19,
125 schar v20, schar v21, schar v22, schar v23,
126 schar v24, schar v25, schar v26, schar v27,
127 schar v28, schar v29, schar v30, schar v31)
129 val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
130 v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
131 v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
133 v_int8x32() : val(_mm256_setzero_si256()) {}
134 schar get0() const { return (schar)_v_cvtsi256_si32(val); }
139 typedef ushort lane_type;
140 enum { nlanes = 16 };
143 explicit v_uint16x16(__m256i v) : val(v) {}
144 v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
145 ushort v4, ushort v5, ushort v6, ushort v7,
146 ushort v8, ushort v9, ushort v10, ushort v11,
147 ushort v12, ushort v13, ushort v14, ushort v15)
149 val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
150 (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
151 (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
153 v_uint16x16() : val(_mm256_setzero_si256()) {}
154 ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
159 typedef short lane_type;
160 enum { nlanes = 16 };
163 explicit v_int16x16(__m256i v) : val(v) {}
164 v_int16x16(short v0, short v1, short v2, short v3,
165 short v4, short v5, short v6, short v7,
166 short v8, short v9, short v10, short v11,
167 short v12, short v13, short v14, short v15)
169 val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
170 v8, v9, v10, v11, v12, v13, v14, v15);
172 v_int16x16() : val(_mm256_setzero_si256()) {}
173 short get0() const { return (short)_v_cvtsi256_si32(val); }
178 typedef unsigned lane_type;
182 explicit v_uint32x8(__m256i v) : val(v) {}
183 v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
184 unsigned v4, unsigned v5, unsigned v6, unsigned v7)
186 val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
187 (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
189 v_uint32x8() : val(_mm256_setzero_si256()) {}
190 unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
195 typedef int lane_type;
199 explicit v_int32x8(__m256i v) : val(v) {}
200 v_int32x8(int v0, int v1, int v2, int v3,
201 int v4, int v5, int v6, int v7)
203 val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
205 v_int32x8() : val(_mm256_setzero_si256()) {}
206 int get0() const { return _v_cvtsi256_si32(val); }
211 typedef float lane_type;
215 explicit v_float32x8(__m256 v) : val(v) {}
216 v_float32x8(float v0, float v1, float v2, float v3,
217 float v4, float v5, float v6, float v7)
219 val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
221 v_float32x8() : val(_mm256_setzero_ps()) {}
222 float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
227 typedef uint64 lane_type;
231 explicit v_uint64x4(__m256i v) : val(v) {}
232 v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
233 { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
v_uint64x4() : val(_mm256_setzero_si256()) {}
uint64 get0() const
{ return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); }
241 typedef int64 lane_type;
245 explicit v_int64x4(__m256i v) : val(v) {}
246 v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
247 { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
248 v_int64x4() : val(_mm256_setzero_si256()) {}
249 int64 get0() const { return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); }
254 typedef double lane_type;
258 explicit v_float64x4(__m256d v) : val(v) {}
259 v_float64x4(double v0, double v1, double v2, double v3)
260 { val = _mm256_setr_pd(v0, v1, v2, v3); }
261 v_float64x4() : val(_mm256_setzero_pd()) {}
262 double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
267 typedef short lane_type;
268 enum { nlanes = 16 };
271 explicit v_float16x16(__m256i v) : val(v) {}
272 v_float16x16(short v0, short v1, short v2, short v3,
273 short v4, short v5, short v6, short v7,
274 short v8, short v9, short v10, short v11,
275 short v12, short v13, short v14, short v15)
277 val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
279 v_float16x16() : val(_mm256_setzero_si256()) {}
280 short get0() const { return (short)_v_cvtsi256_si32(val); }
282 inline v_float16x16 v256_setzero_f16() { return v_float16x16(_mm256_setzero_si256()); }
283 inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1_epi16(val)); }
285 //////////////// Load and store operations ///////////////
287 #define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \
288 inline _Tpvec v256_load(const _Tp* ptr) \
289 { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \
290 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
291 { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \
292 inline _Tpvec v256_load_low(const _Tp* ptr) \
294 __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \
295 return _Tpvec(_mm256_castsi128_si256(v128)); \
297 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
299 __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \
300 __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \
301 return _Tpvec(_v256_combine(vlo, vhi)); \
303 inline void v_store(_Tp* ptr, const _Tpvec& a) \
304 { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
305 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
306 { _mm256_store_si256((__m256i*)ptr, a.val); } \
307 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
308 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
309 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
310 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
312 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar)
313 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar)
314 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
315 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short)
316 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned)
317 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int)
318 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64)
319 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
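// Usage sketch (illustration only): the macro above generates the unaligned and
// aligned load/store entry points for every integer vector type, e.g.
//
//   uchar buf[32];
//   v_uint8x32 a = v256_load(buf);                   // unaligned 256-bit load
//   v_uint8x32 b = v256_load_halves(buf, buf + 16);  // low half from ptr0, high half from ptr1
//   v_store(buf, a);                                 // unaligned store of all 32 lanes
//   v_store_low(buf, a);                             // store only the low 16 lanes
//
// The *_aligned variants additionally require 32-byte aligned pointers.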
321 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
322 inline _Tpvec v256_load(const _Tp* ptr) \
323 { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \
324 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
325 { return _Tpvec(_mm256_load_##suffix(ptr)); } \
326 inline _Tpvec v256_load_low(const _Tp* ptr) \
328 return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \
329 (_mm_loadu_##suffix(ptr))); \
331 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
333 halfreg vlo = _mm_loadu_##suffix(ptr0); \
334 halfreg vhi = _mm_loadu_##suffix(ptr1); \
335 return _Tpvec(_v256_combine(vlo, vhi)); \
337 inline void v_store(_Tp* ptr, const _Tpvec& a) \
338 { _mm256_storeu_##suffix(ptr, a.val); } \
339 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
340 { _mm256_store_##suffix(ptr, a.val); } \
341 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
342 { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
343 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
344 { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
346 OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128)
347 OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
349 #define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
350 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
351 { return _Tpvec(cast(a.val)); }
353 #define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
354 inline _Tpvec v256_setzero_##suffix() \
355 { return _Tpvec(_mm256_setzero_si256()); } \
356 inline _Tpvec v256_setall_##suffix(_Tp v) \
357 { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
358 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
359 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
360 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
361 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
362 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
363 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
364 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
365 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
366 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \
367 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
369 OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char)
370 OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char)
371 OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short)
372 OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short)
373 OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int)
374 OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int)
375 OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64)
376 OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
378 #define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
379 inline _Tpvec v256_setzero_##suffix() \
380 { return _Tpvec(_mm256_setzero_##zsuffix()); } \
381 inline _Tpvec v256_setall_##suffix(_Tp v) \
382 { return _Tpvec(_mm256_set1_##zsuffix(v)); } \
383 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
384 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
385 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
386 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \
387 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
388 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \
389 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
390 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast)
392 OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps)
393 OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
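// Illustrative example (not part of the API): v256_setall_* broadcasts a scalar and
// v_reinterpret_as_* is a bit-exact type pun between vectors of equal width.
//
//   v_int32x8   ones  = v256_setall_s32(1);
//   v_float32x8 fbits = v_reinterpret_as_f32(ones);  // bit pattern 0x00000001 in every lane
//   v_int32x8   back  = v_reinterpret_as_s32(fbits); // round-trips to the original integers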
inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
{ return a; }
inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
{ return v_float32x8(_mm256_castpd_ps(a.val)); }

inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
{ return a; }
inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
{ return v_float64x4(_mm256_castps_pd(a.val)); }
405 inline v_float16x16 v256_load_f16(const short* ptr)
406 { return v_float16x16(_mm256_loadu_si256((const __m256i*)ptr)); }
407 inline v_float16x16 v256_load_f16_aligned(const short* ptr)
408 { return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); }
410 inline void v_store(short* ptr, const v_float16x16& a)
411 { _mm256_storeu_si256((__m256i*)ptr, a.val); }
412 inline void v_store_aligned(short* ptr, const v_float16x16& a)
413 { _mm256_store_si256((__m256i*)ptr, a.val); }
416 /*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
417 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
418 { return _Tpvec(perm(a.val, b.val, 0x20)); } \
419 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
420 { return _Tpvec(perm(a.val, b.val, 0x31)); } \
421 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
422 _Tpvec& c, _Tpvec& d) \
423 { c = v_combine_low(a, b); d = v_combine_high(a, b); }
425 #define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \
426 OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \
427 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \
428 _Tpvec& b0, _Tpvec& b1) \
430 __m256i v0 = _v256_shuffle_odd_64(a0.val); \
431 __m256i v1 = _v256_shuffle_odd_64(a1.val); \
432 b0.val = _mm256_unpacklo_##suffix(v0, v1); \
433 b1.val = _mm256_unpackhi_##suffix(v0, v1); \
436 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8)
437 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8)
438 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
439 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16)
440 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32)
441 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32)
442 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64)
443 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64)
444 OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
445 OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
447 inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
449 __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
450 __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
451 v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
454 inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
__m256d v0 = _v256_shuffle_odd_64(a0.val);
__m256d v1 = _v256_shuffle_odd_64(a1.val);
b0.val = _mm256_unpacklo_pd(v0, v1);
b1.val = _mm256_unpackhi_pd(v0, v1);
}
*/
462 //////////////// Variant Value reordering ///////////////
465 #define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \
466 inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
467 { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \
468 inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
469 { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
471 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8)
472 OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8)
473 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
474 OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16)
475 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32)
476 OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32)
477 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64)
478 OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64)
479 OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
480 OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \
template<int m> \
inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
488 OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
489 OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16)
490 OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32)
491 OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32)
492 OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
493 OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
template<int m>
inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
{
enum {M0 = m};
enum {M1 = (M0 | (M0 << 2)) & 0x33};
enum {M2 = (M1 | (M1 << 1)) & 0x55};
enum {MM = M2 | (M2 << 1)};
return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
}
template<int m>
inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
// todo: emulate 64-bit shuffle
#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \
template<int m> \
inline _Tpvec v256_shuffle(const _Tpvec& a) \
{ return _Tpvec(_mm256_##intrin(a.val, m)); }
515 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32)
516 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32)
517 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
518 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
520 template<typename _Tpvec>
521 inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
523 ab0 = v256_unpacklo(a, b);
524 ab1 = v256_unpackhi(a, b);
527 template<typename _Tpvec>
528 inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
529 { return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
531 inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
532 { return v256_blend<0xf0>(a, b); }
534 inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
535 { return v256_blend<0xc>(a, b); }
537 template<typename _Tpvec>
538 inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
539 { return v256_permute2x128<0x21>(a, b); }
541 template<typename _Tpvec>
542 inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
543 { return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
544 inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
545 { return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
546 // todo: emulate float32
548 template<typename _Tpvec>
549 inline _Tpvec v256_swap_halves(const _Tpvec& a)
550 { return v256_permute2x128<1>(a, a); }
552 template<typename _Tpvec>
553 inline _Tpvec v256_reverse_64(const _Tpvec& a)
554 { return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
557 #define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \
558 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
559 { return v256_permute2x128<0x20>(a, b); } \
560 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
561 { return v256_permute2x128<0x31>(a, b); } \
562 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
563 _Tpvec& c, _Tpvec& d) \
565 _Tpvec a1b0 = v256_alignr_128(a, b); \
566 c = v256_combine_diagonal(a, a1b0); \
567 d = v256_combine_diagonal(a1b0, b); \
569 inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
570 _Tpvec& ab0, _Tpvec& ab1) \
572 _Tpvec ab0ab2, ab1ab3; \
573 v256_zip(a, b, ab0ab2, ab1ab3); \
574 v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
577 OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
578 OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
579 OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
580 OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
581 OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
582 OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
583 OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
584 OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
585 OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
586 OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
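// Semantics sketch (illustration only) for a = {a0..a7}, b = {b0..b7} of v_uint32x8:
//
//   v_combine_low(a, b)  -> {a0,a1,a2,a3, b0,b1,b2,b3}
//   v_combine_high(a, b) -> {a4,a5,a6,a7, b4,b5,b6,b7}
//   v_zip(a, b, x, y)    -> x = {a0,b0,a1,b1, a2,b2,a3,b3}
//                           y = {a4,b4,a5,b5, a6,b6,a7,b7}
//
// The v256_alignr_128 / v256_combine_diagonal step in v_recombine compensates for
// AVX2 unpack instructions operating independently on each 128-bit lane.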
588 ////////// Arithmetic, bitwise and comparison operations /////////
590 /* Element-wise binary and unary operations */
593 #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
594 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
595 { return _Tpvec(intrin(a.val, b.val)); } \
596 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
597 { a.val = intrin(a.val, b.val); return a; }
599 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
600 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
601 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
602 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
603 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
604 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
605 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16)
606 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
607 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
608 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16)
609 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
610 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
611 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
612 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
613 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
614 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
615 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
616 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
617 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
618 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
620 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
621 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
622 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
623 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
624 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
625 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
626 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
627 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
629 inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
630 v_int32x8& c, v_int32x8& d)
632 v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
v_int16x16 v0, v1;
v_zip(a * b, vhi, v0, v1);
637 c = v_reinterpret_as_s32(v0);
638 d = v_reinterpret_as_s32(v1);
641 inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
642 v_uint32x8& c, v_uint32x8& d)
644 v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
v_uint16x16 v0, v1;
v_zip(a * b, vhi, v0, v1);
649 c = v_reinterpret_as_u32(v0);
650 d = v_reinterpret_as_u32(v1);
653 inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
654 v_uint64x4& c, v_uint64x4& d)
656 __m256i v0 = _mm256_mul_epu32(a.val, b.val);
657 __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
658 v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
/** Non-saturating arithmetic **/
663 #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
664 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
665 { return _Tpvec(intrin(a.val, b.val)); }
667 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
668 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
669 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
670 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
671 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
672 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
673 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
674 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
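// Note with a small example (illustration only): for 8- and 16-bit lanes the
// overloaded operators above are saturating (adds/subs forms), while the
// v_add_wrap / v_sub_wrap functions wrap around modulo 2^N:
//
//   v_uint8x32 x = v256_setall_u8(250), y = v256_setall_u8(10);
//   v_uint8x32 s = x + y;             // every lane == 255 (saturated)
//   v_uint8x32 w = v_add_wrap(x, y);  // every lane == 4   (wrapped)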
676 /** Bitwise shifts **/
677 #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
678 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
679 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
680 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
681 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
682 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
683 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
684 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
685 { return _Tpsvec(srai(a.val, imm)); } \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ return _Tpsvec(srai(a.val, imm)); }
699 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
700 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32)
702 inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
704 __m256i d = _mm256_set1_epi64x((int64)1 << 63);
705 __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
706 return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
708 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
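// How the 64-bit arithmetic shift is emulated above (AVX2 has no _mm256_srai_epi64):
// adding the bias d = 2^63 maps signed values to unsigned ones with the same ordering,
// so a logical shift of (a + d) minus the shifted bias reproduces sign extension.
// Worked example (illustration only), imm = 1, a = -2:
//
//   a + d        = 0x7FFFFFFFFFFFFFFE
//   (a + d) >> 1 = 0x3FFFFFFFFFFFFFFF
//   d >> 1       = 0x4000000000000000
//   difference   = 0xFFFFFFFFFFFFFFFF = -1  ==  -2 >> 1 (arithmetic)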
711 /** Bitwise logic **/
712 #define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
713 OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
714 OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
715 OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
716 inline _Tpvec operator ~ (const _Tpvec& a) \
717 { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
719 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
720 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1))
721 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
722 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1))
723 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1))
724 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1))
725 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1))
726 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1))
727 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
728 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
731 #define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \
732 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
733 { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
735 OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8)
736 OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8)
737 OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
738 OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8)
739 OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8)
740 OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8)
741 OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
742 OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
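// Usage sketch (illustration only): v_select is a per-lane "mask ? a : b", where the
// mask lanes are expected to be all-ones or all-zeros, as produced by the comparison
// operators defined below.
//
//   v_int32x8 a = v256_setall_s32(1), b = v256_setall_s32(2);
//   v_int32x8 m = a > b;               // all lanes zero here
//   v_int32x8 r = v_select(m, a, b);   // all lanes == 2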
745 #define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
746 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
747 { return ~(a == b); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return b > a; } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a < b); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return b >= a; }
755 #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
756 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
757 { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
758 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
760 __m256i smask = _mm256_set1_##suffix(sbit); \
761 return _Tpuvec(_mm256_cmpgt_##suffix( \
762 _mm256_xor_si256(a.val, smask), \
763 _mm256_xor_si256(b.val, smask))); \
765 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
766 { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
767 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
768 { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
769 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
770 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
772 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128)
773 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
774 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
776 #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
777 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
778 { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
779 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
780 { return ~(a == b); }
782 OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
783 OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
785 #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
786 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
787 { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
789 #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
790 OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
791 OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
792 OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
793 OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
794 OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
795 OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
797 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
798 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
801 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
802 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)
803 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8)
804 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8)
805 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
806 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
807 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16)
808 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16)
809 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32)
810 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32)
811 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32)
812 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32)
813 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
814 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
815 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
816 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
820 inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
822 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
828 case 16: return v_uint8x32(swap);
831 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm));
832 if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm));
838 inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
840 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
846 case 16: return v_uint8x32(swap);
849 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
850 if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16));
856 inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
// per the CASE/ESAC pseudo-code of _mm256_permute2x128_si256: when selector bit 3 is set, that 128-bit lane is zeroed
860 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
867 res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm);
869 res.val = _mm256_slli_si256(swapz, imm - 16);
876 inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
// per the CASE/ESAC pseudo-code of _mm256_permute2x128_si256: when selector bit 3 is set, that 128-bit lane is zeroed
880 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
887 res.val = _mm256_alignr_epi8(swapz, a.val, imm);
889 res.val = _mm256_srli_si256(swapz, imm - 16);
#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \
template<int imm> \
inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
{ \
const int w = sizeof(typename _Tpvec::lane_type); \
v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a), \
v_reinterpret_as_u8(b)); \
return _Tpvec(cast(ret.val)); \
} \
template<int imm> \
inline _Tpvec intrin(const _Tpvec& a) \
{ \
const int w = sizeof(typename _Tpvec::lane_type); \
v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a)); \
return _Tpvec(cast(ret.val)); \
}
912 #define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \
913 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
914 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
916 OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
917 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
918 OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
919 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
920 OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
921 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
922 OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
924 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps)
925 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
926 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
927 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
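// Semantics sketch (illustration only): the rotate helpers shift whole lanes, not bits.
// For v = {v0..v7} of v_uint32x8:
//
//   v_rotate_right<2>(v)    -> {v2,v3,v4,v5,v6,v7, 0, 0}
//   v_rotate_left<2>(v)     -> { 0, 0,v0,v1,v2,v3,v4,v5}
//   v_rotate_right<2>(a, b) -> {a2,...,a7, b0,b1}   // b supplies the vacated lanes
//
// The _CAST variants simply forward to the byte-wise v_uint8x32 implementation with
// imm scaled by the lane size.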
929 ////////// Reduce and mask /////////
932 #define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
933 inline sctype v_reduce_##func(const _Tpvec& a) \
935 __m128i v0 = _v256_extract_low(a.val); \
936 __m128i v1 = _v256_extract_high(a.val); \
937 v0 = intrin(v0, v1); \
938 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
939 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
940 v0 = intrin(v0, _mm_srli_si128(v0, 2)); \
941 return (sctype) _mm_cvtsi128_si32(v0); \
944 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
945 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16)
946 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
947 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16)
949 #define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
950 inline sctype v_reduce_##func(const _Tpvec& a) \
952 __m128i v0 = _v256_extract_low(a.val); \
953 __m128i v1 = _v256_extract_high(a.val); \
954 v0 = intrin(v0, v1); \
955 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
956 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
957 return (sctype) _mm_cvtsi128_si32(v0); \
960 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
961 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32)
962 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
963 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
965 #define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \
966 inline float v_reduce_##func(const v_float32x8& a) \
968 __m128 v0 = _v256_extract_low(a.val); \
969 __m128 v1 = _v256_extract_high(a.val); \
970 v0 = intrin(v0, v1); \
971 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
972 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \
973 return _mm_cvtss_f32(v0); \
976 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
977 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
979 inline ushort v_reduce_sum(const v_uint16x16& a)
981 __m128i a0 = _v256_extract_low(a.val);
982 __m128i a1 = _v256_extract_high(a.val);
984 __m128i s0 = _mm_adds_epu16(a0, a1);
985 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
986 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
987 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
989 return (ushort)_mm_cvtsi128_si32(s0);
992 inline short v_reduce_sum(const v_int16x16& a)
994 __m256i s0 = _mm256_hadds_epi16(a.val, a.val);
995 s0 = _mm256_hadds_epi16(s0, s0);
996 s0 = _mm256_hadds_epi16(s0, s0);
998 __m128i s1 = _v256_extract_high(s0);
999 s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
1001 return (short)_mm_cvtsi128_si32(s1);
1004 inline int v_reduce_sum(const v_int32x8& a)
1006 __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
1007 s0 = _mm256_hadd_epi32(s0, s0);
1009 __m128i s1 = _v256_extract_high(s0);
1010 s1 = _mm_add_epi32(_v256_extract_low(s0), s1);
1012 return _mm_cvtsi128_si32(s1);
1015 inline unsigned v_reduce_sum(const v_uint32x8& a)
1016 { return v_reduce_sum(v_reinterpret_as_s32(a)); }
1018 inline float v_reduce_sum(const v_float32x8& a)
1020 __m256 s0 = _mm256_hadd_ps(a.val, a.val);
1021 s0 = _mm256_hadd_ps(s0, s0);
1023 __m128 s1 = _v256_extract_high(s0);
1024 s1 = _mm_add_ps(_v256_extract_low(s0), s1);
1026 return _mm_cvtss_f32(s1);
1029 inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1030 const v_float32x8& c, const v_float32x8& d)
1032 __m256 ab = _mm256_hadd_ps(a.val, b.val);
1033 __m256 cd = _mm256_hadd_ps(c.val, d.val);
1034 return v_float32x8(_mm256_hadd_ps(ab, cd));
1038 #define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
1039 inline v_uint32x8 v_popcount(const _Tpvec& a) \
1041 const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
1042 const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
1043 const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
1044 v_uint32x8 p = v_reinterpret_as_u32(a); \
1045 p = ((p >> 1) & m1) + (p & m1); \
1046 p = ((p >> 2) & m2) + (p & m2); \
1047 p = ((p >> 4) & m4) + (p & m4); \
p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
return p; \
}
1052 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
1053 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
1054 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
1055 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
1056 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
1057 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
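// The SWAR popcount above, traced on a single byte (illustration only):
//
//   x = 0b01101101                                  // 5 set bits
//   p = ((x >> 1) & 0x55) + (x & 0x55) = 0b01011001 // 2-bit sums 1,1,2,1
//   p = ((p >> 2) & 0x33) + (p & 0x33) = 0b00100011 // 4-bit sums 2,3
//   p = ((p >> 4) & 0x0f) + (p & 0x0f) = 0b00000101 // 5
//
// The final _mm256_sad_epu8 against zero then accumulates the per-byte counts into
// one total per 64-bit group of the result.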
1060 inline int v_signmask(const v_int8x32& a)
1061 { return _mm256_movemask_epi8(a.val); }
1062 inline int v_signmask(const v_uint8x32& a)
1063 { return v_signmask(v_reinterpret_as_s8(a)); }
1065 inline int v_signmask(const v_int16x16& a)
1067 v_int8x32 v = v_int8x32(_mm256_packs_epi16(a.val, a.val));
1068 return v_signmask(v) & 255;
1070 inline int v_signmask(const v_uint16x16& a)
1071 { return v_signmask(v_reinterpret_as_s16(a)); }
1073 inline int v_signmask(const v_int32x8& a)
1075 __m256i a16 = _mm256_packs_epi32(a.val, a.val);
1076 v_int8x32 v = v_int8x32(_mm256_packs_epi16(a16, a16));
1077 return v_signmask(v) & 15;
1079 inline int v_signmask(const v_uint32x8& a)
1080 { return v_signmask(v_reinterpret_as_s32(a)); }
1082 inline int v_signmask(const v_float32x8& a)
1083 { return _mm256_movemask_ps(a.val); }
1084 inline int v_signmask(const v_float64x4& a)
1085 { return _mm256_movemask_pd(a.val); }
1088 #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \
1089 inline bool v_check_all(const _Tpvec& a) \
1091 int mask = v_signmask(v_reinterpret_as_s8(a)); \
1092 return and_op(mask, allmask) == allmask; \
1094 inline bool v_check_any(const _Tpvec& a) \
1096 int mask = v_signmask(v_reinterpret_as_s8(a)); \
1097 return and_op(mask, allmask) != 0; \
1100 OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, OPENCV_HAL_1ST, -1)
1101 OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, OPENCV_HAL_1ST, -1)
1102 OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaa)
1103 OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16, OPENCV_HAL_AND, (int)0xaaaa)
1104 OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, OPENCV_HAL_AND, (int)0x8888)
1105 OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, OPENCV_HAL_AND, (int)0x8888)
1107 #define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \
1108 inline bool v_check_all(const _Tpvec& a) \
1110 int mask = v_signmask(a); \
1111 return mask == allmask; \
1113 inline bool v_check_any(const _Tpvec& a) \
int mask = v_signmask(a); \
return mask != 0; \
}
1119 OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255)
1120 OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15)
1123 ////////// Other math /////////
1125 /** Some frequent operations **/
1126 #define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
1127 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1128 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1129 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1130 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1131 inline _Tpvec v_sqrt(const _Tpvec& x) \
1132 { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
1133 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1134 { return v_fma(a, a, b * b); } \
1135 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1136 { return v_sqrt(v_fma(a, a, b*b)); }
1138 OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
1139 OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
1141 inline v_float32x8 v_invsqrt(const v_float32x8& x)
1143 v_float32x8 half = x * v256_setall_f32(0.5);
1144 v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
1145 // todo: _mm256_fnmsub_ps
t *= v256_setall_f32(1.5) - ((t * t) * half);
return t;
}
1150 inline v_float64x4 v_invsqrt(const v_float64x4& x)
1152 return v256_setall_f64(1.) / v_sqrt(x);
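// Background note (illustration only): the float path refines the hardware rsqrt
// estimate with one Newton-Raphson step,
//
//   t1 = t0 * (1.5 - 0.5 * x * t0 * t0),
//
// roughly doubling the ~12 correct bits of _mm256_rsqrt_ps to near full float
// precision. There is no packed-double rsqrt approximation, so the f64 overload
// simply computes 1 / sqrt(x).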
1155 /** Absolute values **/
1156 #define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \
1157 inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1158 { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
1160 OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8)
1161 OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
1162 OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
1164 inline v_float32x8 v_abs(const v_float32x8& x)
1165 { return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
1166 inline v_float64x4 v_abs(const v_float64x4& x)
1167 { return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
1169 /** Absolute difference **/
1170 inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1171 { return v_add_wrap(a - b, b - a); }
1172 inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1173 { return v_add_wrap(a - b, b - a); }
1174 inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1175 { return v_max(a, b) - v_min(a, b); }
1177 inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1179 v_int8x32 d = v_sub_wrap(a, b);
1180 v_int8x32 m = a < b;
1181 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1184 inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1185 { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1187 inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1189 v_int32x8 d = a - b;
1190 v_int32x8 m = a < b;
1191 return v_reinterpret_as_u32((d ^ m) - m);
1194 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1195 { return v_abs(a - b); }
1197 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1198 { return v_abs(a - b); }
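// Why the signed integer overloads above use the xor/sub trick (illustration only):
// m = (a < b) is all-ones exactly in the lanes where d = a - b is negative, and
// (d ^ m) - m negates those lanes, i.e. |d| without branching. E.g. d = -3, m = -1:
// (d ^ m) = 2 and 2 - (-1) = 3. The int8 version uses v_sub_wrap so that differences
// larger than 127 survive the reinterpret to unsigned instead of saturating.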
1200 ////////// Conversions /////////
1203 inline v_int32x8 v_round(const v_float32x8& a)
1204 { return v_int32x8(_mm256_cvtps_epi32(a.val)); }
1206 inline v_int32x8 v_round(const v_float64x4& a)
1207 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
1209 inline v_int32x8 v_trunc(const v_float32x8& a)
1210 { return v_int32x8(_mm256_cvttps_epi32(a.val)); }
1212 inline v_int32x8 v_trunc(const v_float64x4& a)
1213 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
1215 inline v_int32x8 v_floor(const v_float32x8& a)
1216 { return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
1218 inline v_int32x8 v_floor(const v_float64x4& a)
1219 { return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
1221 inline v_int32x8 v_ceil(const v_float32x8& a)
1222 { return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
1224 inline v_int32x8 v_ceil(const v_float64x4& a)
1225 { return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
1228 inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1229 { return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
1231 inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1232 { return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
1234 inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1236 __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
1237 return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1));
1240 inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1241 { return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
1243 inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1244 { return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
1246 inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1247 { return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
1249 inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1250 { return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
1253 inline v_float32x8 v_cvt_f32(const v_float16x16& a)
1254 { return v_float32x8(_mm256_cvtph_ps(_v256_extract_low(a.val))); }
1256 inline v_float32x8 v_cvt_f32_high(const v_float16x16& a)
1257 { return v_float32x8(_mm256_cvtph_ps(_v256_extract_high(a.val))); }
1259 inline v_float16x16 v_cvt_f16(const v_float32x8& a, const v_float32x8& b)
1261 __m128i ah = _mm256_cvtps_ph(a.val, 0), bh = _mm256_cvtps_ph(b.val, 0);
1262 return v_float16x16(_mm256_inserti128_si256(_mm256_castsi128_si256(ah), bh, 1));
1266 ////////////// Lookup table access ////////////////////
1268 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1270 int CV_DECL_ALIGNED(32) idx[8];
1271 v_store_aligned(idx, idxvec);
1272 return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1273 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1276 inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1278 int CV_DECL_ALIGNED(32) idx[8];
1279 v_store_aligned(idx, idxvec);
1280 return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1281 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1284 inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1286 int CV_DECL_ALIGNED(32) idx[8];
1287 v_store_aligned(idx, idxvec);
1288 return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1291 inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1293 int CV_DECL_ALIGNED(32) idx[8];
1294 v_store_aligned(idx, idxvec);
1295 __m128 z = _mm_setzero_ps();
1296 __m128 xy01, xy45, xy23, xy67;
1297 xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
1298 xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
1299 xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
1300 xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
1301 __m256 xy0145 = _v256_combine(xy01, xy45);
1302 xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
1303 xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
1304 xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
1305 xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
1306 __m256 xy2367 = _v256_combine(xy23, xy67);
1308 __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
1309 __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
1311 x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
1312 y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
1315 inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1317 int CV_DECL_ALIGNED(32) idx[4];
1318 v_store_low(idx, idxvec);
1319 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
1320 __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
1321 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
1322 __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
1323 __m256d xy02 = _v256_combine(xy0, xy2);
1324 __m256d xy13 = _v256_combine(xy1, xy3);
1326 x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
1327 y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
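// Usage sketch (illustration only): the LUT helpers gather scattered elements through
// an index vector that is first spilled to an aligned scratch buffer.
//
//   float table[256];                       // filled elsewhere
//   v_int32x8   idx = v256_setall_s32(3);   // any eight valid indices
//   v_float32x8 v   = v_lut(table, idx);    // v[i] == table[idx[i]]
//
// v_lut_deinterleave expects (x, y) pairs stored at tab[idx[i]] and tab[idx[i] + 1]
// and returns the x components and y components in two separate vectors.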
1330 ////////// Matrix operations /////////
1332 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1333 { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
1335 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1336 { return v_dotprod(a, b) + c; }
1338 #define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
1339 v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1341 inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
1342 const v_float32x8& m1, const v_float32x8& m2,
1343 const v_float32x8& m3)
1345 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1346 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1347 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1348 v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
1349 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1352 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
1353 const v_float32x8& m1, const v_float32x8& m2,
1354 const v_float32x8& a)
1356 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1357 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1358 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1359 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1362 #define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1363 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1364 const _Tpvec& a2, const _Tpvec& a3, \
1365 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1367 __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \
1368 __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \
1369 __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \
1370 __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \
1371 b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \
1372 b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \
1373 b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \
1374 b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \
1377 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1378 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1379 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
1381 //////////////// Value reordering ///////////////
1384 #define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1385 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1387 b0.val = intrin(_v256_extract_low(a.val)); \
1388 b1.val = intrin(_v256_extract_high(a.val)); \
1390 inline _Tpwvec v256_load_expand(const _Tp* ptr) \
1392 __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
1393 return _Tpwvec(intrin(a)); \
1396 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16)
1397 OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16)
1398 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32)
1399 OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32)
1400 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64)
1401 OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64)
1403 #define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \
1404 inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
1406 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
1407 return _Tpvec(intrin(a)); \
1410 OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
1411 OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32)
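// Usage sketch (illustration only): v_expand widens every lane to twice its width,
// and the _q variants widen four times (e.g. uchar -> unsigned).
//
//   uchar src[32];
//   v_uint8x32 a = v256_load(src);
//   v_uint16x16 lo, hi;
//   v_expand(a, lo, hi);                     // lo == widened src[0..15], hi == src[16..31]
//   v_uint32x8 q = v256_load_expand_q(src);  // first 8 bytes widened to 32-bit lanes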
1415 inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
1416 { return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
1418 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
1419 { return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); }
1421 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
1422 { return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); }
1424 inline void v_pack_store(schar* ptr, const v_int16x16& a)
1425 { v_store_low(ptr, v_pack(a, a)); }
1427 inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
1428 { v_store_low(ptr, v_pack(a, a)); }
1430 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
1431 { v_store_low(ptr, v_pack_u(a, a)); }
1433 template<int n> inline
1434 v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
1436 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
1437 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
1438 return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
1439 v_reinterpret_as_s16((b + delta) >> n));
1442 template<int n> inline
1443 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
1445 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
1446 v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
1449 template<int n> inline
1450 v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
1452 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1453 return v_pack_u((a + delta) >> n, (b + delta) >> n);
1456 template<int n> inline
1457 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
1459 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1460 v_pack_u_store(ptr, (a + delta) >> n);
1463 template<int n> inline
1464 v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
1466 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1467 return v_pack((a + delta) >> n, (b + delta) >> n);
1470 template<int n> inline
1471 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
1473 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1474 v_pack_store(ptr, (a + delta) >> n);
1478 inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
1479 { return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
1481 inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
1482 { return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
1484 inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
1485 { return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); }
1487 inline void v_pack_store(short* ptr, const v_int32x8& a)
1488 { v_store_low(ptr, v_pack(a, a)); }
1490 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
1491 { v_store_low(ptr, v_pack(a, a)); }
1493 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
1494 { v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{
    // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
                    v_reinterpret_as_s32((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}

template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
// Non-saturating pack
inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
{
    __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
    __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
    __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3
    return v_uint32x8(_v256_shuffle_odd_64(ab));
}
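// The 64-bit pack is truncating: only the low 32 bits of every 64-bit lane survive
// (reduction modulo 2^32, no saturation). A usage sketch, assuming AVX2 is available:
//   v_uint64x4 a = v256_setall_u64(0x100000005ULL), b = v256_setall_u64(7);
//   v_uint32x8 r = v_pack(a, b);   // lanes: 5, 5, 5, 5, 7, 7, 7, 7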
inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
{
    __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
    v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
}

inline void v_pack_store(int* ptr, const v_int64x4& b)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }

template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{
    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{
    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
// implemented above, together with the load and store operations
#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec)                    \
    template<int s>                                            \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)  \
    { return v_rotate_right<s>(a, b); }
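// v_extract<s>(a, b) returns the lanes of the virtual concatenation (a, b) starting at
// index s, i.e. { a[s], ..., a[nlanes-1], b[0], ..., b[s-1] }. For example, with
// v_uint32x8 and s = 3 the result is { a3, a4, a5, a6, a7, b0, b1, b2 }.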
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
// implemented above, together with the load and store operations
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix)                    \
    inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b)          \
    { return v256_load_deinterleave_##suffix(ptr, a, b); }                         \
    inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)     \
    { return v256_store_interleave_2ch(ptr, a, b); }

#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix)        \
    inline void v_load_deinterleave                                    \
    (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)                  \
    { return v256_load_deinterleave_##suffix(ptr, a, b, c); }          \
    inline void v_store_interleave                                     \
    (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)      \
    { return v256_store_interleave_##suffix(ptr, a, b, c); }

#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix)                        \
    inline void v_load_deinterleave                                                    \
    (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)                       \
    { return v256_load_deinterleave_##suffix(ptr, a, b, c, d); }                       \
    inline void v_store_interleave                                                     \
    (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)     \
    { return v256_store_interleave_##suffix(ptr, a, b, c, d); }

#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix)  \
    OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix)        \
    OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix)

#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix)    \
    OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix)        \
    OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix)
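// The `suffix` argument selects the lane-width specific helpers defined below:
// l4 = 4 lanes of 64 bits, l8 = 8 lanes of 32 bits, l16 = 16 lanes of 16 bits,
// l32 = 32 lanes of 8 bits per 256-bit register.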
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)
{
    _Tpvec ab0, ab1;
    v_zip(a, b, ab0, ab1);
    v_store(ptr, ab0);
    v_store(ptr + _Tpvec::nlanes, ab1);
}
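// 2-channel interleave in memory: v_zip produces ab0 = { a0, b0, a1, b1, ... } from the
// lower halves of the inputs and ab1 from the upper halves, so the two stores write the
// sequence a0 b0 a1 b1 ... a(n-1) b(n-1) contiguously starting at ptr.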
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
{
    _Tpvec ab0 = v256_load(ptr);
    _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes);

    _Tpvec ab00, ab11;
    v_recombine(ab0, ab1, ab00, ab11);
    v256_zip(ab00, ab11, a, b);
}

template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
{
    _Tpvec abc0 = v256_load(ptr);
    _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
    _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2);

    _Tpvec ab0 = v256_combine_diagonal(abc0, abc1);
    _Tpvec bc1 = v256_combine_diagonal(abc1, abc2);
    _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0));

    a = v256_unpacklo(ab0, ac1);
    c = v256_unpackhi(ac1, bc1);
    b = v256_alignr_64(bc1, ab0);
}

template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
    _Tpvec ab0 = v256_unpacklo(a, b);
    _Tpvec bc1 = v256_unpackhi(b, c);
    _Tpvec ca10 = v256_swap_halves(v256_blend<0xa>(c, a));

    v_store(ptr, v256_combine_diagonal(ab0, ca10));
    v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0));
    v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1));
}

template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
    _Tpvec abcd0 = v256_load(ptr);
    _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes);
    _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2);
    _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3);

    _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2);
    _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3);

    _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0);
    _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1);
    _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2);
    _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3);

    v256_zip(ab0, ab1, a, b);
    v256_zip(cd0, cd1, c, d);
}

template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
{
    _Tpvec ab0, ab1, cd0, cd1;
    v256_zip(a, b, ab0, ab1);
    v256_zip(c, d, cd0, cd1);

    _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0);
    _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1);

    v_store(ptr, v256_combine_diagonal(ab0, ab0cd0));
    v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1));
    v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0));
    v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1));
}
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4)
inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b)
{
    v_float32x8 ab0 = v256_load(ptr);
    v_float32x8 ab1 = v256_load(ptr + 8);

    v_float32x8 ab0ab2, ab1ab3;
    v_recombine(ab0, ab1, ab0ab2, ab1ab3);

    a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0));
    b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1));
}

template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
{
    v_float32x8 fa, fb;
    v256_load_deinterleave_l8((float*)ptr, fa, fb);
    a.val = v_reinterpret_as_u32(fa).val;
    b.val = v_reinterpret_as_u32(fb).val;
}

template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
    _Tpvec ab0, ab1, bc0, bc1;
    v256_zip(a, b, ab0, ab1);
    v256_zip(b, c, bc0, bc1);

    _Tpvec cazg = v256_blend<0xaa>(c, a);
    _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val));
    _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val));
    _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0xcc>(ab1, bc0));

    _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
    _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
    _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);

    v_store(ptr, abc0);
    v_store(ptr + _Tpvec::nlanes, abc1);
    v_store(ptr + _Tpvec::nlanes * 2, abc2);
}

inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c)
{
    v_float32x8 ab0, ab1, bc0, bc1;
    v256_zip(a, b, ab0, ab1);
    v256_zip(b, c, bc0, bc1);

    v_float32x8 cazg = v256_blend<0xaa>(c, a);
    v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0)));
    v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2)));

    v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2)));
    v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2);

    v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
    v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
    v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);

    v_store(ptr, abc0);
    v_store(ptr + 8, abc1);
    v_store(ptr + 16, abc2);
}

template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
{
    _Tpvec abc02 = v256_load(ptr);
    _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
    _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2);

    _Tpvec abc2 = v256_alignr_128(abc02, abc20);
    _Tpvec abc0 = v256_combine_diagonal(abc02, abc20);

    a = v256_blend<0x92>(abc0, abc1);
    a = v256_blend<0x44>(a, abc2);

    b = v256_blend<0x24>(abc0, abc1);
    b = v256_blend<0x99>(b, abc2);

    c = v256_blend<0x49>(abc0, abc1);
    c = v256_blend<0x22>(c, abc2);

    a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a);
    b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b);
    c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c);
}

template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
    _Tpvec ab0, ab1, cd0, cd1;
    v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1);
    v256_zip(ab0, ab1, a, b);
    v256_zip(cd0, cd1, c, d);
}

template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
{
    _Tpvec ac0, ac1, bd0, bd1;
    v256_zip(a, c, ac0, ac1);
    v256_zip(b, d, bd0, bd1);

    _Tpvec abcd0, abcd1, abcd2, abcd3;
    v256_zip(ac0, bd0, abcd0, abcd1);
    v256_zip(ac1, bd1, abcd2, abcd3);

    _Tpvec abcd01, abcd23, abcd45, abcd67;
    v_recombine(abcd0, abcd1, abcd01, abcd45);
    v_recombine(abcd2, abcd3, abcd23, abcd67);

    v_store(ptr, abcd01);
    v_store(ptr + _Tpvec::nlanes, abcd23);
    v_store(ptr + _Tpvec::nlanes * 2, abcd45);
    v_store(ptr + _Tpvec::nlanes * 3, abcd67);
}
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8)

/* ******** ******** */
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
{
    const __m256i sep = _mm256_setr_epi8(
        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
    );

    _Tpvec ab0, ab1;
    v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);

    __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
    __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);

    a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
    b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
}
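// The `sep` shuffle mask works within each 128-bit lane: it moves the even-indexed
// 16-bit elements (channel a) into the low 8 bytes and the odd-indexed elements
// (channel b) into the high 8 bytes, so the following 64-bit unpacklo/unpackhi
// collect all a's into `a` and all b's into `b`.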
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
    v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b));
    v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b));
    v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c));
    v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c));

    v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0xaa>(c, a));
    cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg);

    v_uint32x8 ac1ab1 = v256_blend<0xaa>(ab1, bc1);
    ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1);

    v_uint32x8 abc001 = v256_blend<0xaa>(ab0, cazg);
    v_uint32x8 cabc0 = v256_blend<0xaa>(cazg, bc0);

    v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1);
    v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001);

    v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0));
    v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0));
    abc21 = v256_swap_halves(abc21);
    v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1));

    v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21);
    v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01);
    v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12);

    v_store(ptr, _Tpvec(abc0.val));
    v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val));
    v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val));
}
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
{ /* not implemented */ }
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
    _Tpvec ab0, ab1, cd0, cd1;
    v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1);
    v256_zip(ab0, ab1, a, b);
    v256_zip(cd0, cd1, c, d);
}

template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
{ v256_store_interleave_l8(ptr, a, b, c, d); }
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16)

/* **************** **************** */
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
{
    const __m256i sep = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
    );

    _Tpvec ab0, ab1;
    v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);

    __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
    __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);

    a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
    b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&)
{ /* not implemented */ }

template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
{ /* not implemented */ }
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
    const __m256i sep = _mm256_setr_epi8(
        0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
        0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
    );

    _Tpvec abcd0, abcd1, abcd2, abcd3;
    v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1);
    v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3);

    __m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep);
    __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep);
    __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep);
    __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep);

    __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1);
    __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3);
    __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1);
    __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3);

    a.val = _mm256_unpacklo_epi64(ab0, ab1);
    b.val = _mm256_unpackhi_epi64(ab0, ab1);
    c.val = _mm256_unpacklo_epi64(cd0, cd1);
    d.val = _mm256_unpackhi_epi64(cd0, cd1);
}
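// Here `sep` groups the bytes of each 128-bit lane by channel: indices 0,4,8,12 (a),
// then 1,5,9,13 (b), 2,6,10,14 (c) and 3,7,11,15 (d). The 32-bit and 64-bit unpacks
// that follow merge those per-lane groups across the four input registers into the
// four output channels.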
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
{ v256_store_interleave_l8(ptr, a, b, c, d); }

OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32)
inline void v256_cleanup() { _mm256_zeroupper(); }
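// _mm256_zeroupper clears the upper halves of the YMM registers; calling v256_cleanup()
// after an AVX2 code path avoids the SSE/AVX transition penalty when control returns to
// code compiled without VEX encoding.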
//! @name Check SIMD256 support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD256()
{
    return (CV_CPU_HAS_SUPPORT_AVX2) ? true : false;
}
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#endif // OPENCV_HAL_INTRIN_AVX_HPP