From 9c7040802cf3001ffee551d563640ee6ab2af1dd Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 24 Jul 2018 17:27:56 +0300 Subject: [PATCH] converted split() & merge() to wide univ intrinsics (#12044) * fixed/updated v_load_deinterleave and v_store_interleave intrinsics; modified split() and merge() functions to use those intrinsics * fixed a few compile errors and bug in v_load_deinterleave(ptr, v_uint32x4& a, v_uint32x4& b) * fixed few more compile errors --- .../core/include/opencv2/core/hal/intrin_avx.hpp | 822 +++++++++++++-------- .../core/include/opencv2/core/hal/intrin_neon.hpp | 77 ++ .../core/include/opencv2/core/hal/intrin_sse.hpp | 327 +++++--- .../core/include/opencv2/core/hal/intrin_vsx.hpp | 2 + modules/core/src/merge.cpp | 347 ++------- modules/core/src/split.cpp | 361 ++------- 6 files changed, 941 insertions(+), 995 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 8654f4f..4ea66f5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1609,392 +1609,592 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8) OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4) -/** Reinterpret **/ -// its up there with load and store operations - -/* de&interleave */ -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ - inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \ - { return v256_load_deinterleave_##suffix(ptr, a, b); } \ - inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \ - { return v256_store_interleave_2ch(ptr, a, b); } +///////////////////// load deinterleave ///////////////////////////// -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ - inline void v_load_deinterleave \ - (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \ - { return v256_load_deinterleave_##suffix(ptr, a, b, c); } \ - inline void v_store_interleave \ - (_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \ - { return v256_store_interleave_##suffix(ptr, a, b, c); } +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b ) +{ + __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \ - inline void v_load_deinterleave \ - (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \ - { return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \ - inline void v_store_interleave \ - (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \ - { return v256_store_interleave_##suffix(ptr, a, b, c, d); } + static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + __m256i p0 = _mm256_shuffle_epi8(ab0, sh); + __m256i p1 = _mm256_shuffle_epi8(ab1, sh); + __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); + __m256i a0 = _mm256_unpacklo_epi64(pl, ph); + __m256i b0 = _mm256_unpackhi_epi64(pl, ph); + a = v_uint8x32(a0); + b = v_uint8x32(b0); +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b ) +{ + __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + + static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + __m256i p0 = _mm256_shuffle_epi8(ab0, sh); + __m256i p1 = _mm256_shuffle_epi8(ab1, sh); + __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); + __m256i a0 = _mm256_unpacklo_epi64(pl, ph); + __m256i b0 = _mm256_unpackhi_epi64(pl, ph); + a = v_uint16x16(a0); + b = v_uint16x16(b0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b ) +{ + __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + + const int sh = 0+2*4+1*16+3*64; + __m256i p0 = _mm256_shuffle_epi32(ab0, sh); + __m256i p1 = _mm256_shuffle_epi32(ab1, sh); + __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); + __m256i a0 = _mm256_unpacklo_epi64(pl, ph); + __m256i b0 = _mm256_unpackhi_epi64(pl, ph); + a = v_uint32x8(a0); + b = v_uint32x8(b0); +} -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \ - OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ - OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b ) +{ + __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4)); -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \ - OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ - OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) + __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16); + __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16); + __m256i a0 = _mm256_unpacklo_epi64(pl, ph); + __m256i b0 = _mm256_unpackhi_epi64(pl, ph); + a = v_uint64x4(a0); + b = v_uint64x4(b0); +} -/* **** */ -// -template -inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r ) { - _Tpvec ab0, ab1; - v_zip(a, b, ab0, ab1); - v_store(ptr, ab0); - v_store(ptr + _Tpvec::nlanes, ab1); -} + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64)); -template -inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b) -{ - _Tpvec ab0 = v256_load(ptr); - _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes); - _Tpvec ab00, ab11; - v_recombine(ab0, ab1, ab00, ab11); - v256_zip(ab00, ab11, a, b); -} + __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); + __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); -/// -template -inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) -{ - _Tpvec abc0 = v256_load(ptr); - _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); - _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2); + static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1); - _Tpvec ab0 = v256_combine_diagonal(abc0, abc1); - _Tpvec bc1 = v256_combine_diagonal(abc1, abc2); - _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0)); + __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, 
m1); + __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0); + __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1); - a = v256_unpacklo(ab0, ac1); - c = v256_unpackhi(ac1, bc1); - b = v256_alignr_64(bc1, ab0); -} + static const __m256i + sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, + 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13), + sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, + 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14), + sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, + 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + b0 = _mm256_shuffle_epi8(b0, sh_b); + g0 = _mm256_shuffle_epi8(g0, sh_g); + r0 = _mm256_shuffle_epi8(r0, sh_r); + b = v_uint8x32(b0); + g = v_uint8x32(g0); + r = v_uint8x32(r0); +} -template -inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) -{ - _Tpvec ab0 = v256_unpacklo(a, b); - _Tpvec bc1 = v256_unpackhi(b, c); - _Tpvec ca10 = v256_swap_halves(v256_blend<0xa>(c, a)); +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r ) +{ + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + + __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); + __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); + + static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); + __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1); + __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0); + static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, + 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + b0 = _mm256_shuffle_epi8(b0, sh_b); + g0 = _mm256_shuffle_epi8(g0, sh_g); + r0 = _mm256_shuffle_epi8(r0, sh_r); + + b = v_uint16x16(b0); + g = v_uint16x16(g0); + r = v_uint16x16(r0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r ) +{ + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + + __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); + __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); + + __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92); + __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24); + __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92); + + b0 = _mm256_shuffle_epi32(b0, 
0x6c); + g0 = _mm256_shuffle_epi32(g0, 0xb1); + r0 = _mm256_shuffle_epi32(r0, 0xc6); + + b = v_uint32x8(b0); + g = v_uint32x8(g0); + r = v_uint32x8(r0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r ) +{ + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + + __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0); + __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0); + __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b); + __m256i b0 = _mm256_unpacklo_epi64(s01, s20r); + __m256i g0 = _mm256_alignr_epi8(s12, s01, 8); + __m256i r0 = _mm256_unpackhi_epi64(s20r, s12); + + b = v_uint64x4(b0); + g = v_uint64x4(g0); + r = v_uint64x4(r0); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a ) +{ + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64)); + __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96)); + static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + + __m256i p0 = _mm256_shuffle_epi8(bgr0, sh); + __m256i p1 = _mm256_shuffle_epi8(bgr1, sh); + __m256i p2 = _mm256_shuffle_epi8(bgr2, sh); + __m256i p3 = _mm256_shuffle_epi8(bgr3, sh); + + __m256i p01l = _mm256_unpacklo_epi32(p0, p1); + __m256i p01h = _mm256_unpackhi_epi32(p0, p1); + __m256i p23l = _mm256_unpacklo_epi32(p2, p3); + __m256i p23h = _mm256_unpackhi_epi32(p2, p3); + + __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); + __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); + __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16); + __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16); + + __m256i b0 = _mm256_unpacklo_epi32(pll, plh); + __m256i g0 = _mm256_unpackhi_epi32(pll, plh); + __m256i r0 = _mm256_unpacklo_epi32(phl, phh); + __m256i a0 = _mm256_unpackhi_epi32(phl, phh); - v_store(ptr, v256_combine_diagonal(ab0, ca10)); - v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0)); - v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1)); + b = v_uint8x32(b0); + g = v_uint8x32(g0); + r = v_uint8x32(r0); + a = v_uint8x32(a0); } -//// -template -inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a ) { - _Tpvec abcd0 = v256_load(ptr); - _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes); - _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2); - _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3); + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48)); + static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + __m256i p0 = _mm256_shuffle_epi8(bgr0, sh); + __m256i p1 = _mm256_shuffle_epi8(bgr1, sh); + __m256i p2 = _mm256_shuffle_epi8(bgr2, sh); + __m256i p3 = _mm256_shuffle_epi8(bgr3, sh); + + __m256i p01l = 
_mm256_unpacklo_epi32(p0, p1); + __m256i p01h = _mm256_unpackhi_epi32(p0, p1); + __m256i p23l = _mm256_unpacklo_epi32(p2, p3); + __m256i p23h = _mm256_unpackhi_epi32(p2, p3); - _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2); - _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3); + __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); + __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); + __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16); + __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16); - _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0); - _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1); - _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2); - _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3); + __m256i b0 = _mm256_unpacklo_epi32(pll, plh); + __m256i g0 = _mm256_unpackhi_epi32(pll, plh); + __m256i r0 = _mm256_unpacklo_epi32(phl, phh); + __m256i a0 = _mm256_unpackhi_epi32(phl, phh); - v256_zip(ab0, ab1, a, b); - v256_zip(cd0, cd1, c, d); + b = v_uint16x16(b0); + g = v_uint16x16(g0); + r = v_uint16x16(r0); + a = v_uint16x16(a0); } -template -inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a ) { - _Tpvec ab0, ab1, cd0, cd1; - v256_zip(a, b, ab0, ab1); - v256_zip(c, d, cd0, cd1); - - _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0); - _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1); - - v_store(ptr, v256_combine_diagonal(ab0, ab0cd0)); - v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1)); - v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0)); - v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1)); -} + __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24)); -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4) + __m256i p01l = _mm256_unpacklo_epi32(p0, p1); + __m256i p01h = _mm256_unpackhi_epi32(p0, p1); + __m256i p23l = _mm256_unpacklo_epi32(p2, p3); + __m256i p23h = _mm256_unpackhi_epi32(p2, p3); -/* **** **** */ -// -inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b) -{ - v_float32x8 ab0 = v256_load(ptr); - v_float32x8 ab1 = v256_load(ptr + 8); + __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); + __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); + __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16); + __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16); - v_float32x8 ab0ab2, ab1ab3; - v_recombine(ab0, ab1, ab0ab2, ab1ab3); + __m256i b0 = _mm256_unpacklo_epi32(pll, plh); + __m256i g0 = _mm256_unpackhi_epi32(pll, plh); + __m256i r0 = _mm256_unpacklo_epi32(phl, phh); + __m256i a0 = _mm256_unpackhi_epi32(phl, phh); - a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0)); - b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1)); + b = v_uint32x8(b0); + g = v_uint32x8(g0); + r = v_uint32x8(r0); + a = v_uint32x8(a0); } -template -inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a ) 
{ - v_float32x8 fa, fb; - v256_load_deinterleave_l8((float*)ptr, fa, fb); - a.val = v_reinterpret_as_u32(fa).val; - b.val = v_reinterpret_as_u32(fb).val; -} -/// -template -inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) -{ - _Tpvec ab0, ab1, bc0, bc1; - v256_zip(a, b, ab0, ab1); - v256_zip(b, c, bc0, bc1); + __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4)); + __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12)); - _Tpvec cazg = v256_blend<0xaa>(c, a); - _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val)); - _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val)); - _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0xcc>(ab1, bc0)); + __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16); + __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16); + __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16); + __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16); - _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); - _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); - _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + __m256i b0 = _mm256_unpacklo_epi64(l02, l13); + __m256i g0 = _mm256_unpackhi_epi64(l02, l13); + __m256i r0 = _mm256_unpacklo_epi64(h02, h13); + __m256i a0 = _mm256_unpackhi_epi64(h02, h13); - v_store(ptr, abc0); - v_store(ptr + _Tpvec::nlanes, abc1); - v_store(ptr + _Tpvec::nlanes * 2, abc2); + b = v_uint64x4(b0); + g = v_uint64x4(g0); + r = v_uint64x4(r0); + a = v_uint64x4(a0); } -inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c) -{ - v_float32x8 ab0, ab1, bc0, bc1; - v256_zip(a, b, ab0, ab1); - v256_zip(b, c, bc0, bc1); +///////////////////////////// store interleave ///////////////////////////////////// - v_float32x8 cazg = v256_blend<0xaa>(c, a); - v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0))); - v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2))); - - v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2))); - v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2); +inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y ) +{ + __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val); + __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val); - v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); - v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); - v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); + __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - v_store(ptr, abc0); - v_store(ptr + 8, abc1); - v_store(ptr + 16, abc2); + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 32), xy1); } -template -inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) +inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y ) { - _Tpvec abc02 = v256_load(ptr); - _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); - _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2); + __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val); + __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val); - _Tpvec abc2 = v256_alignr_128(abc02, abc20); - _Tpvec abc0 = v256_combine_diagonal(abc02, abc20); + 
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); + __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - a = v256_blend<0x92>(abc0, abc1); - a = v256_blend<0x44>(a, abc2); + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 16), xy1); +} - b = v256_blend<0x24>(abc0, abc1); - b = v256_blend<0x99>(b, abc2); +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y ) +{ + __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val); + __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val); - c = v256_blend<0x49>(abc0, abc1); - c = v256_blend<0x22>(c, abc2); + __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); + __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a); - b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b); - c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c); -} -///// -template -inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) -{ - _Tpvec ab0, ab1, cd0, cd1; - v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1); - v256_zip(ab0, ab1, a, b); - v256_zip(cd0, cd1, c, d); + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 8), xy1); } -template -inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y ) { - _Tpvec ac0, ac1, bd0, bd1; - v256_zip(a, c, ac0, ac1); - v256_zip(b, d, bd0, bd1); - - _Tpvec abcd0, abcd1, abcd2, abcd3; - v256_zip(ac0, bd0, abcd0, abcd1); - v256_zip(ac1, bd1, abcd2, abcd3); + __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val); + __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val); - _Tpvec abcd01, abcd23, abcd45, abcd67; - v_recombine(abcd0, abcd1, abcd01, abcd45); - v_recombine(abcd2, abcd3, abcd23, abcd67); + __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); + __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - v_store(ptr, abcd01); - v_store(ptr + _Tpvec::nlanes, abcd23); - v_store(ptr + _Tpvec::nlanes * 2, abcd45); - v_store(ptr + _Tpvec::nlanes * 3, abcd67); + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 4), xy1); } -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8) - -/* ******** ******** */ -// -template -inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r ) { - const __m256i sep = _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 - ); + static const __m256i sh_b = _mm256_setr_epi8( + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + static const __m256i sh_g = _mm256_setr_epi8( + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + static const __m256i sh_r = _mm256_setr_epi8( + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); - _Tpvec ab0, ab1; - v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b); + __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g); + __m256i r0 = 
_mm256_shuffle_epi8(r.val, sh_r); - __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); - __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); - - a.val = _mm256_unpacklo_epi64(a0b0, a1b1); - b.val = _mm256_unpackhi_epi64(a0b0, a1b1); -} -/// -template -inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) -{ - v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b)); - v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b)); - v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c)); - v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c)); + static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); - v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0xaa>(c, a)); - cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg); + __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); + __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1); + __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1); - v_uint32x8 ac1ab1 = v256_blend<0xaa>(ab1, bc1); - ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1); + __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16); + __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16); - v_uint32x8 abc001 = v256_blend<0xaa>(ab0, cazg); - v_uint32x8 cabc0 = v256_blend<0xaa>(cazg, bc0); + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1); + _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2); +} - v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1); - v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001); +inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r ) +{ + static const __m256i sh_b = _mm256_setr_epi8( + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + static const __m256i sh_g = _mm256_setr_epi8( + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + static const __m256i sh_r = _mm256_setr_epi8( + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); - v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0)); - v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0)); - abc21 = v256_swap_halves(abc21); - v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1)); + __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b); + __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g); + __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r); - v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21); - v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01); - v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12); + static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); - v_store(ptr, _Tpvec(abc0.val)); - v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val)); - v_store(ptr + _Tpvec::nlanes * 2, 
_Tpvec(abc2.val)); -} -// todo: -template -inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) -{} -//// -template -inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) -{ - _Tpvec ab0, ab1, cd0, cd1; - v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1); - v256_zip(ab0, ab1, a, b); - v256_zip(cd0, cd1, c, d); -} + __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); + __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1); + __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1); -template -inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) -{ v256_store_interleave_l8(ptr, a, b, c, d); } + __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16); + //__m256i bgr1 = p1; + __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16); -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16) + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 16), p1); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2); +} -/* **************** **************** */ -// -template -inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r ) { - const __m256i sep = _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 - ); + __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c); + __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1); + __m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6); - _Tpvec ab0, ab1; - v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24); + __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24); + __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24); - __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); - __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); + __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + //__m256i bgr1 = p2; + __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); - a.val = _mm256_unpacklo_epi64(a0b0, a1b1); - b.val = _mm256_unpackhi_epi64(a0b0, a1b1); + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 8), p2); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2); } -/// todo -template -inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&) -{} -template -inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) -{} -//// -template -inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r ) { - const __m256i sep = _mm256_setr_epi8( - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - ); - - _Tpvec abcd0, abcd1, abcd2, abcd3; - v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1); - v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3); + __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val); + __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val); + __m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc); - __m256i 
ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep); - __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep); - __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep); - __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep); + __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16); + __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f); + __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16); - __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1); - __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3); - __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1); - __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3); - - a.val = _mm256_unpacklo_epi64(ab0, ab1); - b.val = _mm256_unpackhi_epi64(ab0, ab1); - c.val = _mm256_unpacklo_epi64(cd0, cd1); - d.val = _mm256_unpackhi_epi64(cd0, cd1); -} - -template -inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) -{ v256_store_interleave_l8(ptr, a, b, c, d); } + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2); +} -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a ) +{ + __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val); + __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val); + __m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val); + __m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val); + + __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0); + __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0); + __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1); + __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1); + + __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); + __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); + __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); + __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); + + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, + const v_uint16x16& r, const v_uint16x16& a ) +{ + __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val); + __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val); + __m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val); + __m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val); + + __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0); + __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0); + __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1); + __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1); + + __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); + __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); + __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); + __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); + + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, + const v_uint32x8& r, const v_uint32x8& a ) +{ + __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val); + __m256i bg1 = 
_mm256_unpackhi_epi32(b.val, g.val); + __m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val); + __m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val); + + __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0); + __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0); + __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1); + __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1); + + __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); + __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); + __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); + __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); + + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, + const v_uint64x4& r, const v_uint64x4& a ) +{ + __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val); + __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val); + __m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val); + __m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val); + + __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16); + __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16); + __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16); + __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16); + + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3); +} + +#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + 
v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ +} + +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) inline void v256_cleanup() { _mm256_zeroupper(); } diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index fdb3ec0..d806730 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1318,6 +1318,80 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& vst4q_##suffix(ptr, v); \ } +#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \ +inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \ +{ \ + tp##x1_t a0 = vld1_##suffix(ptr); \ + tp##x1_t b0 = vld1_##suffix(ptr + 1); \ + tp##x1_t a1 = vld1_##suffix(ptr + 2); \ + tp##x1_t b1 = vld1_##suffix(ptr + 3); \ + a = v_##tp##x2(vcombine_##suffix(a0, a1)); \ + b = v_##tp##x2(vcombine_##suffix(b0, b1)); \ +} \ + \ +inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \ + v_##tp##x2& b, v_##tp##x2& c ) \ +{ \ + tp##x1_t a0 = vld1_##suffix(ptr); \ + tp##x1_t b0 = vld1_##suffix(ptr + 1); \ + tp##x1_t c0 = vld1_##suffix(ptr + 2); \ + tp##x1_t a1 = vld1_##suffix(ptr + 3); \ + tp##x1_t b1 = vld1_##suffix(ptr + 4); \ + tp##x1_t c1 = vld1_##suffix(ptr + 5); \ + a = v_##tp##x2(vcombine_##suffix(a0, a1)); \ + b = v_##tp##x2(vcombine_##suffix(b0, b1)); \ + c = v_##tp##x2(vcombine_##suffix(c0, c1)); \ +} \ + \ +inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \ + v_##tp##x2& c, v_##tp##x2& d ) \ +{ \ + tp##x1_t a0 = vld1_##suffix(ptr); \ + tp##x1_t b0 = vld1_##suffix(ptr + 1); \ + tp##x1_t c0 = vld1_##suffix(ptr + 2); \ + tp##x1_t d0 = vld1_##suffix(ptr + 3); \ + tp##x1_t a1 = vld1_##suffix(ptr + 4); \ + tp##x1_t b1 = vld1_##suffix(ptr + 5); \ + tp##x1_t c1 = vld1_##suffix(ptr + 6); \ + tp##x1_t d1 = vld1_##suffix(ptr + 7); \ + a = v_##tp##x2(vcombine_##suffix(a0, a1)); \ + b = v_##tp##x2(vcombine_##suffix(b0, b1)); \ + c = v_##tp##x2(vcombine_##suffix(c0, c1)); \ + d = v_##tp##x2(vcombine_##suffix(d0, d1)); \ +} \ + \ +inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \ +{ \ + vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ + vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ + vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \ + vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \ +} \ + \ +inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \ + const v_##tp##x2& b, const v_##tp##x2& c ) \ +{ \ + vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ + vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ + vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \ + vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \ + vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \ + vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \ +} \ + \ +inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \ + const v_##tp##x2& c, const v_##tp##x2& d ) \ +{ \ + vst1_##suffix(ptr, 
vget_low_##suffix(a.val)); \ + vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ + vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \ + vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \ + vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \ + vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \ + vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \ + vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \ +} + OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8) OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) @@ -1329,6 +1403,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64) #endif +OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64) +OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64) + inline v_float32x4 v_cvt_f32(const v_int32x4& a) { return v_float32x4(vcvtq_f32_s32(a.val)); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index b79ea16..4971c77 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -58,17 +58,6 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -struct v_uint8x16; -struct v_int8x16; -struct v_uint16x8; -struct v_int16x8; -struct v_uint32x4; -struct v_int32x4; -struct v_float32x4; -struct v_uint64x2; -struct v_int64x2; -struct v_float64x2; - struct v_uint8x16 { typedef uchar lane_type; @@ -1660,7 +1649,7 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_N OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) -// adopted from sse_utils.hpp +// load deinterleave inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) { __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); @@ -1681,7 +1670,25 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) { -#if CV_SSSE3 +#if CV_SSE4_1 + static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + __m128i s0 = _mm_loadu_si128((const __m128i*)ptr); + __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); + __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1); + __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1); + __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1); + static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13); + static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14); + static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + a0 = _mm_shuffle_epi8(a0, sh_b); + b0 = _mm_shuffle_epi8(b0, sh_g); + c0 = _mm_shuffle_epi8(c0, sh_r); + a.val = a0; + b.val = b0; + c.val = c0; +#elif CV_SSSE3 static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14); static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6); @@ -1753,8 +1760,41 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, d.val = 
_mm_unpackhi_epi8(v2, v3); } +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b) +{ + __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3 + __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7 + + __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5 + __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7 + __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6 + __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7 + + a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7 + b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7 +} + inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) { +#if CV_SSE4_1 + __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); + __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); + __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16)); + __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24); + __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24); + __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24); + + static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + a0 = _mm_shuffle_epi8(a0, sh_a); + b0 = _mm_shuffle_epi8(b0, sh_b); + c0 = _mm_shuffle_epi8(c0, sh_c); + + a.val = a0; + b.val = b0; + c.val = c0; +#else __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8)); __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16)); @@ -1770,6 +1810,7 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21)); b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22); c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22)); +#endif } inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) @@ -1795,6 +1836,18 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, d.val = _mm_unpackhi_epi16(u2, u3); } +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b) +{ + __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 + __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3 + + __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2 + __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3 + + a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3 + b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 ab b3 +} + inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c) { __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); @@ -1812,12 +1865,23 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) { - v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0 - v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1 - v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2 - v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3 + v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0 + 
v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1 + v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2 + v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3 - v_transpose4x4(u0, u1, u2, u3, a, b, c, d); + v_transpose4x4(s0, s1, s2, s3, a, b, c, d); +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) +{ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1 + __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3 + + a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3 + b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3 } inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c) @@ -1853,77 +1917,43 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b d.val = _mm_unpackhi_ps(t02hi, t13hi); } -inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b) { __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); - __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); - a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1))); - b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2)); - c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2))); -} - -inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c) -{ - v_uint64x2 t0, t1, t2; - v_load_deinterleave((const uint64*)ptr, t0, t1, t2); - a = v_reinterpret_as_s64(t0); - b = v_reinterpret_as_s64(t1); - c = v_reinterpret_as_s64(t2); -} - -inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c) -{ - v_uint64x2 t0, t1, t2; - v_load_deinterleave((const uint64*)ptr, t0, t1, t2); - a = v_reinterpret_as_f64(t0); - b = v_reinterpret_as_f64(t1); - c = v_reinterpret_as_f64(t2); + a = v_uint64x2(_mm_unpacklo_epi64(t0, t1)); + b = v_uint64x2(_mm_unpackhi_epi64(t0, t1)); } -// 2-channel -inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) { - const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0 + __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1 + __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1 - __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1 - __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3 + t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0 - a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3 - b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3 + a = v_uint64x2(_mm_unpacklo_epi64(t0, t1)); + b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2)); + c = v_uint64x2(_mm_unpackhi_epi64(t1, t2)); } -inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b) +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, + v_uint64x2& b, v_uint64x2& c, v_uint64x2& d) { - __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3 - __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7 - - __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5 - __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 
b2 b6 a3 a7 b3 b7 - __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6 - __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7 + __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 + __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0 + __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1 + __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1 - a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7 - b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7 + a = v_uint64x2(_mm_unpacklo_epi64(t0, t2)); + b = v_uint64x2(_mm_unpackhi_epi64(t0, t2)); + c = v_uint64x2(_mm_unpacklo_epi64(t1, t3)); + d = v_uint64x2(_mm_unpackhi_epi64(t1, t3)); } -inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b) -{ - v_int16x8 sa, sb; - v_load_deinterleave((const short*)ptr, sa, sb); - a = v_reinterpret_as_u16(sa); - b = v_reinterpret_as_u16(sb); -} - -inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b) -{ - __m128i t0, t1; - t0 = _mm_unpacklo_epi16(a.val, b.val); - t1 = _mm_unpackhi_epi16(a.val, b.val); - _mm_storeu_si128((__m128i*)(ptr), t0); - _mm_storeu_si128((__m128i*)(ptr + 8), t1); -} +// store interleave inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b) { @@ -1937,7 +1967,24 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c ) { -#if CV_SSSE3 +#if CV_SSE4_1 + static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); + __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); + __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); + + static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0); + __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0); + __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0); + + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 32), v2); +#elif CV_SSSE3 static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5); static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10); static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15); @@ -2025,10 +2072,35 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 _mm_storeu_si128((__m128i*)(ptr + 48), v3); } +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b ) +{ + __m128i t0, t1; + t0 = _mm_unpacklo_epi16(a.val, b.val); + t1 = _mm_unpackhi_epi16(a.val, b.val); + _mm_storeu_si128((__m128i*)(ptr), t0); + _mm_storeu_si128((__m128i*)(ptr + 8), t1); +} + inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c ) { +#if CV_SSE4_1 + static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + static const __m128i sh_b = 
_mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); + __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); + __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); + + __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24); + __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24); + __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24); + + _mm_storeu_si128((__m128i*)ptr, v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); +#else __m128i z = _mm_setzero_si128(); __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val); __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val); @@ -2060,6 +2132,7 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, _mm_storeu_si128((__m128i*)(ptr), v0); _mm_storeu_si128((__m128i*)(ptr + 8), v1); _mm_storeu_si128((__m128i*)(ptr + 16), v2); +#endif } inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, @@ -2085,6 +2158,15 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16 _mm_storeu_si128((__m128i*)(ptr + 24), v3); } +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b ) +{ + __m128i t0 = _mm_unpacklo_epi32(a.val, b.val); + __m128i t1 = _mm_unpackhi_epi32(a.val, b.val); + + _mm_storeu_si128((__m128i*)ptr, t0); + _mm_storeu_si128((__m128i*)(ptr + 4), t1); +} + inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c ) { @@ -2158,6 +2240,15 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32 _mm_storeu_ps(ptr + 12, v3); } +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b) +{ + __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i t1 = _mm_unpackhi_epi64(a.val, b.val); + + _mm_storeu_si128((__m128i*)ptr, t0); + _mm_storeu_si128((__m128i*)(ptr + 2), t1); +} + inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c) { __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); @@ -2169,58 +2260,72 @@ inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x _mm_storeu_si128((__m128i*)(ptr + 4), t2); } -inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c) +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d) { - v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c)); -} + __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i t1 = _mm_unpacklo_epi64(c.val, d.val); + __m128i t2 = _mm_unpackhi_epi64(a.val, b.val); + __m128i t3 = _mm_unpackhi_epi64(c.val, d.val); -inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) -{ - v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c)); + _mm_storeu_si128((__m128i*)ptr, t0); + _mm_storeu_si128((__m128i*)(ptr + 2), t1); + _mm_storeu_si128((__m128i*)(ptr + 4), t2); + _mm_storeu_si128((__m128i*)(ptr + 6), t3); } -#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \ -inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ - _Tpvec& b0, _Tpvec& c0 ) \ +#define 
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ { \ - _Tpuvec a1, b1, c1; \ - v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \ - a0 = v_reinterpret_as_##suffix(a1); \ - b0 = v_reinterpret_as_##suffix(b1); \ - c0 = v_reinterpret_as_##suffix(c1); \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ } \ -inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ - _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \ { \ - _Tpuvec a1, b1, c1, d1; \ - v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \ - a0 = v_reinterpret_as_##suffix(a1); \ - b0 = v_reinterpret_as_##suffix(b1); \ - c0 = v_reinterpret_as_##suffix(c1); \ - d0 = v_reinterpret_as_##suffix(d1); \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1); \ } \ -inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \ - const _Tpvec& b0, const _Tpvec& c0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \ { \ - _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \ - _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \ - _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \ - v_store_interleave((_Tpu*)ptr, a1, b1, c1); \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ } \ -inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \ - const _Tpvec& c0, const _Tpvec& d0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0 ) \ { \ - _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \ - _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \ - _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \ - _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \ - v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ } OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) -//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32) 
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64) inline v_float32x4 v_cvt_f32(const v_int32x4& a) { diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 069e957..9ad8234 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -298,6 +298,8 @@ OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4) OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4) OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4) OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2) +OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2) +OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2) /* Expand */ #define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \ diff --git a/modules/core/src/merge.cpp b/modules/core/src/merge.cpp index e1fe6ad..a57d3bb 100644 --- a/modules/core/src/merge.cpp +++ b/modules/core/src/merge.cpp @@ -8,223 +8,49 @@ namespace cv { namespace hal { -#if CV_NEON -template struct VMerge2; -template struct VMerge3; -template struct VMerge4; - -#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - store_func(dst, r); \ - } \ - } +#if CV_SIMD +template static void +vecmerge_( const T** src, T* dst, int len, int cn ) +{ + int i; + const T* src0 = src[0]; + const T* src1 = src[1]; -#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - store_func(dst, r); \ - } \ + const int VECSZ = VecT::nlanes; + if( cn == 2 ) + { + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a = vx_load(src0 + i), b = vx_load(src1 + i); + v_store_interleave(dst + i*cn, a, b); + } } - -#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, const data_type* src3, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - r.val[3] = load_func(src3); \ - store_func(dst, r); \ - } \ + else if( cn == 3 ) + { + const T* src2 = src[2]; + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i); + v_store_interleave(dst + i*cn, a, b, c); + } } - -MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); -MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); -MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); -MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); - -MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); -MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); -MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); -MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, 
vld1_s64 , vst3_s64 ); - -MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); -MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); -MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); -MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); - -#elif CV_SSE2 - -template -struct VMerge2 -{ - VMerge2() : support(false) { } - void operator()(const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge3 -{ - VMerge3() : support(false) { } - void operator()(const T *, const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge4 -{ - VMerge4() : support(false) { } - void operator()(const T *, const T *, const T *, const T *, T *) const { } - - bool support; -}; - -#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge2() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - } \ - \ - bool support; \ -} - -#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge3() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - } \ - \ - bool support; \ -} - -#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge4() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, 
\ - const data_type * src2, const data_type * src3, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ - reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ - } \ - \ - bool support; \ + else + { + CV_Assert( cn == 4 ); + const T* src2 = src[2]; + const T* src3 = src[3]; + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a = vx_load(src0 + i), b = vx_load(src1 + i); + VecT c = vx_load(src2 + i), d = vx_load(src3 + i); + v_store_interleave(dst + i*cn, a, b, c, d); + } + } + vx_cleanup(); } - -MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); - -#if CV_SSE4_1 -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -#endif - -MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); - #endif template static void @@ -242,28 +68,6 @@ merge_( const T** src, T* dst, int len, int cn ) { const T *src0 = src[0], *src1 = src[1]; i = j = 0; -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#endif for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; @@ -274,28 +78,6 @@ merge_( const T** src, T* dst, int len, int cn ) { const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; i = j = 0; -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#endif for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; @@ -307,28 +89,6 @@ merge_( const T** src, T* dst, int len, int cn ) { const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; i = j = 0; -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#endif for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; dst[j+1] = src1[i]; @@ -347,29 +107,48 @@ merge_( const T** src, T* dst, int len, int cn ) } } - void merge8u(const uchar** src, uchar* dst, int len, int cn ) { CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn) - merge_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) + vecmerge_(src, dst, len, cn); + else +#endif + merge_(src, dst, len, cn); } void merge16u(const ushort** src, ushort* dst, int len, int cn ) { CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn) - merge_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) + vecmerge_(src, dst, len, cn); + else +#endif + merge_(src, dst, len, cn); } void merge32s(const int** src, int* dst, int len, int cn ) { CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn) - merge_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 ) + vecmerge_(src, dst, len, cn); + else +#endif + merge_(src, dst, len, cn); } void merge64s(const int64** src, int64* dst, int len, int cn ) { CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn) - merge_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) + vecmerge_(src, dst, len, cn); + else +#endif + merge_(src, dst, len, cn); } }} // cv::hal:: diff --git a/modules/core/src/split.cpp b/modules/core/src/split.cpp index 4389645..6f7b61a 100644 --- a/modules/core/src/split.cpp +++ b/modules/core/src/split.cpp @@ -8,222 +8,57 @@ namespace cv { namespace hal { -#if CV_NEON -template struct VSplit2; -template struct VSplit3; -template struct VSplit4; - -#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, \ - data_type* dst1) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - } \ - } +#if CV_SIMD +template static void +vecsplit_( const T* src, T** dst, int len, int cn ) +{ + int i; + T* dst0 = dst[0]; + T* dst1 = dst[1]; -#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ 
- } \ + const int VECSZ = VecT::nlanes; + if( cn == 2 ) + { + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a, b; + v_load_deinterleave(src + i*cn, a, b); + v_store(dst0 + i, a); + v_store(dst1 + i, b); + } } - -#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2, data_type* dst3) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - store_func(dst3, r.val[3]); \ - } \ + else if( cn == 3 ) + { + T* dst2 = dst[2]; + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a, b, c; + v_load_deinterleave(src + i*cn, a, b, c); + v_store(dst0 + i, a); + v_store(dst1 + i, b); + v_store(dst2 + i, c); + } } - -SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); -SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); - -SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); -SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); - -SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); -SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); - -#elif CV_SSE2 - -template -struct VSplit2 -{ - VSplit2() : support(false) { } - void operator()(const T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit3 -{ - VSplit3() : support(false) { } - void operator()(const T *, T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit4 -{ - VSplit4() : support(false) { } - void operator()(const T *, T *, T *, T *, T *) const { } - - bool support; -}; - -#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit2() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - } \ - \ - bool support; \ -} - -#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit3() \ - { \ - support = 
checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1, data_type * dst2) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - } \ - \ - bool support; \ -} - -#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit4() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ - data_type * dst2, data_type * dst3) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ - reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ - } \ - \ - bool support; \ + else + { + CV_Assert( cn == 4 ); + T* dst2 = dst[2]; + T* dst3 = dst[3]; + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a, b, c, d; + v_load_deinterleave(src + i*cn, a, b, c, d); + v_store(dst0 + i, a); + v_store(dst1 + i, b); + v_store(dst2 + i, c); + v_store(dst3 + i, d); + } + } + vx_cleanup(); } - -SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, 
_mm_deinterleave_epi16, si128); -SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - #endif template static void @@ -250,30 +85,6 @@ split_( const T* src, T** dst, int len, int cn ) T *dst0 = dst[0], *dst1 = dst[1]; i = j = 0; -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } -#elif CV_SSE2 - if (cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } - } -#endif for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; @@ -285,31 +96,6 @@ split_( const T* src, T** dst, int len, int cn ) T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; i = j = 0; -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } -#elif CV_SSE2 - if (cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } - } -#endif for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; @@ -322,30 +108,6 @@ split_( const T* src, T** dst, int len, int cn ) T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; i = j = 0; -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } -#elif CV_SSE2 - if (cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } - } -#endif for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; dst1[i] = src[j+1]; @@ -367,25 +129,46 @@ split_( const T* src, T** dst, int len, int cn ) void split8u(const uchar* src, uchar** dst, int len, int cn ) { CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn) - split_(src, dst, len, cn); + +#if CV_SIMD + if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); } void split16u(const ushort* src, ushort** dst, int len, int cn ) { CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn) - split_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); } void split32s(const int* src, int** dst, int len, int cn ) { CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn) - split_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); } void split64s(const int64* src, int64** dst, int len, int cn ) { CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn) - split_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); } }} // cv::hal:: -- 2.7.4
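
For reference, a minimal sketch of how the new wide-intrinsic kernels introduced by this patch are meant to be driven, assuming only the public opencv2/core/hal/intrin.hpp header. The function names deinterleave_bgr()/interleave_bgr() and their buffers are illustrative, not part of the patch; the code mirrors the overlapping-tail pattern used by vecsplit_()/vecmerge_() above, where the last vector iteration is pulled back to len - VECSZ instead of falling through to a scalar tail.

    // Illustrative sketch only (not part of this patch).
    #include <algorithm>
    #include "opencv2/core/hal/intrin.hpp"

    using namespace cv;

    // Split a packed BGR byte row of `len` pixels into three planes.
    static void deinterleave_bgr(const uchar* src, uchar* b, uchar* g, uchar* r, int len)
    {
    #if CV_SIMD
        const int VECSZ = v_uint8::nlanes;
        if( len >= VECSZ )
        {
            for( int i = 0; i < len; i += VECSZ )
            {
                i = std::min(i, len - VECSZ);   // overlap the final iteration instead of a scalar tail
                v_uint8 vb, vg, vr;
                v_load_deinterleave(src + i*3, vb, vg, vr);
                v_store(b + i, vb);
                v_store(g + i, vg);
                v_store(r + i, vr);
            }
            vx_cleanup();
            return;
        }
    #endif
        for( int i = 0; i < len; i++ )          // scalar path for rows shorter than one vector
        {
            b[i] = src[i*3];
            g[i] = src[i*3 + 1];
            r[i] = src[i*3 + 2];
        }
    }

    // Merge counterpart: interleave three planes back into a packed BGR row.
    static void interleave_bgr(const uchar* b, const uchar* g, const uchar* r, uchar* dst, int len)
    {
    #if CV_SIMD
        const int VECSZ = v_uint8::nlanes;
        if( len >= VECSZ )
        {
            for( int i = 0; i < len; i += VECSZ )
            {
                i = std::min(i, len - VECSZ);
                v_uint8 vb = vx_load(b + i), vg = vx_load(g + i), vr = vx_load(r + i);
                v_store_interleave(dst + i*3, vb, vg, vr);
            }
            vx_cleanup();
            return;
        }
    #endif
        for( int i = 0; i < len; i++ )
        {
            dst[i*3] = b[i]; dst[i*3 + 1] = g[i]; dst[i*3 + 2] = r[i];
        }
    }

The same pattern applies to 2- and 4-channel data and to the wider element types (ushort, int/float, int64/double) covered by the new v_load_deinterleave / v_store_interleave overloads; which SSE2/SSE4.1/AVX2/NEON/VSX path is taken is decided inside the intrinsics headers, not by the caller.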