// Instantiate the v_extract implementation for 256-bit double vectors
// (macro defined earlier in this file).
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
-/** Reinterpret **/
-// its up there with load and store operations
-
-/* de&interleave */
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
- inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
- { return v256_load_deinterleave_##suffix(ptr, a, b); } \
- inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
- { return v256_store_interleave_2ch(ptr, a, b); }
+///////////////////// load deinterleave /////////////////////////////
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
- inline void v_load_deinterleave \
- (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \
- { return v256_load_deinterleave_##suffix(ptr, a, b, c); } \
- inline void v_store_interleave \
- (_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \
- { return v256_store_interleave_##suffix(ptr, a, b, c); }
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
+{
+ __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \
- inline void v_load_deinterleave \
- (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \
- { return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \
- inline void v_store_interleave \
- (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \
- { return v256_store_interleave_##suffix(ptr, a, b, c, d); }
+ static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+ __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
+ __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
+ __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+ __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+ __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
+ __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
+ a = v_uint8x32(a0);
+ b = v_uint8x32(b0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
+{
+ __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+ static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+ __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
+ __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
+ __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+ __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+ __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
+ __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
+ a = v_uint16x16(a0);
+ b = v_uint16x16(b0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
+{
+ __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+ const int sh = 0+2*4+1*16+3*64;
+ __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
+ __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
+ __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+ __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+ __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
+ __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
+ a = v_uint32x8(a0);
+ b = v_uint32x8(b0);
+}
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \
- OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
- OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix)
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
+{
+ __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
-#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \
- OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
- OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix)
+ __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
+ __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
+ __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
+ __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
+ a = v_uint64x4(a0);
+ b = v_uint64x4(b0);
+}
-/* **** */
-//
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r )
{
- _Tpvec ab0, ab1;
- v_zip(a, b, ab0, ab1);
- v_store(ptr, ab0);
- v_store(ptr + _Tpvec::nlanes, ab1);
-}
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
-{
- _Tpvec ab0 = v256_load(ptr);
- _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes);
- _Tpvec ab00, ab11;
- v_recombine(ab0, ab1, ab00, ab11);
- v256_zip(ab00, ab11, a, b);
-}
+ __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+ __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
-///
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
-{
- _Tpvec abc0 = v256_load(ptr);
- _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
- _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2);
+ static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+ 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+ static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+ -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
- _Tpvec ab0 = v256_combine_diagonal(abc0, abc1);
- _Tpvec bc1 = v256_combine_diagonal(abc1, abc2);
- _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0));
+ __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
+ __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+ __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
- a = v256_unpacklo(ab0, ac1);
- c = v256_unpackhi(ac1, bc1);
- b = v256_alignr_64(bc1, ab0);
-}
+ static const __m256i
+ sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
+ 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
+ sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
+ 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
+ sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
+ 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+ b0 = _mm256_shuffle_epi8(b0, sh_b);
+ g0 = _mm256_shuffle_epi8(g0, sh_g);
+ r0 = _mm256_shuffle_epi8(r0, sh_r);
+ b = v_uint8x32(b0);
+ g = v_uint8x32(g0);
+ r = v_uint8x32(r0);
+}
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
-{
- _Tpvec ab0 = v256_unpacklo(a, b);
- _Tpvec bc1 = v256_unpackhi(b, c);
- _Tpvec ca10 = v256_swap_halves(v256_blend<0xa>(c, a));
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r )
+{
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+
+ __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+ __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+ static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+ 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+ static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+ -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
+ __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
+ __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
+ __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+ static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+ 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+ static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+ 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+ static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+ 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+ b0 = _mm256_shuffle_epi8(b0, sh_b);
+ g0 = _mm256_shuffle_epi8(g0, sh_g);
+ r0 = _mm256_shuffle_epi8(r0, sh_r);
+
+ b = v_uint16x16(b0);
+ g = v_uint16x16(g0);
+ r = v_uint16x16(r0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r )
+{
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+ __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+ __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+ __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
+ __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
+ __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
+
+ b0 = _mm256_shuffle_epi32(b0, 0x6c);
+ g0 = _mm256_shuffle_epi32(g0, 0xb1);
+ r0 = _mm256_shuffle_epi32(r0, 0xc6);
+
+ b = v_uint32x8(b0);
+ g = v_uint32x8(g0);
+ r = v_uint32x8(r0);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r )
+{
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+ __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
+ __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
+ __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
+ __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
+ __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
+ __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
+
+ b = v_uint64x4(b0);
+ g = v_uint64x4(g0);
+ r = v_uint64x4(r0);
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a )
+{
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
+ __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
+ static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+
+ __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+ __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+ __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+ __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+ __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+ __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+ __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+ __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+ __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+ __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+ __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+ __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+ __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+ __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+ __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+ __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
- v_store(ptr, v256_combine_diagonal(ab0, ca10));
- v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0));
- v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1));
+ b = v_uint8x32(b0);
+ g = v_uint8x32(g0);
+ r = v_uint8x32(r0);
+ a = v_uint8x32(a0);
}
-////
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a )
{
- _Tpvec abcd0 = v256_load(ptr);
- _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes);
- _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2);
- _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3);
+ __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+ __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+ __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
+ static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+ __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+ __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+ __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+ __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+ __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+ __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+ __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+ __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
- _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2);
- _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3);
+ __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+ __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+ __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+ __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
- _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0);
- _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1);
- _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2);
- _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3);
+ __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+ __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+ __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+ __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
- v256_zip(ab0, ab1, a, b);
- v256_zip(cd0, cd1, c, d);
+ b = v_uint16x16(b0);
+ g = v_uint16x16(g0);
+ r = v_uint16x16(r0);
+ a = v_uint16x16(a0);
}
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a )
{
- _Tpvec ab0, ab1, cd0, cd1;
- v256_zip(a, b, ab0, ab1);
- v256_zip(c, d, cd0, cd1);
-
- _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0);
- _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1);
-
- v_store(ptr, v256_combine_diagonal(ab0, ab0cd0));
- v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1));
- v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0));
- v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1));
-}
+ __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+ __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+ __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4)
+ __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+ __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+ __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+ __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
-/* **** **** */
-//
-inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b)
-{
- v_float32x8 ab0 = v256_load(ptr);
- v_float32x8 ab1 = v256_load(ptr + 8);
+ __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+ __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+ __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+ __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
- v_float32x8 ab0ab2, ab1ab3;
- v_recombine(ab0, ab1, ab0ab2, ab1ab3);
+ __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+ __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+ __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+ __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
- a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0));
- b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1));
+ b = v_uint32x8(b0);
+ g = v_uint32x8(g0);
+ r = v_uint32x8(r0);
+ a = v_uint32x8(a0);
}
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a )
{
- v_float32x8 fa, fb;
- v256_load_deinterleave_l8((float*)ptr, fa, fb);
- a.val = v_reinterpret_as_u32(fa).val;
- b.val = v_reinterpret_as_u32(fb).val;
-}
-///
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
-{
- _Tpvec ab0, ab1, bc0, bc1;
- v256_zip(a, b, ab0, ab1);
- v256_zip(b, c, bc0, bc1);
+ __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
+ __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+ __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+ __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
- _Tpvec cazg = v256_blend<0xaa>(c, a);
- _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val));
- _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val));
- _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0xcc>(ab1, bc0));
+ __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
+ __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
+ __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
+ __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
- _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
- _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
- _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
+ __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
+ __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
+ __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
+ __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
- v_store(ptr, abc0);
- v_store(ptr + _Tpvec::nlanes, abc1);
- v_store(ptr + _Tpvec::nlanes * 2, abc2);
+ b = v_uint64x4(b0);
+ g = v_uint64x4(g0);
+ r = v_uint64x4(r0);
+ a = v_uint64x4(a0);
}
-inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c)
-{
- v_float32x8 ab0, ab1, bc0, bc1;
- v256_zip(a, b, ab0, ab1);
- v256_zip(b, c, bc0, bc1);
+///////////////////////////// store interleave /////////////////////////////////////
- v_float32x8 cazg = v256_blend<0xaa>(c, a);
- v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0)));
- v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2)));
-
- v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2)));
- v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2);
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y )
+{
+ __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
+ __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
- v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
- v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
- v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
+ __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
+ __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
- v_store(ptr, abc0);
- v_store(ptr + 8, abc1);
- v_store(ptr + 16, abc2);
+ _mm256_storeu_si256((__m256i*)ptr, xy0);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
}
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y )
{
- _Tpvec abc02 = v256_load(ptr);
- _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
- _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2);
+ __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
+ __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
- _Tpvec abc2 = v256_alignr_128(abc02, abc20);
- _Tpvec abc0 = v256_combine_diagonal(abc02, abc20);
+ __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
+ __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
- a = v256_blend<0x92>(abc0, abc1);
- a = v256_blend<0x44>(a, abc2);
+ _mm256_storeu_si256((__m256i*)ptr, xy0);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
+}
- b = v256_blend<0x24>(abc0, abc1);
- b = v256_blend<0x99>(b, abc2);
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y )
+{
+ __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
+ __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
- c = v256_blend<0x49>(abc0, abc1);
- c = v256_blend<0x22>(c, abc2);
+ __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
+ __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
- a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a);
- b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b);
- c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c);
-}
-/////
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
-{
- _Tpvec ab0, ab1, cd0, cd1;
- v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1);
- v256_zip(ab0, ab1, a, b);
- v256_zip(cd0, cd1, c, d);
+ _mm256_storeu_si256((__m256i*)ptr, xy0);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
}
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y )
{
- _Tpvec ac0, ac1, bd0, bd1;
- v256_zip(a, c, ac0, ac1);
- v256_zip(b, d, bd0, bd1);
-
- _Tpvec abcd0, abcd1, abcd2, abcd3;
- v256_zip(ac0, bd0, abcd0, abcd1);
- v256_zip(ac1, bd1, abcd2, abcd3);
+ __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
+ __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
- _Tpvec abcd01, abcd23, abcd45, abcd67;
- v_recombine(abcd0, abcd1, abcd01, abcd45);
- v_recombine(abcd2, abcd3, abcd23, abcd67);
+ __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
+ __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
- v_store(ptr, abcd01);
- v_store(ptr + _Tpvec::nlanes, abcd23);
- v_store(ptr + _Tpvec::nlanes * 2, abcd45);
- v_store(ptr + _Tpvec::nlanes * 3, abcd67);
+ _mm256_storeu_si256((__m256i*)ptr, xy0);
+ _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
}
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8)
-
-/* ******** ******** */
-//
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r )
{
- const __m256i sep = _mm256_setr_epi8(
- 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
- 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
- );
+ static const __m256i sh_b = _mm256_setr_epi8(
+ 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
+ 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
+ static const __m256i sh_g = _mm256_setr_epi8(
+ 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
+ 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+ static const __m256i sh_r = _mm256_setr_epi8(
+ 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
+ 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
- _Tpvec ab0, ab1;
- v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
+ __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
+ __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
+ __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
- __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
- __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
-
- a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
- b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
-}
-///
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
-{
- v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b));
- v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b));
- v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c));
- v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c));
+ static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+ 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+ static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+ 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
- v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0xaa>(c, a));
- cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg);
+ __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
+ __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
+ __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
- v_uint32x8 ac1ab1 = v256_blend<0xaa>(ab1, bc1);
- ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1);
+ __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+ __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
+ __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
- v_uint32x8 abc001 = v256_blend<0xaa>(ab0, cazg);
- v_uint32x8 cabc0 = v256_blend<0xaa>(cazg, bc0);
+ _mm256_storeu_si256((__m256i*)ptr, bgr0);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
+ _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
+}
- v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1);
- v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001);
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r )
+{
+ static const __m256i sh_b = _mm256_setr_epi8(
+ 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+ 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+ static const __m256i sh_g = _mm256_setr_epi8(
+ 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
+ 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
+ static const __m256i sh_r = _mm256_setr_epi8(
+ 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+ 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
- v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0));
- v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0));
- abc21 = v256_swap_halves(abc21);
- v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1));
+ __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
+ __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
+ __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
- v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21);
- v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01);
- v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12);
+ static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+ 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+ static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+ -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
- v_store(ptr, _Tpvec(abc0.val));
- v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val));
- v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val));
-}
-// todo:
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
-{}
-////
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
-{
- _Tpvec ab0, ab1, cd0, cd1;
- v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1);
- v256_zip(ab0, ab1, a, b);
- v256_zip(cd0, cd1, c, d);
-}
+ __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
+ __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
+ __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
-{ v256_store_interleave_l8(ptr, a, b, c, d); }
+ __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
+ //__m256i bgr1 = p1;
+ __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16)
+ _mm256_storeu_si256((__m256i*)ptr, bgr0);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
+}
-/* **************** **************** */
-//
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r )
{
- const __m256i sep = _mm256_setr_epi8(
- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
- );
+ __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
+ __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);
+ __m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6);
- _Tpvec ab0, ab1;
- v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
+ __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
+ __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
+ __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
- __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
- __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
+ __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+ //__m256i bgr1 = p2;
+ __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
- a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
- b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
+ _mm256_storeu_si256((__m256i*)ptr, bgr0);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
}
-/// todo
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&)
-{}
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
-{}
-////
-template<typename _Tp, typename _Tpvec>
-inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r )
{
- const __m256i sep = _mm256_setr_epi8(
- 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
- 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
- );
-
- _Tpvec abcd0, abcd1, abcd2, abcd3;
- v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1);
- v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3);
+ __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
+ __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);
+ __m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc);
- __m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep);
- __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep);
- __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep);
- __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep);
+ __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
+ __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
+ __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
- __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1);
- __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3);
- __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1);
- __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3);
-
- a.val = _mm256_unpacklo_epi64(ab0, ab1);
- b.val = _mm256_unpackhi_epi64(ab0, ab1);
- c.val = _mm256_unpacklo_epi64(cd0, cd1);
- d.val = _mm256_unpackhi_epi64(cd0, cd1);
-}
-
-template<typename _Tp, typename _Tpvec>
-inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
-{ v256_store_interleave_l8(ptr, a, b, c, d); }
+ _mm256_storeu_si256((__m256i*)ptr, bgr0);
+ _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
+}
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32)
-OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32)
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a )
+{ // interleave four 32-byte channel vectors into 128 bytes of b,g,r,a quads at ptr
+ __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val); // per-lane b/g byte pairs, low halves
+ __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val); // per-lane b/g byte pairs, high halves
+ __m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val); // per-lane r/a byte pairs, low halves
+ __m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val); // per-lane r/a byte pairs, high halves
+
+ __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0); // merge pairs into b,g,r,a quads (still lane-local order)
+ __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
+ __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
+ __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
+
+ __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); // AVX2 unpacks act within 128-bit lanes;
+ __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); // these permutes restore sequential order
+ __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
+ __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
+
+ _mm256_storeu_si256((__m256i*)ptr, bgra0);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
+ _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
+ _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
+ const v_uint16x16& r, const v_uint16x16& a )
+{ // interleave four 16-element ushort channel vectors into 64 elements of b,g,r,a quads at ptr
+ __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val); // per-lane b/g pairs, low halves
+ __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val); // per-lane b/g pairs, high halves
+ __m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val); // per-lane r/a pairs, low halves
+ __m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val); // per-lane r/a pairs, high halves
+
+ __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0); // merge pairs into b,g,r,a quads (still lane-local order)
+ __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
+ __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
+ __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
+
+ __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); // AVX2 unpacks act within 128-bit lanes;
+ __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); // these permutes restore sequential order
+ __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
+ __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
+
+ _mm256_storeu_si256((__m256i*)ptr, bgra0);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
+ _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
+ const v_uint32x8& r, const v_uint32x8& a )
+{ // interleave four 8-element 32-bit channel vectors into 32 elements of b,g,r,a quads at ptr
+ __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val); // per-lane b/g pairs, low halves
+ __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val); // per-lane b/g pairs, high halves
+ __m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val); // per-lane r/a pairs, low halves
+ __m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val); // per-lane r/a pairs, high halves
+
+ __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0); // merge pairs into b,g,r,a quads (still lane-local order)
+ __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
+ __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
+ __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
+
+ __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); // AVX2 unpacks act within 128-bit lanes;
+ __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); // these permutes restore sequential order
+ __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
+ __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
+
+ _mm256_storeu_si256((__m256i*)ptr, bgra0);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
+ _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
+ const v_uint64x4& r, const v_uint64x4& a )
+{ // interleave four 4-element uint64 channel vectors into 16 elements of b,g,r,a quads at ptr
+ __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val); // b0 g0 | b2 g2
+ __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val); // b1 g1 | b3 g3
+ __m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val); // r0 a0 | r2 a2
+ __m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val); // r1 a1 | r3 a3
+
+ __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16); // b0 g0 r0 a0
+ __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16); // b1 g1 r1 a1
+ __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16); // b2 g2 r2 a2
+ __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16); // b3 g3 r3 a3
+
+ _mm256_storeu_si256((__m256i*)ptr, bgra0);
+ _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
+ _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
+}
+
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0 ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
+}
+
+// Map the signed/float channel types onto the unsigned implementations above
+// through bit-exact reinterpret casts (2-, 3- and 4-channel variants each).
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
inline void v256_cleanup() { _mm256_zeroupper(); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
-struct v_uint8x16;
-struct v_int8x16;
-struct v_uint16x8;
-struct v_int16x8;
-struct v_uint32x4;
-struct v_int32x4;
-struct v_float32x4;
-struct v_uint64x2;
-struct v_int64x2;
-struct v_float64x2;
-
struct v_uint8x16
{
typedef uchar lane_type;
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
-// adopted from sse_utils.hpp
+// load deinterleave
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
-#if CV_SSSE3
+#if CV_SSE4_1
+ static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+ static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+ __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
+ __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+ __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+ __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
+ __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
+ __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
+ static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
+ static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
+ static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+ a0 = _mm_shuffle_epi8(a0, sh_b);
+ b0 = _mm_shuffle_epi8(b0, sh_g);
+ c0 = _mm_shuffle_epi8(c0, sh_r);
+ a.val = a0;
+ b.val = b0;
+ c.val = c0;
+#elif CV_SSSE3
static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
d.val = _mm_unpackhi_epi8(v2, v3);
}
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{ // split 16 interleaved ushorts at ptr into separate a and b vectors
+ __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
+ __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
+
+ __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
+ __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
+ __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
+ __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
+
+ a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
+ b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
+}
+
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
+#if CV_SSE4_1
+ __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
+ __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
+ __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
+ __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
+ __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
+ __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
+
+ static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+ static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+ static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+ a0 = _mm_shuffle_epi8(a0, sh_a);
+ b0 = _mm_shuffle_epi8(b0, sh_b);
+ c0 = _mm_shuffle_epi8(c0, sh_c);
+
+ a.val = a0;
+ b.val = b0;
+ c.val = c0;
+#else
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
__m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
+#endif
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
d.val = _mm_unpackhi_epi16(u2, u3);
}
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{ // split 8 interleaved 32-bit values at ptr into separate a and b vectors
+ __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
+ __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
+
+ __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
+ __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
+
+ a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
+ b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
+}
+
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
- v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
- v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
- v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
- v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
+ v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
+ v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
+ v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
+ v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
- v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
+ v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
+{ // split 8 interleaved floats at ptr into separate a and b vectors
+ const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); // pick even / odd elements
+
+ __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
+ __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
+
+ a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
+ b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
+}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
d.val = _mm_unpackhi_ps(t02hi, t13hi);
}
-inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
{
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
- __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
- a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
- b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
- c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
-}
-
-inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
-{
- v_uint64x2 t0, t1, t2;
- v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
- a = v_reinterpret_as_s64(t0);
- b = v_reinterpret_as_s64(t1);
- c = v_reinterpret_as_s64(t2);
-}
-
-inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
-{
- v_uint64x2 t0, t1, t2;
- v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
- a = v_reinterpret_as_f64(t0);
- b = v_reinterpret_as_f64(t1);
- c = v_reinterpret_as_f64(t2);
+ a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
+ b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
}
-// 2-channel
-inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
- const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+ __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
+ __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
+ __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
- __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
- __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
+ t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
- a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
- b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
+ a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
+ b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
+ c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
}
-inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
+ v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
{
- __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
- __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
-
- __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
- __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
- __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
- __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
+ __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
+ __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
+ __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
+ __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
- a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
- b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7
+ a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
+ b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
+ c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
+ d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
}
-inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b)
-{
- v_int16x8 sa, sb;
- v_load_deinterleave((const short*)ptr, sa, sb);
- a = v_reinterpret_as_u16(sa);
- b = v_reinterpret_as_u16(sb);
-}
-
-inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
-{
- __m128i t0, t1;
- t0 = _mm_unpacklo_epi16(a.val, b.val);
- t1 = _mm_unpackhi_epi16(a.val, b.val);
- _mm_storeu_si128((__m128i*)(ptr), t0);
- _mm_storeu_si128((__m128i*)(ptr + 8), t1);
-}
+// store interleave
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c )
{
-#if CV_SSSE3
+#if CV_SSE4_1
+ static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
+ static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+ static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
+ __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
+ __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
+ __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
+
+ static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+ static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+ __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
+ __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
+ __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
+
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+#elif CV_SSSE3
static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
_mm_storeu_si128((__m128i*)(ptr + 48), v3);
}
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b )
+{ // store 16 ushorts at ptr in interleaved a0 b0 a1 b1 ... order
+ __m128i t0, t1;
+ t0 = _mm_unpacklo_epi16(a.val, b.val); // a0 b0 a1 b1 a2 b2 a3 b3
+ t1 = _mm_unpackhi_epi16(a.val, b.val); // a4 b4 a5 b5 a6 b6 a7 b7
+ _mm_storeu_si128((__m128i*)(ptr), t0);
+ _mm_storeu_si128((__m128i*)(ptr + 8), t1);
+}
+
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
const v_uint16x8& b,
const v_uint16x8& c )
{
+#if CV_SSE4_1
+ static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+ static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
+ static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+ __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
+ __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
+ __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
+
+ __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
+ __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
+ __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
+
+ _mm_storeu_si128((__m128i*)ptr, v0);
+ _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+#else
__m128i z = _mm_setzero_si128();
__m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
__m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
_mm_storeu_si128((__m128i*)(ptr), v0);
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
+#endif
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
_mm_storeu_si128((__m128i*)(ptr + 24), v3);
}
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b )
+{ // store 8 32-bit values at ptr in interleaved a0 b0 a1 b1 ... order
+ __m128i t0 = _mm_unpacklo_epi32(a.val, b.val); // a0 b0 a1 b1
+ __m128i t1 = _mm_unpackhi_epi32(a.val, b.val); // a2 b2 a3 b3
+
+ _mm_storeu_si128((__m128i*)ptr, t0);
+ _mm_storeu_si128((__m128i*)(ptr + 4), t1);
+}
+
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c )
{
_mm_storeu_ps(ptr + 12, v3);
}
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b)
+{ // store 4 uint64 values at ptr in interleaved a0 b0 a1 b1 order
+ __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); // a0 b0
+ __m128i t1 = _mm_unpackhi_epi64(a.val, b.val); // a1 b1
+
+ _mm_storeu_si128((__m128i*)ptr, t0);
+ _mm_storeu_si128((__m128i*)(ptr + 2), t1);
+}
+
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
_mm_storeu_si128((__m128i*)(ptr + 4), t2);
}
-inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d)
{
- v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
-}
+ __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); // a0 b0
+ __m128i t1 = _mm_unpacklo_epi64(c.val, d.val); // c0 d0
+ __m128i t2 = _mm_unpackhi_epi64(a.val, b.val); // a1 b1
+ __m128i t3 = _mm_unpackhi_epi64(c.val, d.val); // c1 d1
-inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
-{
- v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
+ _mm_storeu_si128((__m128i*)ptr, t0);
+ _mm_storeu_si128((__m128i*)(ptr + 2), t1);
+ _mm_storeu_si128((__m128i*)(ptr + 4), t2);
+ _mm_storeu_si128((__m128i*)(ptr + 6), t3);
}
-#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
-inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
- _Tpvec& b0, _Tpvec& c0 ) \
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ \
+ _Tpvec1 a1, b1; \
+ v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+ a0 = v_reinterpret_as_##suffix0(a1); \
+ b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ \
+ _Tpvec1 a1, b1, c1; \
+ v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+ a0 = v_reinterpret_as_##suffix0(a1); \
+ b0 = v_reinterpret_as_##suffix0(b1); \
+ c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
- _Tpuvec a1, b1, c1; \
- v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
- a0 = v_reinterpret_as_##suffix(a1); \
- b0 = v_reinterpret_as_##suffix(b1); \
- c0 = v_reinterpret_as_##suffix(c1); \
+ _Tpvec1 a1, b1, c1, d1; \
+ v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+ a0 = v_reinterpret_as_##suffix0(a1); \
+ b0 = v_reinterpret_as_##suffix0(b1); \
+ c0 = v_reinterpret_as_##suffix0(c1); \
+ d0 = v_reinterpret_as_##suffix0(d1); \
} \
-inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
- _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
{ \
- _Tpuvec a1, b1, c1, d1; \
- v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
- a0 = v_reinterpret_as_##suffix(a1); \
- b0 = v_reinterpret_as_##suffix(b1); \
- c0 = v_reinterpret_as_##suffix(c1); \
- d0 = v_reinterpret_as_##suffix(d1); \
+ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+ v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
-inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
- const _Tpvec& b0, const _Tpvec& c0 ) \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
{ \
- _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
- _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
- _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
- v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
+ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+ v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
-inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
- const _Tpvec& c0, const _Tpvec& d0 ) \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+ const _Tpvec0& c0, const _Tpvec0& d0 ) \
{ \
- _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
- _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
- _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
- _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
- v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
+ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+ _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+ v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
-//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
namespace cv { namespace hal {
-#if CV_NEON
-template<typename T> struct VMerge2;
-template<typename T> struct VMerge3;
-template<typename T> struct VMerge4;
-
-#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
- template<> \
- struct name<data_type>{ \
- void operator()(const data_type* src0, const data_type* src1, \
- data_type* dst){ \
- reg_type r; \
- r.val[0] = load_func(src0); \
- r.val[1] = load_func(src1); \
- store_func(dst, r); \
- } \
- }
+#if CV_SIMD
+template<typename T, typename VecT> static void
+vecmerge_( const T** src, T* dst, int len, int cn )
+{
+ int i;
+ const T* src0 = src[0];
+ const T* src1 = src[1];
-#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
- template<> \
- struct name<data_type>{ \
- void operator()(const data_type* src0, const data_type* src1, \
- const data_type* src2, data_type* dst){ \
- reg_type r; \
- r.val[0] = load_func(src0); \
- r.val[1] = load_func(src1); \
- r.val[2] = load_func(src2); \
- store_func(dst, r); \
- } \
+ const int VECSZ = VecT::nlanes;
+ if( cn == 2 )
+ {
+ for( i = 0; i < len; i += VECSZ )
+ {
+ i = std::min( len - VECSZ, i );
+ VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
+ v_store_interleave(dst + i*cn, a, b);
+ }
}
-
-#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
- template<> \
- struct name<data_type>{ \
- void operator()(const data_type* src0, const data_type* src1, \
- const data_type* src2, const data_type* src3, \
- data_type* dst){ \
- reg_type r; \
- r.val[0] = load_func(src0); \
- r.val[1] = load_func(src1); \
- r.val[2] = load_func(src2); \
- r.val[3] = load_func(src3); \
- store_func(dst, r); \
- } \
+ else if( cn == 3 )
+ {
+ const T* src2 = src[2];
+ for( i = 0; i < len; i += VECSZ )
+ {
+ i = std::min( len - VECSZ, i );
+ VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
+ v_store_interleave(dst + i*cn, a, b, c);
+ }
}
-
-MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
-MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
-MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
-MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
-
-MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
-MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
-MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
-MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
-
-MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
-MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
-MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
-MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
-
-#elif CV_SSE2
-
-template <typename T>
-struct VMerge2
-{
- VMerge2() : support(false) { }
- void operator()(const T *, const T *, T *) const { }
-
- bool support;
-};
-
-template <typename T>
-struct VMerge3
-{
- VMerge3() : support(false) { }
- void operator()(const T *, const T *, const T *, T *) const { }
-
- bool support;
-};
-
-template <typename T>
-struct VMerge4
-{
- VMerge4() : support(false) { }
- void operator()(const T *, const T *, const T *, const T *, T *) const { }
-
- bool support;
-};
-
-#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
-template <> \
-struct VMerge2<data_type> \
-{ \
- enum \
- { \
- ELEMS_IN_VEC = 16 / sizeof(data_type) \
- }; \
- \
- VMerge2() \
- { \
- support = checkHardwareSupport(se); \
- } \
- \
- void operator()(const data_type * src0, const data_type * src1, \
- data_type * dst) const \
- { \
- reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
- reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
- reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
- reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
- \
- _mm_interleave(v_src0, v_src1, v_src2, v_src3); \
- \
- _mm_storeu_##flavor((cast_type *)(dst), v_src0); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
- } \
- \
- bool support; \
-}
-
-#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
-template <> \
-struct VMerge3<data_type> \
-{ \
- enum \
- { \
- ELEMS_IN_VEC = 16 / sizeof(data_type) \
- }; \
- \
- VMerge3() \
- { \
- support = checkHardwareSupport(se); \
- } \
- \
- void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
- data_type * dst) const \
- { \
- reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
- reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
- reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
- reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
- reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
- reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
- \
- _mm_interleave(v_src0, v_src1, v_src2, \
- v_src3, v_src4, v_src5); \
- \
- _mm_storeu_##flavor((cast_type *)(dst), v_src0); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
- } \
- \
- bool support; \
-}
-
-#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
-template <> \
-struct VMerge4<data_type> \
-{ \
- enum \
- { \
- ELEMS_IN_VEC = 16 / sizeof(data_type) \
- }; \
- \
- VMerge4() \
- { \
- support = checkHardwareSupport(se); \
- } \
- \
- void operator()(const data_type * src0, const data_type * src1, \
- const data_type * src2, const data_type * src3, \
- data_type * dst) const \
- { \
- reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
- reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
- reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
- reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
- reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
- reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
- reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
- reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
- \
- _mm_interleave(v_src0, v_src1, v_src2, v_src3, \
- v_src4, v_src5, v_src6, v_src7); \
- \
- _mm_storeu_##flavor((cast_type *)(dst), v_src0); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
- _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
- } \
- \
- bool support; \
+ else
+ {
+ CV_Assert( cn == 4 );
+ const T* src2 = src[2];
+ const T* src3 = src[3];
+ for( i = 0; i < len; i += VECSZ )
+ {
+ i = std::min( len - VECSZ, i );
+ VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
+ VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
+ v_store_interleave(dst + i*cn, a, b, c, d);
+ }
+ }
+ vx_cleanup();
}
-
-MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
-MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
-MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
-
-#if CV_SSE4_1
-MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
-MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
-MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
-#endif
-
-MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
-MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
-MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
-
#endif
template<typename T> static void
{
const T *src0 = src[0], *src1 = src[1];
i = j = 0;
-#if CV_NEON
- if(cn == 2)
- {
- int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
- int inc_j = 2 * inc_i;
-
- VMerge2<T> vmerge;
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
- vmerge(src0 + i, src1 + i, dst + j);
- }
-#elif CV_SSE2
- if(cn == 2)
- {
- int inc_i = 32/sizeof(T);
- int inc_j = 2 * inc_i;
-
- VMerge2<T> vmerge;
- if (vmerge.support)
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
- vmerge(src0 + i, src1 + i, dst + j);
- }
-#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
-#if CV_NEON
- if(cn == 3)
- {
- int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
- int inc_j = 3 * inc_i;
-
- VMerge3<T> vmerge;
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
- vmerge(src0 + i, src1 + i, src2 + i, dst + j);
- }
-#elif CV_SSE2
- if(cn == 3)
- {
- int inc_i = 32/sizeof(T);
- int inc_j = 3 * inc_i;
-
- VMerge3<T> vmerge;
- if (vmerge.support)
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
- vmerge(src0 + i, src1 + i, src2 + i, dst + j);
- }
-#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
-#if CV_NEON
- if(cn == 4)
- {
- int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
- int inc_j = 4 * inc_i;
-
- VMerge4<T> vmerge;
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
- vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
- }
-#elif CV_SSE2
- if(cn == 4)
- {
- int inc_i = 32/sizeof(T);
- int inc_j = 4 * inc_i;
-
- VMerge4<T> vmerge;
- if (vmerge.support)
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
- vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
- }
-#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
}
}
-
void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
- merge_(src, dst, len, cn);
+#if CV_SIMD
+ if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
+ vecmerge_<uchar, v_uint8>(src, dst, len, cn);
+ else
+#endif
+ merge_(src, dst, len, cn);
}
void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
- merge_(src, dst, len, cn);
+#if CV_SIMD
+ if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
+ vecmerge_<ushort, v_uint16>(src, dst, len, cn);
+ else
+#endif
+ merge_(src, dst, len, cn);
}
void merge32s(const int** src, int* dst, int len, int cn )
{
CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
- merge_(src, dst, len, cn);
+#if CV_SIMD
+ if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
+ vecmerge_<int, v_int32>(src, dst, len, cn);
+ else
+#endif
+ merge_(src, dst, len, cn);
}
void merge64s(const int64** src, int64* dst, int len, int cn )
{
CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
- merge_(src, dst, len, cn);
+#if CV_SIMD
+ if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
+ vecmerge_<int64, v_int64>(src, dst, len, cn);
+ else
+#endif
+ merge_(src, dst, len, cn);
}
}} // cv::hal::
namespace cv { namespace hal {
-#if CV_NEON
-template<typename T> struct VSplit2;
-template<typename T> struct VSplit3;
-template<typename T> struct VSplit4;
-
-#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
- template<> \
- struct name<data_type> \
- { \
- void operator()(const data_type* src, data_type* dst0, \
- data_type* dst1) const \
- { \
- reg_type r = load_func(src); \
- store_func(dst0, r.val[0]); \
- store_func(dst1, r.val[1]); \
- } \
- }
+#if CV_SIMD
+template<typename T, typename VecT> static void
+vecsplit_( const T* src, T** dst, int len, int cn )
+{
+ int i;
+ T* dst0 = dst[0];
+ T* dst1 = dst[1];
-#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
- template<> \
- struct name<data_type> \
- { \
- void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
- data_type* dst2) const \
- { \
- reg_type r = load_func(src); \
- store_func(dst0, r.val[0]); \
- store_func(dst1, r.val[1]); \
- store_func(dst2, r.val[2]); \
- } \
+ const int VECSZ = VecT::nlanes;
+ if( cn == 2 )
+ {
+ for( i = 0; i < len; i += VECSZ )
+ {
+ i = std::min( len - VECSZ, i );
+ VecT a, b;
+ v_load_deinterleave(src + i*cn, a, b);
+ v_store(dst0 + i, a);
+ v_store(dst1 + i, b);
+ }
}
-
-#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
- template<> \
- struct name<data_type> \
- { \
- void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
- data_type* dst2, data_type* dst3) const \
- { \
- reg_type r = load_func(src); \
- store_func(dst0, r.val[0]); \
- store_func(dst1, r.val[1]); \
- store_func(dst2, r.val[2]); \
- store_func(dst3, r.val[3]); \
- } \
+ else if( cn == 3 )
+ {
+ T* dst2 = dst[2];
+ for( i = 0; i < len; i += VECSZ )
+ {
+ i = std::min( len - VECSZ, i );
+ VecT a, b, c;
+ v_load_deinterleave(src + i*cn, a, b, c);
+ v_store(dst0 + i, a);
+ v_store(dst1 + i, b);
+ v_store(dst2 + i, c);
+ }
}
-
-SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
-SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
-SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
-SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
-
-SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
-SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
-SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
-SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
-
-SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
-SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
-SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
-SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
-
-#elif CV_SSE2
-
-template <typename T>
-struct VSplit2
-{
- VSplit2() : support(false) { }
- void operator()(const T *, T *, T *) const { }
-
- bool support;
-};
-
-template <typename T>
-struct VSplit3
-{
- VSplit3() : support(false) { }
- void operator()(const T *, T *, T *, T *) const { }
-
- bool support;
-};
-
-template <typename T>
-struct VSplit4
-{
- VSplit4() : support(false) { }
- void operator()(const T *, T *, T *, T *, T *) const { }
-
- bool support;
-};
-
-#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
-template <> \
-struct VSplit2<data_type> \
-{ \
- enum \
- { \
- ELEMS_IN_VEC = 16 / sizeof(data_type) \
- }; \
- \
- VSplit2() \
- { \
- support = checkHardwareSupport(CV_CPU_SSE2); \
- } \
- \
- void operator()(const data_type * src, \
- data_type * dst0, data_type * dst1) const \
- { \
- reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
- reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
- reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
- reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
- \
- _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
- \
- _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
- _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
- _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
- _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
- } \
- \
- bool support; \
-}
-
-#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
-template <> \
-struct VSplit3<data_type> \
-{ \
- enum \
- { \
- ELEMS_IN_VEC = 16 / sizeof(data_type) \
- }; \
- \
- VSplit3() \
- { \
- support = checkHardwareSupport(CV_CPU_SSE2); \
- } \
- \
- void operator()(const data_type * src, \
- data_type * dst0, data_type * dst1, data_type * dst2) const \
- { \
- reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
- reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
- reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
- reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
- reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
- reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
- \
- _mm_deinterleave(v_src0, v_src1, v_src2, \
- v_src3, v_src4, v_src5); \
- \
- _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
- _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
- _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
- _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
- _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
- _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
- } \
- \
- bool support; \
-}
-
-#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
-template <> \
-struct VSplit4<data_type> \
-{ \
- enum \
- { \
- ELEMS_IN_VEC = 16 / sizeof(data_type) \
- }; \
- \
- VSplit4() \
- { \
- support = checkHardwareSupport(CV_CPU_SSE2); \
- } \
- \
- void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
- data_type * dst2, data_type * dst3) const \
- { \
- reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
- reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
- reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
- reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
- reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
- reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
- reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
- reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
- \
- _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
- v_src4, v_src5, v_src6, v_src7); \
- \
- _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
- _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
- _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
- _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
- _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
- _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
- _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
- _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
- } \
- \
- bool support; \
+ else
+ {
+ CV_Assert( cn == 4 );
+ T* dst2 = dst[2];
+ T* dst3 = dst[3];
+ for( i = 0; i < len; i += VECSZ )
+ {
+ i = std::min( len - VECSZ, i );
+ VecT a, b, c, d;
+ v_load_deinterleave(src + i*cn, a, b, c, d);
+ v_store(dst0 + i, a);
+ v_store(dst1 + i, b);
+ v_store(dst2 + i, c);
+ v_store(dst3 + i, d);
+ }
+ }
+ vx_cleanup();
}
-
-SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
-SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
-SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
-
-SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
-SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
-SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
-
-SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
-SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
-SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
-
#endif
template<typename T> static void
T *dst0 = dst[0], *dst1 = dst[1];
i = j = 0;
-#if CV_NEON
- if(cn == 2)
- {
- int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
- int inc_j = 2 * inc_i;
-
- VSplit2<T> vsplit;
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
- vsplit(src + j, dst0 + i, dst1 + i);
- }
-#elif CV_SSE2
- if (cn == 2)
- {
- int inc_i = 32/sizeof(T);
- int inc_j = 2 * inc_i;
-
- VSplit2<T> vsplit;
- if (vsplit.support)
- {
- for( ; i <= len - inc_i; i += inc_i, j += inc_j)
- vsplit(src + j, dst0 + i, dst1 + i);
- }
- }
-#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
i = j = 0;
-#if CV_NEON
- if(cn == 3)
- {
- int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
- int inc_j = 3 * inc_i;
-
- VSplit3<T> vsplit;
- for( ; i <= len - inc_i; i += inc_i, j += inc_j)
- vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
- }
-#elif CV_SSE2
- if (cn == 3)
- {
- int inc_i = 32/sizeof(T);
- int inc_j = 3 * inc_i;
-
- VSplit3<T> vsplit;
-
- if (vsplit.support)
- {
- for( ; i <= len - inc_i; i += inc_i, j += inc_j)
- vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
- }
- }
-#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
i = j = 0;
-#if CV_NEON
- if(cn == 4)
- {
- int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
- int inc_j = 4 * inc_i;
-
- VSplit4<T> vsplit;
- for( ; i <= len - inc_i; i += inc_i, j += inc_j)
- vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
- }
-#elif CV_SSE2
- if (cn == 4)
- {
- int inc_i = 32/sizeof(T);
- int inc_j = 4 * inc_i;
-
- VSplit4<T> vsplit;
- if (vsplit.support)
- {
- for( ; i <= len - inc_i; i += inc_i, j += inc_j)
- vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
- }
- }
-#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
void split8u(const uchar* src, uchar** dst, int len, int cn )
{
CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn)
- split_(src, dst, len, cn);
+
+#if CV_SIMD
+ if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
+ vecsplit_<uchar, v_uint8>(src, dst, len, cn);
+ else
+#endif
+ split_(src, dst, len, cn);
}
void split16u(const ushort* src, ushort** dst, int len, int cn )
{
CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn)
- split_(src, dst, len, cn);
+#if CV_SIMD
+ if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
+ vecsplit_<ushort, v_uint16>(src, dst, len, cn);
+ else
+#endif
+ split_(src, dst, len, cn);
}
void split32s(const int* src, int** dst, int len, int cn )
{
CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn)
- split_(src, dst, len, cn);
+#if CV_SIMD
+ if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
+ vecsplit_<int, v_int32>(src, dst, len, cn);
+ else
+#endif
+ split_(src, dst, len, cn);
}
void split64s(const int64* src, int64** dst, int len, int cn )
{
CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn)
- split_(src, dst, len, cn);
+#if CV_SIMD
+ if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
+ vecsplit_<int64, v_int64>(src, dst, len, cn);
+ else
+#endif
+ split_(src, dst, len, cn);
}
}} // cv::hal::