* 1. changed static const __m128/256 to const __m128/256 to avoid weird instructions and calls inserted by the compiler.
2. added universal intrinsics that wrap MOVNTPS and similar non-temporal ("no cache" store) instructions. v_store_interleave() and v_store() got the respective flags/overloaded variants
3. rewrote split & merge to use the "no cache" store instructions. This resulted in a dramatic performance improvement when processing big arrays
* hopefully, fixed some test failures where 4-channel v_store_interleave() is used
* added the missing implementations of the new universal intrinsics (v_store_aligned_nocache() etc.)
* fixed a silly typo in the new intrinsics in intrin_vsx.hpp
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
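As a rough illustration of how the new flag is meant to be used (a minimal sketch, not the actual split/merge code from this patch; the helper name and loop structure are made up for the example):

    // interleave two planes into dst, using non-temporal stores when the
    // destination is suitably aligned (sketch only)
    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    static void merge2_sketch(const uchar* a, const uchar* b, uchar* dst, int len)
    {
        int i = 0;
    #if CV_SIMD128
        const int VECSZ = v_uint8x16::nlanes;
        // non-temporal stores need an aligned destination; otherwise keep the
        // default unaligned, cached stores
        hal::StoreMode mode = ((size_t)dst % 16 == 0) ? hal::STORE_ALIGNED_NOCACHE
                                                      : hal::STORE_UNALIGNED;
        for( ; i <= len - VECSZ; i += VECSZ )
        {
            v_uint8x16 va = v_load(a + i), vb = v_load(b + i);
            v_store_interleave(dst + i*2, va, vb, mode); // the new optional 'mode' argument
        }
    #endif
        for( ; i < len; i++ )
        {
            dst[i*2] = a[i];
            dst[i*2 + 1] = b[i];
        }
    }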
// access from within opencv code more accessible
namespace cv {
+namespace hal {
+
+enum StoreMode
+{
+ STORE_UNALIGNED = 0,
+ STORE_ALIGNED = 1,
+ STORE_ALIGNED_NOCACHE = 2
+};
+
+}
+
template<typename _Tp> struct V_TypeTraits
{
};
{ _mm256_storeu_si256((__m256i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm256_store_si256((__m256i*)ptr, a.val); } \
+ inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+ { _mm256_stream_si256((__m256i*)ptr, a.val); } \
+ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+ { \
+ if( mode == hal::STORE_UNALIGNED ) \
+ _mm256_storeu_si256((__m256i*)ptr, a.val); \
+ else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+ _mm256_stream_si256((__m256i*)ptr, a.val); \
+ else \
+ _mm256_store_si256((__m256i*)ptr, a.val); \
+ } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm256_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm256_store_##suffix(ptr, a.val); } \
+ inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+ { _mm256_stream_##suffix(ptr, a.val); } \
+ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+ { \
+ if( mode == hal::STORE_UNALIGNED ) \
+ _mm256_storeu_##suffix(ptr, a.val); \
+ else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+ _mm256_stream_##suffix(ptr, a.val); \
+ else \
+ _mm256_store_##suffix(ptr, a.val); \
+ } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
- static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
__m256i p0 = _mm256_shuffle_epi8(ab0, sh);
__m256i p1 = _mm256_shuffle_epi8(ab1, sh);
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
- static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
__m256i p0 = _mm256_shuffle_epi8(ab0, sh);
__m256i p1 = _mm256_shuffle_epi8(ab1, sh);
__m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
__m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
- static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+ const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
- static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+ const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
-1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
__m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
__m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
__m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
- static const __m256i
+ const __m256i
sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
__m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
__m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
- static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+ const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
- static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+ const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
-1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
__m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
__m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
__m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
- static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+ const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
- static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+ const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
- static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+ const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
b0 = _mm256_shuffle_epi8(b0, sh_b);
g0 = _mm256_shuffle_epi8(g0, sh_g);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
__m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
- static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+ const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
__m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
- static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
__m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
__m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
///////////////////////////// store interleave /////////////////////////////////////
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, xy0);
- _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, xy0);
+ _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, xy0);
+ _mm256_store_si256((__m256i*)(ptr + 32), xy1);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, xy0);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
+ }
}
-inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y )
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, xy0);
- _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, xy0);
+ _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, xy0);
+ _mm256_store_si256((__m256i*)(ptr + 16), xy1);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, xy0);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
+ }
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y )
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, xy0);
- _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, xy0);
+ _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, xy0);
+ _mm256_store_si256((__m256i*)(ptr + 8), xy1);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, xy0);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
+ }
}
-inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y )
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, xy0);
- _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, xy0);
+ _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, xy0);
+ _mm256_store_si256((__m256i*)(ptr + 4), xy1);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, xy0);
+ _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
+ }
}
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
- static const __m256i sh_b = _mm256_setr_epi8(
+ const __m256i sh_b = _mm256_setr_epi8(
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
- static const __m256i sh_g = _mm256_setr_epi8(
+ const __m256i sh_g = _mm256_setr_epi8(
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
- static const __m256i sh_r = _mm256_setr_epi8(
+ const __m256i sh_r = _mm256_setr_epi8(
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
__m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
__m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
- static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+ const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
- static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+ const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
__m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
__m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
__m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, bgr0);
- _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
- _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, bgr0);
+ _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
+ _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, bgr0);
+ _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
+ _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, bgr0);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
+ _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
+ }
}
-inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r )
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
- static const __m256i sh_b = _mm256_setr_epi8(
+ const __m256i sh_b = _mm256_setr_epi8(
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
- static const __m256i sh_g = _mm256_setr_epi8(
+ const __m256i sh_g = _mm256_setr_epi8(
10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
- static const __m256i sh_r = _mm256_setr_epi8(
+ const __m256i sh_r = _mm256_setr_epi8(
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
__m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
__m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
- static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+ const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
- static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+ const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
-1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
__m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
//__m256i bgr1 = p1;
__m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, bgr0);
- _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
- _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, bgr0);
+ _mm256_stream_si256((__m256i*)(ptr + 16), p1);
+ _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, bgr0);
+ _mm256_store_si256((__m256i*)(ptr + 16), p1);
+ _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, bgr0);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
+ }
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r )
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
__m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);
//__m256i bgr1 = p2;
__m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, bgr0);
- _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
- _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, bgr0);
+ _mm256_stream_si256((__m256i*)(ptr + 8), p2);
+ _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, bgr0);
+ _mm256_store_si256((__m256i*)(ptr + 8), p2);
+ _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, bgr0);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
+ }
}
-inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r )
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
__m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);
__m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
__m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, bgr0);
- _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
- _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, bgr0);
+ _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
+ _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, bgr0);
+ _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
+ _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, bgr0);
+ _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
+ }
}
-inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a )
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
+ const v_uint8x32& r, const v_uint8x32& a,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, bgra0);
- _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
- _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
- _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, bgra0);
+ _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
+ _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
+ _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, bgra0);
+ _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
+ _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
+ _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, bgra0);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
+ _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
+ _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
+ }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
- const v_uint16x16& r, const v_uint16x16& a )
+ const v_uint16x16& r, const v_uint16x16& a,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, bgra0);
- _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
- _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
- _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, bgra0);
+ _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
+ _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
+ _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, bgra0);
+ _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
+ _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
+ _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, bgra0);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
+ _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
+ _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
+ }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
- const v_uint32x8& r, const v_uint32x8& a )
+ const v_uint32x8& r, const v_uint32x8& a,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, bgra0);
- _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
- _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
- _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, bgra0);
+ _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
+ _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
+ _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, bgra0);
+ _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
+ _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
+ _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, bgra0);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
+ _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
+ _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
+ }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
- const v_uint64x4& r, const v_uint64x4& a )
+ const v_uint64x4& r, const v_uint64x4& a,
+ hal::StoreMode mode=hal::STORE_UNALIGNED )
{
__m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
__m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
__m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
- _mm256_storeu_si256((__m256i*)ptr, bgra0);
- _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
- _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
- _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm256_stream_si256((__m256i*)ptr, bgra0);
+ _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
+ _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
+ _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm256_store_si256((__m256i*)ptr, bgra0);
+ _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
+ _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
+ _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
+ }
+ else
+ {
+ _mm256_storeu_si256((__m256i*)ptr, bgra0);
+ _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
+ _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
+ _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
+ }
}
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
c0 = v_reinterpret_as_##suffix0(c1); \
d0 = v_reinterpret_as_##suffix0(d1); \
} \
-inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+ hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
- v_store_interleave((_Tp1*)ptr, a1, b1); \
+ v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
-inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
+ hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
- v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
+ v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
- const _Tpvec0& c0, const _Tpvec0& d0 ) \
+ const _Tpvec0& c0, const _Tpvec0& d0, \
+ hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
_Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
- v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
+ v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
- const v_reg<_Tp, n>& b)
+ const v_reg<_Tp, n>& b,
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i2;
for( i = i2 = 0; i < n; i++, i2 += 2 )
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
- const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
+ const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i3;
for( i = i3 = 0; i < n; i++, i3 += 3 )
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
- const v_reg<_Tp, n>& d)
+ const v_reg<_Tp, n>& d,
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
int i, i4;
for( i = i4 = 0; i < n; i++, i4 += 4 )
ptr[i] = a.s[i];
}
+template<typename _Tp, int n>
+inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+ for( int i = 0; i < n; i++ )
+ ptr[i] = a.s[i];
+}
+
+template<typename _Tp, int n>
+inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
+{
+ for( int i = 0; i < n; i++ )
+ ptr[i] = a.s[i];
+}
+
/** @brief Combine vector from first elements of two vectors
Scheme:
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
c.val = v.val[2]; \
d.val = v.val[3]; \
} \
-inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
} \
-inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+ const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
_Tpvec##x3_t v; \
v.val[0] = a.val; \
vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
- const v_##_Tpvec& c, const v_##_Tpvec& d) \
+ const v_##_Tpvec& c, const v_##_Tpvec& d, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
_Tpvec##x4_t v; \
v.val[0] = a.val; \
d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
-inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
- const v_##tp##x2& b, const v_##tp##x2& c ) \
+ const v_##tp##x2& b, const v_##tp##x2& c, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
- const v_##tp##x2& c, const v_##tp##x2& d ) \
+ const v_##tp##x2& c, const v_##tp##x2& d, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
- static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
+ const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
__m128 t = x.val;
__m128 h = _mm_mul_ps(t, _0_5);
t = _mm_rsqrt_ps(t);
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
- static const __m128d v_1 = _mm_set1_pd(1.);
+ const __m128d v_1 = _mm_set1_pd(1.);
return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ _mm_stream_si128((__m128i*)ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+{ \
+ if( mode == hal::STORE_UNALIGNED ) \
+ _mm_storeu_si128((__m128i*)ptr, a.val); \
+ else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+ _mm_stream_si128((__m128i*)ptr, a.val); \
+ else \
+ _mm_store_si128((__m128i*)ptr, a.val); \
+} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ _mm_stream_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+{ \
+ if( mode == hal::STORE_UNALIGNED ) \
+ _mm_storeu_##suffix(ptr, a.val); \
+ else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+ _mm_stream_##suffix(ptr, a.val); \
+ else \
+ _mm_store_##suffix(ptr, a.val); \
+} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSE4_1
- static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
- static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+ const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+ const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
__m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
__m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
__m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
__m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
__m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
- static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
- static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
- static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+ const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
+ const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
+ const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
a0 = _mm_shuffle_epi8(a0, sh_b);
b0 = _mm_shuffle_epi8(b0, sh_g);
c0 = _mm_shuffle_epi8(c0, sh_r);
b.val = b0;
c.val = c0;
#elif CV_SSSE3
- static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
- static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
- static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
+ const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
+ const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
+ const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
__m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
__m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
- static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
- static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
- static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+ const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+ const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+ const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
a0 = _mm_shuffle_epi8(a0, sh_a);
b0 = _mm_shuffle_epi8(b0, sh_b);
c0 = _mm_shuffle_epi8(c0, sh_c);
// store interleave
-inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
__m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
__m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
- _mm_storeu_si128((__m128i*)(ptr), v0);
- _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 16), v1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 16), v1);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+ }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
- const v_uint8x16& c )
+ const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
#if CV_SSE4_1
- static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
- static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
- static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
+ const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
+ const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+ const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
- static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
- static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+ const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+ const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
__m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
__m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
__m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
-
- _mm_storeu_si128((__m128i*)(ptr), v0);
- _mm_storeu_si128((__m128i*)(ptr + 16), v1);
- _mm_storeu_si128((__m128i*)(ptr + 32), v2);
#elif CV_SSSE3
- static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
- static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
- static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
+ const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
+ const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
+ const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
__m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
t0 = _mm_alignr_epi8(c.val, t0, 5);
- __m128i s0 = _mm_shuffle_epi8(t0, m0);
+ __m128i v0 = _mm_shuffle_epi8(t0, m0);
__m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
- __m128i s1 = _mm_shuffle_epi8(t1, m1);
+ __m128i v1 = _mm_shuffle_epi8(t1, m1);
__m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
t2 = _mm_alignr_epi8(t2, a.val, 11);
- __m128i s2 = _mm_shuffle_epi8(t2, m2);
-
- _mm_storeu_si128((__m128i*)ptr, s0);
- _mm_storeu_si128((__m128i*)(ptr + 16), s1);
- _mm_storeu_si128((__m128i*)(ptr + 32), s2);
+ __m128i v2 = _mm_shuffle_epi8(t2, m2);
#else
__m128i z = _mm_setzero_si128();
__m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
__m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
__m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
__m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
-
- _mm_storeu_si128((__m128i*)(ptr), v0);
- _mm_storeu_si128((__m128i*)(ptr + 16), v1);
- _mm_storeu_si128((__m128i*)(ptr + 32), v2);
#endif
+
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 16), v1);
+ _mm_stream_si128((__m128i*)(ptr + 32), v2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 16), v1);
+ _mm_store_si128((__m128i*)(ptr + 32), v2);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+ }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
- const v_uint8x16& c, const v_uint8x16& d)
+ const v_uint8x16& c, const v_uint8x16& d,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
// a0 a1 a2 a3 ....
// b0 b1 b2 b3 ....
__m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
__m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
- __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
- __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
+ __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
+ __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
__m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
- _mm_storeu_si128((__m128i*)ptr, v0);
- _mm_storeu_si128((__m128i*)(ptr + 16), v2);
- _mm_storeu_si128((__m128i*)(ptr + 32), v1);
- _mm_storeu_si128((__m128i*)(ptr + 48), v3);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 16), v1);
+ _mm_stream_si128((__m128i*)(ptr + 32), v2);
+ _mm_stream_si128((__m128i*)(ptr + 48), v3);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 16), v1);
+ _mm_store_si128((__m128i*)(ptr + 32), v2);
+ _mm_store_si128((__m128i*)(ptr + 48), v3);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+ _mm_storeu_si128((__m128i*)(ptr + 48), v3);
+ }
}
-inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b )
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
- __m128i t0, t1;
- t0 = _mm_unpacklo_epi16(a.val, b.val);
- t1 = _mm_unpackhi_epi16(a.val, b.val);
- _mm_storeu_si128((__m128i*)(ptr), t0);
- _mm_storeu_si128((__m128i*)(ptr + 8), t1);
+ __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
+ __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
+
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 8), v1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 8), v1);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+ }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
- const v_uint16x8& b,
- const v_uint16x8& c )
+ const v_uint16x8& b, const v_uint16x8& c,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
#if CV_SSE4_1
- static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
- static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
- static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+ const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+ const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
+ const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
__m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
__m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
__m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
-
- _mm_storeu_si128((__m128i*)ptr, v0);
- _mm_storeu_si128((__m128i*)(ptr + 8), v1);
- _mm_storeu_si128((__m128i*)(ptr + 16), v2);
#else
__m128i z = _mm_setzero_si128();
__m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
__m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
__m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
__m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
-
- _mm_storeu_si128((__m128i*)(ptr), v0);
- _mm_storeu_si128((__m128i*)(ptr + 8), v1);
- _mm_storeu_si128((__m128i*)(ptr + 16), v2);
#endif
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 8), v1);
+ _mm_stream_si128((__m128i*)(ptr + 16), v2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 8), v1);
+ _mm_store_si128((__m128i*)(ptr + 16), v2);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+ }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
- const v_uint16x8& c, const v_uint16x8& d)
+ const v_uint16x8& c, const v_uint16x8& d,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
// a0 a1 a2 a3 ....
// b0 b1 b2 b3 ....
__m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
__m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
- __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
- __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
+ __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
+ __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
__m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
- _mm_storeu_si128((__m128i*)ptr, v0);
- _mm_storeu_si128((__m128i*)(ptr + 8), v2);
- _mm_storeu_si128((__m128i*)(ptr + 16), v1);
- _mm_storeu_si128((__m128i*)(ptr + 24), v3);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 8), v1);
+ _mm_stream_si128((__m128i*)(ptr + 16), v2);
+ _mm_stream_si128((__m128i*)(ptr + 24), v3);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 8), v1);
+ _mm_store_si128((__m128i*)(ptr + 16), v2);
+ _mm_store_si128((__m128i*)(ptr + 24), v3);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+ _mm_storeu_si128((__m128i*)(ptr + 24), v3);
+ }
}
-inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b )
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
- __m128i t0 = _mm_unpacklo_epi32(a.val, b.val);
- __m128i t1 = _mm_unpackhi_epi32(a.val, b.val);
+ __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
+ __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
- _mm_storeu_si128((__m128i*)ptr, t0);
- _mm_storeu_si128((__m128i*)(ptr + 4), t1);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 4), v1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 4), v1);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 4), v1);
+ }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
- const v_uint32x4& c )
+ const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
__m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
__m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
- _mm_storeu_si128((__m128i*)ptr, v0);
- _mm_storeu_si128((__m128i*)(ptr + 4), v1);
- _mm_storeu_si128((__m128i*)(ptr + 8), v2);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 4), v1);
+ _mm_stream_si128((__m128i*)(ptr + 8), v2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 4), v1);
+ _mm_store_si128((__m128i*)(ptr + 8), v2);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 4), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 8), v2);
+ }
}
inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
- const v_uint32x4& c, const v_uint32x4& d)
+ const v_uint32x4& c, const v_uint32x4& d,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
- v_uint32x4 t0, t1, t2, t3;
- v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
- v_store(ptr, t0);
- v_store(ptr + 4, t1);
- v_store(ptr + 8, t2);
- v_store(ptr + 12, t3);
+ v_uint32x4 v0, v1, v2, v3;
+ v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
+
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0.val);
+ _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
+ _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
+ _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0.val);
+ _mm_store_si128((__m128i*)(ptr + 4), v1.val);
+ _mm_store_si128((__m128i*)(ptr + 8), v2.val);
+ _mm_store_si128((__m128i*)(ptr + 12), v3.val);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0.val);
+ _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
+ _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
+ _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
+ }
}
// 2-channel, float only
-inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
- // a0 a1 a2 a3 ...
- // b0 b1 b2 b3 ...
- __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
- __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
+ __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
+ __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
- _mm_storeu_ps(ptr, u0);
- _mm_storeu_ps((ptr + 4), u1);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_ps(ptr, v0);
+ _mm_stream_ps(ptr + 4, v1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_ps(ptr, v0);
+ _mm_store_ps(ptr + 4, v1);
+ }
+ else
+ {
+ _mm_storeu_ps(ptr, v0);
+ _mm_storeu_ps(ptr + 4, v1);
+ }
}
-inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+ const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
__m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
__m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
__m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
__m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
- _mm_storeu_ps(ptr + 0, v0);
- _mm_storeu_ps(ptr + 4, v1);
- _mm_storeu_ps(ptr + 8, v2);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_ps(ptr, v0);
+ _mm_stream_ps(ptr + 4, v1);
+ _mm_stream_ps(ptr + 8, v2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_ps(ptr, v0);
+ _mm_store_ps(ptr + 4, v1);
+ _mm_store_ps(ptr + 8, v2);
+ }
+ else
+ {
+ _mm_storeu_ps(ptr, v0);
+ _mm_storeu_ps(ptr + 4, v1);
+ _mm_storeu_ps(ptr + 8, v2);
+ }
}
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
- const v_float32x4& c, const v_float32x4& d)
+ const v_float32x4& c, const v_float32x4& d,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
__m128 u0 = _mm_unpacklo_ps(a.val, c.val);
__m128 u1 = _mm_unpacklo_ps(b.val, d.val);
__m128 v1 = _mm_unpackhi_ps(u0, u1);
__m128 v3 = _mm_unpackhi_ps(u2, u3);
- _mm_storeu_ps(ptr + 0, v0);
- _mm_storeu_ps(ptr + 4, v1);
- _mm_storeu_ps(ptr + 8, v2);
- _mm_storeu_ps(ptr + 12, v3);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_ps(ptr, v0);
+ _mm_stream_ps(ptr + 4, v1);
+ _mm_stream_ps(ptr + 8, v2);
+ _mm_stream_ps(ptr + 12, v3);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_ps(ptr, v0);
+ _mm_store_ps(ptr + 4, v1);
+ _mm_store_ps(ptr + 8, v2);
+ _mm_store_ps(ptr + 12, v3);
+ }
+ else
+ {
+ _mm_storeu_ps(ptr, v0);
+ _mm_storeu_ps(ptr + 4, v1);
+ _mm_storeu_ps(ptr + 8, v2);
+ _mm_storeu_ps(ptr + 12, v3);
+ }
}
-inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b)
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
- __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
- __m128i t1 = _mm_unpackhi_epi64(a.val, b.val);
+ __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
+ __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
- _mm_storeu_si128((__m128i*)ptr, t0);
- _mm_storeu_si128((__m128i*)(ptr + 2), t1);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 2), v1);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 2), v1);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+ }
}
-inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+ const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
{
- __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
- __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
- __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);
+ __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
+ __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
+ __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
- _mm_storeu_si128((__m128i*)ptr, t0);
- _mm_storeu_si128((__m128i*)(ptr + 2), t1);
- _mm_storeu_si128((__m128i*)(ptr + 4), t2);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 2), v1);
+ _mm_stream_si128((__m128i*)(ptr + 4), v2);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 2), v1);
+ _mm_store_si128((__m128i*)(ptr + 4), v2);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 4), v2);
+ }
}
-inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d)
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+ const v_uint64x2& c, const v_uint64x2& d,
+ hal::StoreMode mode = hal::STORE_UNALIGNED)
{
- __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
- __m128i t1 = _mm_unpacklo_epi64(c.val, d.val);
- __m128i t2 = _mm_unpackhi_epi64(a.val, b.val);
- __m128i t3 = _mm_unpackhi_epi64(c.val, d.val);
+ __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
+ __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
+ __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
+ __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
- _mm_storeu_si128((__m128i*)ptr, t0);
- _mm_storeu_si128((__m128i*)(ptr + 2), t1);
- _mm_storeu_si128((__m128i*)(ptr + 4), t2);
- _mm_storeu_si128((__m128i*)(ptr + 6), t3);
+ if( mode == hal::STORE_ALIGNED_NOCACHE )
+ {
+ _mm_stream_si128((__m128i*)(ptr), v0);
+ _mm_stream_si128((__m128i*)(ptr + 2), v1);
+ _mm_stream_si128((__m128i*)(ptr + 4), v2);
+ _mm_stream_si128((__m128i*)(ptr + 6), v3);
+ }
+ else if( mode == hal::STORE_ALIGNED )
+ {
+ _mm_store_si128((__m128i*)(ptr), v0);
+ _mm_store_si128((__m128i*)(ptr + 2), v1);
+ _mm_store_si128((__m128i*)(ptr + 4), v2);
+ _mm_store_si128((__m128i*)(ptr + 6), v3);
+ }
+ else
+ {
+ _mm_storeu_si128((__m128i*)(ptr), v0);
+ _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+ _mm_storeu_si128((__m128i*)(ptr + 4), v2);
+ _mm_storeu_si128((__m128i*)(ptr + 6), v3);
+ }
}
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
c0 = v_reinterpret_as_##suffix0(c1); \
d0 = v_reinterpret_as_##suffix0(d1); \
} \
-inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+ hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
- v_store_interleave((_Tp1*)ptr, a1, b1); \
+ v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
-inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+ const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
- v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
+ v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
- const _Tpvec0& c0, const _Tpvec0& d0 ) \
+ const _Tpvec0& c0, const _Tpvec0& d0, \
+ hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
_Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
- v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
+ v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
{ st(a.val, 0, ptr); } \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
{ st_a(a.val, 0, ptr); } \
+inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
+{ st_a(a.val, 0, ptr); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
_Tpvec& c, _Tpvec& d) \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
-inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
- const _Tpvec& b, const _Tpvec& c) \
+ const _Tpvec& b, const _Tpvec& c, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
- const _Tpvec& c, const _Tpvec& d) \
+ const _Tpvec& c, const _Tpvec& d, \
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
- static const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
- static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
- static const v_float32 vminval = vx_setall_f32(minval);
- static const v_float32 vmaxval = vx_setall_f32(maxval);
+ const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
+ const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
+ const v_float32 vminval = vx_setall_f32(minval);
+ const v_float32 vmaxval = vx_setall_f32(maxval);
- static const v_float32 vA1 = vx_setall_f32((float)A1);
- static const v_float32 vA2 = vx_setall_f32((float)A2);
- static const v_float32 vA3 = vx_setall_f32((float)A3);
- static const v_float32 vA4 = vx_setall_f32((float)A4);
+ const v_float32 vA1 = vx_setall_f32((float)A1);
+ const v_float32 vA2 = vx_setall_f32((float)A2);
+ const v_float32 vA3 = vx_setall_f32((float)A3);
+ const v_float32 vA4 = vx_setall_f32((float)A4);
- static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
+ const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;
for( ; i < n; i += VECSZ*2 )
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
- static const v_float64 vprescale = vx_setall_f64(exp_prescale);
- static const v_float64 vpostscale = vx_setall_f64(exp_postscale);
- static const v_float64 vminval = vx_setall_f64(minval);
- static const v_float64 vmaxval = vx_setall_f64(maxval);
-
- static const v_float64 vA1 = vx_setall_f64(A1);
- static const v_float64 vA2 = vx_setall_f64(A2);
- static const v_float64 vA3 = vx_setall_f64(A3);
- static const v_float64 vA4 = vx_setall_f64(A4);
- static const v_float64 vA5 = vx_setall_f64(A5);
-
- static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
+ const v_float64 vprescale = vx_setall_f64(exp_prescale);
+ const v_float64 vpostscale = vx_setall_f64(exp_postscale);
+ const v_float64 vminval = vx_setall_f64(minval);
+ const v_float64 vmaxval = vx_setall_f64(maxval);
+
+ const v_float64 vA1 = vx_setall_f64(A1);
+ const v_float64 vA2 = vx_setall_f64(A2);
+ const v_float64 vA3 = vx_setall_f64(A3);
+ const v_float64 vA4 = vx_setall_f64(A4);
+ const v_float64 vA5 = vx_setall_f64(A5);
+
+ const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;
for( ; i < n; i += VECSZ*2 )
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
- static const v_float32 vln2 = vx_setall_f32((float)ln_2);
- static const v_float32 v1 = vx_setall_f32(1.f);
- static const v_float32 vshift = vx_setall_f32(-1.f/512);
+ const v_float32 vln2 = vx_setall_f32((float)ln_2);
+ const v_float32 v1 = vx_setall_f32(1.f);
+ const v_float32 vshift = vx_setall_f32(-1.f/512);
- static const v_float32 vA0 = vx_setall_f32(A0);
- static const v_float32 vA1 = vx_setall_f32(A1);
- static const v_float32 vA2 = vx_setall_f32(A2);
+ const v_float32 vA0 = vx_setall_f32(A0);
+ const v_float32 vA1 = vx_setall_f32(A1);
+ const v_float32 vA2 = vx_setall_f32(A2);
for( ; i < n; i += VECSZ )
{
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
- static const v_float64 vln2 = vx_setall_f64(ln_2);
+ const v_float64 vln2 = vx_setall_f64(ln_2);
- static const v_float64
+ const v_float64
vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
namespace cv { namespace hal {
#if CV_SIMD
+/*
+ The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
+ on IA there are instructions such as MOVNTPS to which
+ v_store_interleave(..., STORE_ALIGNED_NOCACHE) is mapped.
+ Those instructions write directly into memory without touching the cache,
+ which results in dramatic speed improvements, especially on
+ large arrays (FullHD, 4K etc.).
+
+ Those instructions require the destination address to be aligned
+ by 16/32 bytes (with SSE2 and AVX2, respectively).
+ So we potentially split the processing into 3 stages:
+ 1) the optional prefix part [0:i0), where we use simple unaligned stores.
+ 2) the optional main part [i0:len-VECSZ), where we use the "nocache" mode.
+ In some cases we still have to use unaligned stores in this part.
+ 3) the optional suffix part (the tail) [len-VECSZ:len), where we switch back to the "unaligned" mode
+ to store the last VECSZ elements, which may partially overlap the main part.
+ In principle the data can be so poorly aligned that there is no main part at all;
+ in that case we keep i0 = 0 and use unaligned stores for the whole array.
+*/
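To make the three-stage scheme above concrete, here is a minimal standalone sketch (not part of the patch) that applies it to a plain single-channel float copy using raw SSE intrinsics instead of the universal ones; the helper name copy_nocache and the len < VECSZ*2 cutoff are illustrative only, while the real vecmerge_/vecsplit_ below work with interleaved channels and check the alignment of every destination pointer:

#include <xmmintrin.h>   // _mm_loadu_ps, _mm_storeu_ps, _mm_stream_ps, _mm_sfence
#include <cstddef>

static void copy_nocache(const float* src, float* dst, int len)
{
    const int VECSZ = 4;                              // floats per 128-bit register
    int i = 0;
    // byte misalignment of dst relative to the 16-byte boundary
    int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(float)));
    if( r % sizeof(float) != 0 || len < VECSZ*2 )
    {
        // dst can never become 16-byte aligned (or the array is tiny):
        // use plain unaligned vector stores for the whole main part
        for( ; i + VECSZ <= len; i += VECSZ )
            _mm_storeu_ps(dst + i, _mm_loadu_ps(src + i));
    }
    else
    {
        // 1) scalar prefix until dst + i is 16-byte aligned
        int i0 = r == 0 ? 0 : VECSZ - (int)(r/sizeof(float));
        for( ; i < i0; i++ )
            dst[i] = src[i];
        // 2) aligned streaming ("nocache") main part
        for( ; i + VECSZ <= len; i += VECSZ )
            _mm_stream_ps(dst + i, _mm_loadu_ps(src + i));
        _mm_sfence();                                 // order the non-temporal stores
    }
    // 3) scalar tail
    for( ; i < len; i++ )
        dst[i] = src[i];
}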
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
- int i;
+ const int VECSZ = VecT::nlanes;
+ int i, i0 = 0;
const T* src0 = src[0];
const T* src1 = src[1];
- const int VECSZ = VecT::nlanes;
+ int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
+ hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
+ if( r != 0 )
+ {
+ mode = hal::STORE_UNALIGNED;
+ if( r % cn == 0 && len > VECSZ )
+ i0 = VECSZ - (r / cn);
+ }
+
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
- i = std::min( len - VECSZ, i );
+ if( i > len - VECSZ )
+ {
+ i = len - VECSZ;
+ mode = hal::STORE_UNALIGNED;
+ }
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
- v_store_interleave(dst + i*cn, a, b);
+ v_store_interleave(dst + i*cn, a, b, mode);
+ if( i < i0 )
+ {
+ i = i0 - VECSZ;
+ mode = hal::STORE_ALIGNED_NOCACHE;
+ }
}
}
else if( cn == 3 )
const T* src2 = src[2];
for( i = 0; i < len; i += VECSZ )
{
- i = std::min( len - VECSZ, i );
+ if( i > len - VECSZ )
+ {
+ i = len - VECSZ;
+ mode = hal::STORE_UNALIGNED;
+ }
VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
- v_store_interleave(dst + i*cn, a, b, c);
+ v_store_interleave(dst + i*cn, a, b, c, mode);
+ if( i < i0 )
+ {
+ i = i0 - VECSZ;
+ mode = hal::STORE_ALIGNED_NOCACHE;
+ }
}
}
else
const T* src3 = src[3];
for( i = 0; i < len; i += VECSZ )
{
- i = std::min( len - VECSZ, i );
+ if( i > len - VECSZ )
+ {
+ i = len - VECSZ;
+ mode = hal::STORE_UNALIGNED;
+ }
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
- v_store_interleave(dst + i*cn, a, b, c, d);
+ v_store_interleave(dst + i*cn, a, b, c, d, mode);
+ if( i < i0 )
+ {
+ i = i0 - VECSZ;
+ mode = hal::STORE_ALIGNED_NOCACHE;
+ }
}
}
vx_cleanup();
namespace cv { namespace hal {
#if CV_SIMD
+// see the comments for vecmerge_ in merge.cpp
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
- int i;
+ const int VECSZ = VecT::nlanes;
+ int i, i0 = 0;
T* dst0 = dst[0];
T* dst1 = dst[1];
- const int VECSZ = VecT::nlanes;
+ int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T)));
+ int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T)));
+ int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0;
+ int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0;
+
+ hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
+ if( (r0|r1|r2|r3) != 0 )
+ {
+ mode = hal::STORE_UNALIGNED;
+ if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ )
+ i0 = VECSZ - (r0 / cn);
+ }
+
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
- i = std::min( len - VECSZ, i );
+ if( i > len - VECSZ )
+ {
+ i = len - VECSZ;
+ mode = hal::STORE_UNALIGNED;
+ }
VecT a, b;
v_load_deinterleave(src + i*cn, a, b);
- v_store(dst0 + i, a);
- v_store(dst1 + i, b);
+ v_store(dst0 + i, a, mode);
+ v_store(dst1 + i, b, mode);
+ if( i < i0 )
+ {
+ i = i0 - VECSZ;
+ mode = hal::STORE_ALIGNED_NOCACHE;
+ }
}
}
else if( cn == 3 )
T* dst2 = dst[2];
for( i = 0; i < len; i += VECSZ )
{
- i = std::min( len - VECSZ, i );
+ if( i > len - VECSZ )
+ {
+ i = len - VECSZ;
+ mode = hal::STORE_UNALIGNED;
+ }
VecT a, b, c;
v_load_deinterleave(src + i*cn, a, b, c);
- v_store(dst0 + i, a);
- v_store(dst1 + i, b);
- v_store(dst2 + i, c);
+ v_store(dst0 + i, a, mode);
+ v_store(dst1 + i, b, mode);
+ v_store(dst2 + i, c, mode);
+ if( i < i0 )
+ {
+ i = i0 - VECSZ;
+ mode = hal::STORE_ALIGNED_NOCACHE;
+ }
}
}
else
T* dst3 = dst[3];
for( i = 0; i < len; i += VECSZ )
{
- i = std::min( len - VECSZ, i );
+ if( i > len - VECSZ )
+ {
+ i = len - VECSZ;
+ mode = hal::STORE_UNALIGNED;
+ }
VecT a, b, c, d;
v_load_deinterleave(src + i*cn, a, b, c, d);
- v_store(dst0 + i, a);
- v_store(dst1 + i, b);
- v_store(dst2 + i, c);
- v_store(dst3 + i, d);
+ v_store(dst0 + i, a, mode);
+ v_store(dst1 + i, b, mode);
+ v_store(dst2 + i, c, mode);
+ v_store(dst3 + i, d, mode);
+ if( i < i0 )
+ {
+ i = i0 - VECSZ;
+ mode = hal::STORE_ALIGNED_NOCACHE;
+ }
}
}
vx_cleanup();