From: Vadim Pisarevsky
Date: Thu, 26 Jul 2018 09:04:28 +0000 (+0300)
Subject: further improvements in split & merge; started using non-temporal store instructions...
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~597^2~22
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=43820d89b475dd32d11b441eaeef998dcd530752;p=platform%2Fupstream%2Fopencv.git

further improvements in split & merge; started using non-temporal store instructions (#12063)

* 1. changed static const __m128/256 to const __m128/256 to avoid weird instructions and calls inserted by the compiler.
  2. added universal intrinsics that wrap MOVNTPS and other such non-temporal ("no cache") store instructions; v_store_interleave() and v_store() got overloaded variants that take the respective store-mode flag.
  3. rewrote split & merge to use the "no cache" store instructions. This resulted in a dramatic performance improvement when processing big arrays.
* hopefully, fixed some test failures where 4-channel v_store_interleave() is used
* added the missing implementations of the new universal intrinsics (v_store_aligned_nocache() etc.)
* fixed a silly typo in the new intrinsics in intrin_vsx.hpp
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
---

diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 031f8f3..9569e61 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -60,6 +60,17 @@
 // access from within opencv code more accessible
 namespace cv {
 
+namespace hal {
+
+enum StoreMode
+{
+    STORE_UNALIGNED = 0,
+    STORE_ALIGNED = 1,
+    STORE_ALIGNED_NOCACHE = 2
+};
+
+}
+
 template<typename _Tp> struct V_TypeTraits
 {
 };
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 4ea66f5..5c2d0b6 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -304,6 +304,17 @@ inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1
     { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
     inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
     { _mm256_store_si256((__m256i*)ptr, a.val); } \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+    { _mm256_stream_si256((__m256i*)ptr, a.val); } \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm256_storeu_si256((__m256i*)ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+            _mm256_stream_si256((__m256i*)ptr, a.val); \
+        else \
+            _mm256_store_si256((__m256i*)ptr, a.val); \
+    } \
     inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
     { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
     inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -338,6 +349,17 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
     { _mm256_storeu_##suffix(ptr, a.val); } \
     inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
     { _mm256_store_##suffix(ptr, a.val); } \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+    { _mm256_stream_##suffix(ptr, a.val); } \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm256_storeu_##suffix(ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+            _mm256_stream_##suffix(ptr, a.val); \
+        else \
+
_mm256_store_##suffix(ptr, a.val); \ + } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -1616,7 +1638,7 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); - static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); __m256i p0 = _mm256_shuffle_epi8(ab0, sh); __m256i p1 = _mm256_shuffle_epi8(ab1, sh); @@ -1633,7 +1655,7 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); - static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); __m256i p0 = _mm256_shuffle_epi8(ab0, sh); __m256i p1 = _mm256_shuffle_epi8(ab1, sh); @@ -1683,16 +1705,16 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); - static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); - static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1); __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0); __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1); - static const __m256i + const __m256i sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13), sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, @@ -1717,18 +1739,18 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); - static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); - static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1); __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0); - static const __m256i 
sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); - static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, + const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); - static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); b0 = _mm256_shuffle_epi8(b0, sh_b); g0 = _mm256_shuffle_epi8(g0, sh_g); @@ -1785,7 +1807,7 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64)); __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96)); - static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); __m256i p0 = _mm256_shuffle_epi8(bgr0, sh); @@ -1820,7 +1842,7 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48)); - static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); __m256i p0 = _mm256_shuffle_epi8(bgr0, sh); __m256i p1 = _mm256_shuffle_epi8(bgr1, sh); @@ -1901,7 +1923,8 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g ///////////////////////////// store interleave ///////////////////////////////////// -inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y ) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val); __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val); @@ -1909,11 +1932,25 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x3 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, xy0); - _mm256_storeu_si256((__m256i*)(ptr + 32), xy1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, xy0); + _mm256_stream_si256((__m256i*)(ptr + 32), xy1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, xy0); + _mm256_store_si256((__m256i*)(ptr + 32), xy1); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 32), xy1); + } } -inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y ) +inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val); __m256i xy_h = 
_mm256_unpackhi_epi16(x.val, y.val); @@ -1921,11 +1958,25 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint1 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, xy0); - _mm256_storeu_si256((__m256i*)(ptr + 16), xy1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, xy0); + _mm256_stream_si256((__m256i*)(ptr + 16), xy1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, xy0); + _mm256_store_si256((__m256i*)(ptr + 16), xy1); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 16), xy1); + } } -inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y ) +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val); __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val); @@ -1933,11 +1984,25 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, xy0); - _mm256_storeu_si256((__m256i*)(ptr + 8), xy1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, xy0); + _mm256_stream_si256((__m256i*)(ptr + 8), xy1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, xy0); + _mm256_store_si256((__m256i*)(ptr + 8), xy1); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 8), xy1); + } } -inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y ) +inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val); __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val); @@ -1945,19 +2010,33 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, xy0); - _mm256_storeu_si256((__m256i*)(ptr + 4), xy1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, xy0); + _mm256_stream_si256((__m256i*)(ptr + 4), xy1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, xy0); + _mm256_store_si256((__m256i*)(ptr + 4), xy1); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 4), xy1); + } } -inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r ) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { - static const __m256i sh_b = _mm256_setr_epi8( + const __m256i sh_b = _mm256_setr_epi8( 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); - static const __m256i sh_g = _mm256_setr_epi8( + const __m256i sh_g = _mm256_setr_epi8( 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); - static const __m256i sh_r = _mm256_setr_epi8( + 
const __m256i sh_r = _mm256_setr_epi8( 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); @@ -1965,9 +2044,9 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x3 __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g); __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r); - static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); - static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); @@ -1978,20 +2057,36 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x3 __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16); __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgr0); - _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1); - _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 32), bgr1); + _mm256_stream_si256((__m256i*)(ptr + 64), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgr0); + _mm256_store_si256((__m256i*)(ptr + 32), bgr1); + _mm256_store_si256((__m256i*)(ptr + 64), bgr2); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1); + _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2); + } } -inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r ) +inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { - static const __m256i sh_b = _mm256_setr_epi8( + const __m256i sh_b = _mm256_setr_epi8( 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); - static const __m256i sh_g = _mm256_setr_epi8( + const __m256i sh_g = _mm256_setr_epi8( 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); - static const __m256i sh_r = _mm256_setr_epi8( + const __m256i sh_r = _mm256_setr_epi8( 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); @@ -1999,9 +2094,9 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint1 __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g); __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r); - static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); - static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); @@ -2012,12 +2107,28 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint1 
//__m256i bgr1 = p1; __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgr0); - _mm256_storeu_si256((__m256i*)(ptr + 16), p1); - _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 16), p1); + _mm256_stream_si256((__m256i*)(ptr + 32), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgr0); + _mm256_store_si256((__m256i*)(ptr + 16), p1); + _mm256_store_si256((__m256i*)(ptr + 32), bgr2); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 16), p1); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2); + } } -inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r ) +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c); __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1); @@ -2031,12 +2142,28 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint //__m256i bgr1 = p2; __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgr0); - _mm256_storeu_si256((__m256i*)(ptr + 8), p2); - _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 8), p2); + _mm256_stream_si256((__m256i*)(ptr + 16), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgr0); + _mm256_store_si256((__m256i*)(ptr + 8), p2); + _mm256_store_si256((__m256i*)(ptr + 16), bgr2); + } + else + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 8), p2); + _mm256_stream_si256((__m256i*)(ptr + 16), bgr2); + } } -inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r ) +inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val); __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val); @@ -2046,12 +2173,29 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64 __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f); __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgr0); - _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1); - _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 4), bgr1); + _mm256_stream_si256((__m256i*)(ptr + 8), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgr0); + _mm256_store_si256((__m256i*)(ptr + 4), bgr1); + _mm256_store_si256((__m256i*)(ptr + 8), bgr2); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2); + } } -inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a ) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, + const v_uint8x32& r, const v_uint8x32& a, + 
hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val); __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val); @@ -2068,14 +2212,32 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x3 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgra0); - _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1); - _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2); - _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgra0); + _mm256_stream_si256((__m256i*)(ptr + 32), bgra1); + _mm256_stream_si256((__m256i*)(ptr + 64), bgra2); + _mm256_stream_si256((__m256i*)(ptr + 96), bgra3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgra0); + _mm256_store_si256((__m256i*)(ptr + 32), bgra1); + _mm256_store_si256((__m256i*)(ptr + 64), bgra2); + _mm256_store_si256((__m256i*)(ptr + 96), bgra3); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3); + } } inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, - const v_uint16x16& r, const v_uint16x16& a ) + const v_uint16x16& r, const v_uint16x16& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val); __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val); @@ -2092,14 +2254,32 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint1 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgra0); - _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1); - _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2); - _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgra0); + _mm256_stream_si256((__m256i*)(ptr + 16), bgra1); + _mm256_stream_si256((__m256i*)(ptr + 32), bgra2); + _mm256_stream_si256((__m256i*)(ptr + 48), bgra3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgra0); + _mm256_store_si256((__m256i*)(ptr + 16), bgra1); + _mm256_store_si256((__m256i*)(ptr + 32), bgra2); + _mm256_store_si256((__m256i*)(ptr + 48), bgra3); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3); + } } inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, - const v_uint32x8& r, const v_uint32x8& a ) + const v_uint32x8& r, const v_uint32x8& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val); __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val); @@ -2116,14 +2296,32 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgra0); - _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1); - _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2); - _mm256_storeu_si256((__m256i*)(ptr + 
24), bgra3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgra0); + _mm256_stream_si256((__m256i*)(ptr + 8), bgra1); + _mm256_stream_si256((__m256i*)(ptr + 16), bgra2); + _mm256_stream_si256((__m256i*)(ptr + 24), bgra3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgra0); + _mm256_store_si256((__m256i*)(ptr + 8), bgra1); + _mm256_store_si256((__m256i*)(ptr + 16), bgra2); + _mm256_store_si256((__m256i*)(ptr + 24), bgra3); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3); + } } inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, - const v_uint64x4& r, const v_uint64x4& a ) + const v_uint64x4& r, const v_uint64x4& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val); __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val); @@ -2135,10 +2333,27 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64 __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16); __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgra0); - _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1); - _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2); - _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgra0); + _mm256_stream_si256((__m256i*)(ptr + 4), bgra1); + _mm256_stream_si256((__m256i*)(ptr + 8), bgra2); + _mm256_stream_si256((__m256i*)(ptr + 12), bgra3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgra0); + _mm256_store_si256((__m256i*)(ptr + 4), bgra1); + _mm256_store_si256((__m256i*)(ptr + 8), bgra2); + _mm256_store_si256((__m256i*)(ptr + 12), bgra3); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3); + } } #define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ @@ -2166,27 +2381,30 @@ inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpv c0 = v_reinterpret_as_##suffix0(c1); \ d0 = v_reinterpret_as_##suffix0(d1); \ } \ -inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ - v_store_interleave((_Tp1*)ptr, a1, b1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ } \ -inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ - v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ } \ inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ - const _Tpvec0& c0, const _Tpvec0& d0 ) \ + const _Tpvec0& c0, const _Tpvec0& 
d0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ - v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ } OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 1f5f531..61d58db 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1319,7 +1319,8 @@ Scheme: For all types except 64-bit. */ template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b) + const v_reg<_Tp, n>& b, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) { int i, i2; for( i = i2 = 0; i < n; i++, i2 += 2 ) @@ -1339,7 +1340,8 @@ Scheme: For all types except 64-bit. */ template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) + const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) { int i, i3; for( i = i3 = 0; i < n; i++, i3 += 3 ) @@ -1360,7 +1362,8 @@ Scheme: For all types except 64-bit. */ template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - const v_reg<_Tp, n>& d) + const v_reg<_Tp, n>& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) { int i, i4; for( i = i4 = 0; i < n; i++, i4 += 4 ) @@ -1430,6 +1433,20 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) ptr[i] = a.s[i]; } +template +inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + +template +inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) +{ + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + /** @brief Combine vector from first elements of two vectors Scheme: diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index d806730..b601e3e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -864,6 +864,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { vst1q_##suffix(ptr, a.val); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ { vst1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vst1q_##suffix(ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vst1q_##suffix(ptr, a.val); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -1292,14 +1296,16 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ c.val = v.val[2]; \ d.val = v.val[3]; \ } \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ _Tpvec##x2_t v; \ v.val[0] = a.val; \ v.val[1] = b.val; \ vst2q_##suffix(ptr, v); \ } \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \ 
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ _Tpvec##x3_t v; \ v.val[0] = a.val; \ @@ -1308,7 +1314,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& vst3q_##suffix(ptr, v); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ - const v_##_Tpvec& c, const v_##_Tpvec& d) \ + const v_##_Tpvec& c, const v_##_Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ { \ _Tpvec##x4_t v; \ v.val[0] = a.val; \ @@ -1360,7 +1367,8 @@ inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \ d = v_##tp##x2(vcombine_##suffix(d0, d1)); \ } \ \ -inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \ +inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ @@ -1369,7 +1377,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& } \ \ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \ - const v_##tp##x2& b, const v_##tp##x2& c ) \ + const v_##tp##x2& b, const v_##tp##x2& c, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ @@ -1380,7 +1389,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \ } \ \ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \ - const v_##tp##x2& c, const v_##tp##x2& d ) \ + const v_##tp##x2& c, const v_##tp##x2& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 4971c77..6e07940 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -788,7 +788,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x) inline v_float32x4 v_invsqrt(const v_float32x4& x) { - static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); + const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); __m128 t = x.val; __m128 h = _mm_mul_ps(t, _0_5); t = _mm_rsqrt_ps(t); @@ -801,7 +801,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x) inline v_float64x2 v_invsqrt(const v_float64x2& x) { - static const __m128d v_1 = _mm_set1_pd(1.); + const __m128d v_1 = _mm_set1_pd(1.); return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); } @@ -1261,6 +1261,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { _mm_storeu_si128((__m128i*)ptr, a.val); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ { _mm_store_si128((__m128i*)ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ _mm_stream_si128((__m128i*)ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ +{ \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm_storeu_si128((__m128i*)ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm_stream_si128((__m128i*)ptr, a.val); \ + else \ + _mm_store_si128((__m128i*)ptr, a.val); \ +} \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { _mm_storel_epi64((__m128i*)ptr, a.val); } \ inline void 
v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -1292,6 +1303,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { _mm_storeu_##suffix(ptr, a.val); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ { _mm_store_##suffix(ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ _mm_stream_##suffix(ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ +{ \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm_storeu_##suffix(ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm_stream_##suffix(ptr, a.val); \ + else \ + _mm_store_##suffix(ptr, a.val); \ +} \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -1671,17 +1693,17 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) { #if CV_SSE4_1 - static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); - static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); __m128i s0 = _mm_loadu_si128((const __m128i*)ptr); __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1); __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1); __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1); - static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13); - static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14); - static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13); + const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14); + const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); a0 = _mm_shuffle_epi8(a0, sh_b); b0 = _mm_shuffle_epi8(b0, sh_g); c0 = _mm_shuffle_epi8(c0, sh_r); @@ -1689,9 +1711,9 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, b.val = b0; c.val = c0; #elif CV_SSSE3 - static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14); - static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); - static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6); + const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14); + const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); + const __m128i m2 = _mm_alignr_epi8(m0, m0, 6); __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); @@ -1784,9 +1806,9 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24); __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24); - static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); - static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); - static const 
__m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); a0 = _mm_shuffle_epi8(a0, sh_a); b0 = _mm_shuffle_epi8(b0, sh_b); c0 = _mm_shuffle_epi8(c0, sh_c); @@ -1955,55 +1977,61 @@ inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, // store interleave -inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b) +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { __m128i v0 = _mm_unpacklo_epi8(a.val, b.val); __m128i v1 = _mm_unpackhi_epi8(a.val, b.val); - _mm_storeu_si128((__m128i*)(ptr), v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 16), v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 16), v1); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + } } inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, - const v_uint8x16& c ) + const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED) { #if CV_SSE4_1 - static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); - static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); - static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); - static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); - static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0); __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0); __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0); - - _mm_storeu_si128((__m128i*)(ptr), v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); - _mm_storeu_si128((__m128i*)(ptr + 32), v2); #elif CV_SSSE3 - static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5); - static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10); - static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15); + const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5); + const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10); + const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15); __m128i t0 = 
_mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5); t0 = _mm_alignr_epi8(c.val, t0, 5); - __m128i s0 = _mm_shuffle_epi8(t0, m0); + __m128i v0 = _mm_shuffle_epi8(t0, m0); __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6); t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5); - __m128i s1 = _mm_shuffle_epi8(t1, m1); + __m128i v1 = _mm_shuffle_epi8(t1, m1); __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11); t2 = _mm_alignr_epi8(t2, a.val, 11); - __m128i s2 = _mm_shuffle_epi8(t2, m2); - - _mm_storeu_si128((__m128i*)ptr, s0); - _mm_storeu_si128((__m128i*)(ptr + 16), s1); - _mm_storeu_si128((__m128i*)(ptr + 32), s2); + __m128i v2 = _mm_shuffle_epi8(t2, m2); #else __m128i z = _mm_setzero_si128(); __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val); @@ -2042,15 +2070,31 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10)); __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6)); __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2)); - - _mm_storeu_si128((__m128i*)(ptr), v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); - _mm_storeu_si128((__m128i*)(ptr + 32), v2); #endif + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 16), v1); + _mm_stream_si128((__m128i*)(ptr + 32), v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 16), v1); + _mm_store_si128((__m128i*)(ptr + 32), v2); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 32), v2); + } } inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, - const v_uint8x16& c, const v_uint8x16& d) + const v_uint8x16& c, const v_uint8x16& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { // a0 a1 a2 a3 .... // b0 b1 b2 b3 .... @@ -2062,33 +2106,64 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ... __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ... - __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ... - __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ... + __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ... + __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ... __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ... 
- _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v2); - _mm_storeu_si128((__m128i*)(ptr + 32), v1); - _mm_storeu_si128((__m128i*)(ptr + 48), v3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 16), v1); + _mm_stream_si128((__m128i*)(ptr + 32), v2); + _mm_stream_si128((__m128i*)(ptr + 48), v3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 16), v1); + _mm_store_si128((__m128i*)(ptr + 32), v2); + _mm_store_si128((__m128i*)(ptr + 48), v3); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 32), v2); + _mm_storeu_si128((__m128i*)(ptr + 48), v3); + } } -inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b ) +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0, t1; - t0 = _mm_unpacklo_epi16(a.val, b.val); - t1 = _mm_unpackhi_epi16(a.val, b.val); - _mm_storeu_si128((__m128i*)(ptr), t0); - _mm_storeu_si128((__m128i*)(ptr + 8), t1); + __m128i v0 = _mm_unpacklo_epi16(a.val, b.val); + __m128i v1 = _mm_unpackhi_epi16(a.val, b.val); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 8), v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 8), v1); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + } } inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, - const v_uint16x8& b, - const v_uint16x8& c ) + const v_uint16x8& b, const v_uint16x8& c, + hal::StoreMode mode = hal::STORE_UNALIGNED) { #if CV_SSE4_1 - static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); - static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); - static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); @@ -2096,10 +2171,6 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24); __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24); __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24); - - _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 8), v1); - _mm_storeu_si128((__m128i*)(ptr + 16), v2); #else __m128i z = _mm_setzero_si128(); __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val); @@ -2128,15 +2199,30 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10)); __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6)); __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2)); - - _mm_storeu_si128((__m128i*)(ptr), v0); - _mm_storeu_si128((__m128i*)(ptr + 8), 
v1); - _mm_storeu_si128((__m128i*)(ptr + 16), v2); #endif + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 8), v1); + _mm_stream_si128((__m128i*)(ptr + 16), v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 8), v1); + _mm_store_si128((__m128i*)(ptr + 16), v2); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); + } } inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, - const v_uint16x8& c, const v_uint16x8& d) + const v_uint16x8& c, const v_uint16x8& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { // a0 a1 a2 a3 .... // b0 b1 b2 b3 .... @@ -2148,27 +2234,58 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16 __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ... __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ... - __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ... - __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ... + __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ... + __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ... __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ... - _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 8), v2); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); - _mm_storeu_si128((__m128i*)(ptr + 24), v3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 8), v1); + _mm_stream_si128((__m128i*)(ptr + 16), v2); + _mm_stream_si128((__m128i*)(ptr + 24), v3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 8), v1); + _mm_store_si128((__m128i*)(ptr + 16), v2); + _mm_store_si128((__m128i*)(ptr + 24), v3); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); + _mm_storeu_si128((__m128i*)(ptr + 24), v3); + } } -inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b ) +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0 = _mm_unpacklo_epi32(a.val, b.val); - __m128i t1 = _mm_unpackhi_epi32(a.val, b.val); + __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); + __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); - _mm_storeu_si128((__m128i*)ptr, t0); - _mm_storeu_si128((__m128i*)(ptr + 4), t1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 4), v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 4), v1); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 4), v1); + } } inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, - const v_uint32x4& c ) + const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED) { v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3; v_transpose4x4(a, b, c, z, u0, u1, u2, u3); @@ -2177,35 +2294,82 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8)); __m128i v2 = 
_mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4)); - _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 4), v1); - _mm_storeu_si128((__m128i*)(ptr + 8), v2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 4), v1); + _mm_stream_si128((__m128i*)(ptr + 8), v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 4), v1); + _mm_store_si128((__m128i*)(ptr + 8), v2); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 4), v1); + _mm_storeu_si128((__m128i*)(ptr + 8), v2); + } } inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, - const v_uint32x4& c, const v_uint32x4& d) + const v_uint32x4& c, const v_uint32x4& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - v_uint32x4 t0, t1, t2, t3; - v_transpose4x4(a, b, c, d, t0, t1, t2, t3); - v_store(ptr, t0); - v_store(ptr + 4, t1); - v_store(ptr + 8, t2); - v_store(ptr + 12, t3); + v_uint32x4 v0, v1, v2, v3; + v_transpose4x4(a, b, c, d, v0, v1, v2, v3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0.val); + _mm_stream_si128((__m128i*)(ptr + 4), v1.val); + _mm_stream_si128((__m128i*)(ptr + 8), v2.val); + _mm_stream_si128((__m128i*)(ptr + 12), v3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0.val); + _mm_store_si128((__m128i*)(ptr + 4), v1.val); + _mm_store_si128((__m128i*)(ptr + 8), v2.val); + _mm_store_si128((__m128i*)(ptr + 12), v3.val); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0.val); + _mm_storeu_si128((__m128i*)(ptr + 4), v1.val); + _mm_storeu_si128((__m128i*)(ptr + 8), v2.val); + _mm_storeu_si128((__m128i*)(ptr + 12), v3.val); + } } // 2-channel, float only -inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b) +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - // a0 a1 a2 a3 ... - // b0 b1 b2 b3 ... 
- __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1 - __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3 + __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1 + __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3 - _mm_storeu_ps(ptr, u0); - _mm_storeu_ps((ptr + 4), u1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_ps(ptr, v0); + _mm_stream_ps(ptr + 4, v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_ps(ptr, v0); + _mm_store_ps(ptr + 4, v1); + } + else + { + _mm_storeu_ps(ptr, v0); + _mm_storeu_ps(ptr + 4, v1); + } } -inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED) { __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0)); __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0)); @@ -2217,13 +2381,29 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32 __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3)); __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0)); - _mm_storeu_ps(ptr + 0, v0); - _mm_storeu_ps(ptr + 4, v1); - _mm_storeu_ps(ptr + 8, v2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_ps(ptr, v0); + _mm_stream_ps(ptr + 4, v1); + _mm_stream_ps(ptr + 8, v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_ps(ptr, v0); + _mm_store_ps(ptr + 4, v1); + _mm_store_ps(ptr + 8, v2); + } + else + { + _mm_storeu_ps(ptr, v0); + _mm_storeu_ps(ptr + 4, v1); + _mm_storeu_ps(ptr + 8, v2); + } } inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, - const v_float32x4& c, const v_float32x4& d) + const v_float32x4& c, const v_float32x4& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { __m128 u0 = _mm_unpacklo_ps(a.val, c.val); __m128 u1 = _mm_unpacklo_ps(b.val, d.val); @@ -2234,43 +2414,109 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32 __m128 v1 = _mm_unpackhi_ps(u0, u1); __m128 v3 = _mm_unpackhi_ps(u2, u3); - _mm_storeu_ps(ptr + 0, v0); - _mm_storeu_ps(ptr + 4, v1); - _mm_storeu_ps(ptr + 8, v2); - _mm_storeu_ps(ptr + 12, v3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_ps(ptr, v0); + _mm_stream_ps(ptr + 4, v1); + _mm_stream_ps(ptr + 8, v2); + _mm_stream_ps(ptr + 12, v3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_ps(ptr, v0); + _mm_store_ps(ptr + 4, v1); + _mm_store_ps(ptr + 8, v2); + _mm_store_ps(ptr + 12, v3); + } + else + { + _mm_storeu_ps(ptr, v0); + _mm_storeu_ps(ptr + 4, v1); + _mm_storeu_ps(ptr + 8, v2); + _mm_storeu_ps(ptr + 12, v3); + } } -inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b) +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); - __m128i t1 = _mm_unpackhi_epi64(a.val, b.val); + __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i v1 = _mm_unpackhi_epi64(a.val, b.val); - _mm_storeu_si128((__m128i*)ptr, t0); - _mm_storeu_si128((__m128i*)(ptr + 2), t1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 2), v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 2), v1); + } + else + { + 
_mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 2), v1); + } } -inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c) +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); - __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val)); - __m128i t2 = _mm_unpackhi_epi64(b.val, c.val); + __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val)); + __m128i v2 = _mm_unpackhi_epi64(b.val, c.val); - _mm_storeu_si128((__m128i*)ptr, t0); - _mm_storeu_si128((__m128i*)(ptr + 2), t1); - _mm_storeu_si128((__m128i*)(ptr + 4), t2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 2), v1); + _mm_stream_si128((__m128i*)(ptr + 4), v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 2), v1); + _mm_store_si128((__m128i*)(ptr + 4), v2); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 2), v1); + _mm_storeu_si128((__m128i*)(ptr + 4), v2); + } } -inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d) +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, const v_uint64x2& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); - __m128i t1 = _mm_unpacklo_epi64(c.val, d.val); - __m128i t2 = _mm_unpackhi_epi64(a.val, b.val); - __m128i t3 = _mm_unpackhi_epi64(c.val, d.val); + __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i v1 = _mm_unpacklo_epi64(c.val, d.val); + __m128i v2 = _mm_unpackhi_epi64(a.val, b.val); + __m128i v3 = _mm_unpackhi_epi64(c.val, d.val); - _mm_storeu_si128((__m128i*)ptr, t0); - _mm_storeu_si128((__m128i*)(ptr + 2), t1); - _mm_storeu_si128((__m128i*)(ptr + 4), t2); - _mm_storeu_si128((__m128i*)(ptr + 6), t3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 2), v1); + _mm_stream_si128((__m128i*)(ptr + 4), v2); + _mm_stream_si128((__m128i*)(ptr + 6), v3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 2), v1); + _mm_store_si128((__m128i*)(ptr + 4), v2); + _mm_store_si128((__m128i*)(ptr + 6), v3); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 2), v1); + _mm_storeu_si128((__m128i*)(ptr + 4), v2); + _mm_storeu_si128((__m128i*)(ptr + 6), v3); + } } #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ @@ -2298,27 +2544,30 @@ inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpv c0 = v_reinterpret_as_##suffix0(c1); \ d0 = v_reinterpret_as_##suffix0(d1); \ } \ -inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode = hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ - v_store_interleave((_Tp1*)ptr, a1, b1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ } \ 
-inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ - v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ } \ inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ - const _Tpvec0& c0, const _Tpvec0& d0 ) \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode mode = hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ - v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ } OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 9ad8234..52bc2cc 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -249,6 +249,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { st(a.val, 0, ptr); } \ inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \ { st_a(a.val, 0, ptr); } \ +inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \ +{ st_a(a.val, 0, ptr); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ +{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { vec_st_l8(a.val, ptr); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -281,13 +285,16 @@ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \ _Tpvec& c, _Tpvec& d) \ { vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \ -inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \ +inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { vec_st_interleave(a.val, b.val, ptr); } \ inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \ - const _Tpvec& b, const _Tpvec& c) \ + const _Tpvec& b, const _Tpvec& c, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { vec_st_interleave(a.val, b.val, c.val, ptr); } \ inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \ - const _Tpvec& c, const _Tpvec& d) \ + const _Tpvec& c, const _Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { vec_st_interleave(a.val, b.val, c.val, d.val, ptr); } OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16) diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index 354cc00..b158103 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -515,17 +515,17 @@ void exp32f( const float *_x, float *y, int n ) #if CV_SIMD const int VECSZ = v_float32::nlanes; - static const v_float32 vprescale = vx_setall_f32((float)exp_prescale); - static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); - static const v_float32 vminval = vx_setall_f32(minval); - 
static const v_float32 vmaxval = vx_setall_f32(maxval); + const v_float32 vprescale = vx_setall_f32((float)exp_prescale); + const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); + const v_float32 vminval = vx_setall_f32(minval); + const v_float32 vmaxval = vx_setall_f32(maxval); - static const v_float32 vA1 = vx_setall_f32((float)A1); - static const v_float32 vA2 = vx_setall_f32((float)A2); - static const v_float32 vA3 = vx_setall_f32((float)A3); - static const v_float32 vA4 = vx_setall_f32((float)A4); + const v_float32 vA1 = vx_setall_f32((float)A1); + const v_float32 vA2 = vx_setall_f32((float)A2); + const v_float32 vA3 = vx_setall_f32((float)A3); + const v_float32 vA4 = vx_setall_f32((float)A4); - static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); bool y_aligned = (size_t)(void*)y % 32 == 0; for( ; i < n; i += VECSZ*2 ) @@ -627,18 +627,18 @@ void exp64f( const double *_x, double *y, int n ) #if CV_SIMD_64F const int VECSZ = v_float64::nlanes; - static const v_float64 vprescale = vx_setall_f64(exp_prescale); - static const v_float64 vpostscale = vx_setall_f64(exp_postscale); - static const v_float64 vminval = vx_setall_f64(minval); - static const v_float64 vmaxval = vx_setall_f64(maxval); - - static const v_float64 vA1 = vx_setall_f64(A1); - static const v_float64 vA2 = vx_setall_f64(A2); - static const v_float64 vA3 = vx_setall_f64(A3); - static const v_float64 vA4 = vx_setall_f64(A4); - static const v_float64 vA5 = vx_setall_f64(A5); - - static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + const v_float64 vprescale = vx_setall_f64(exp_prescale); + const v_float64 vpostscale = vx_setall_f64(exp_postscale); + const v_float64 vminval = vx_setall_f64(minval); + const v_float64 vmaxval = vx_setall_f64(maxval); + + const v_float64 vA1 = vx_setall_f64(A1); + const v_float64 vA2 = vx_setall_f64(A2); + const v_float64 vA3 = vx_setall_f64(A3); + const v_float64 vA4 = vx_setall_f64(A4); + const v_float64 vA5 = vx_setall_f64(A5); + + const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); bool y_aligned = (size_t)(void*)y % 32 == 0; for( ; i < n; i += VECSZ*2 ) @@ -1024,13 +1024,13 @@ void log32f( const float *_x, float *y, int n ) #if CV_SIMD const int VECSZ = v_float32::nlanes; - static const v_float32 vln2 = vx_setall_f32((float)ln_2); - static const v_float32 v1 = vx_setall_f32(1.f); - static const v_float32 vshift = vx_setall_f32(-1.f/512); + const v_float32 vln2 = vx_setall_f32((float)ln_2); + const v_float32 v1 = vx_setall_f32(1.f); + const v_float32 vshift = vx_setall_f32(-1.f/512); - static const v_float32 vA0 = vx_setall_f32(A0); - static const v_float32 vA1 = vx_setall_f32(A1); - static const v_float32 vA2 = vx_setall_f32(A2); + const v_float32 vA0 = vx_setall_f32(A0); + const v_float32 vA1 = vx_setall_f32(A1); + const v_float32 vA2 = vx_setall_f32(A2); for( ; i < n; i += VECSZ ) { @@ -1097,9 +1097,9 @@ void log64f( const double *x, double *y, int n ) #if CV_SIMD_64F const int VECSZ = v_float64::nlanes; - static const v_float64 vln2 = vx_setall_f64(ln_2); + const v_float64 vln2 = vx_setall_f64(ln_2); - static const v_float64 + const v_float64 vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1), vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3), vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5), diff --git a/modules/core/src/merge.cpp b/modules/core/src/merge.cpp index a57d3bb..9c52f0e 100644 --- a/modules/core/src/merge.cpp +++ b/modules/core/src/merge.cpp @@ -9,21 +9,58 @@ namespace cv { namespace hal { #if 
CV_SIMD +/* + The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following: + on IA there are instructions movntps and such to which + v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped. + Those instructions write directly into memory w/o touching the cache, + which results in dramatic speed improvements, especially on + large arrays (FullHD, 4K etc.). + + Those intrinsics require the destination address to be aligned + by 16/32 bytes (with SSE2 and AVX2, respectively). + So we potentially split the processing into 3 stages: + 1) the optional prefix part [0:i0), where we use simple unaligned stores. + 2) the optional main part [i0:len - VECSZ], where we use "nocache" mode. + But in some cases we have to use unaligned stores in this part. + 3) the optional suffix part (the tail) (len - VECSZ:len), where we switch back to "unaligned" mode + to process the remaining (fewer than VECSZ) elements. + In principle there can be very poorly aligned data where there is no main part. + In that case we set i0=0 and use unaligned stores for the whole array. +*/ template<typename T, typename VecT> static void vecmerge_( const T** src, T* dst, int len, int cn ) { - int i; + const int VECSZ = VecT::nlanes; + int i, i0 = 0; const T* src0 = src[0]; const T* src1 = src[1]; - const int VECSZ = VecT::nlanes; + int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T))); + hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE; + if( r != 0 ) + { + mode = hal::STORE_UNALIGNED; + if( r % cn == 0 && len > VECSZ ) + i0 = VECSZ - (r / cn); + } + if( cn == 2 ) { for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a = vx_load(src0 + i), b = vx_load(src1 + i); - v_store_interleave(dst + i*cn, a, b); + v_store_interleave(dst + i*cn, a, b, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } else if( cn == 3 ) @@ -31,9 +68,18 @@ vecmerge_( const T** src, T* dst, int len, int cn ) const T* src2 = src[2]; for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i); - v_store_interleave(dst + i*cn, a, b, c); + v_store_interleave(dst + i*cn, a, b, c, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } else @@ -43,10 +89,19 @@ vecmerge_( const T** src, T* dst, int len, int cn ) const T* src3 = src[3]; for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a = vx_load(src0 + i), b = vx_load(src1 + i); VecT c = vx_load(src2 + i), d = vx_load(src3 + i); - v_store_interleave(dst + i*cn, a, b, c, d); + v_store_interleave(dst + i*cn, a, b, c, d, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } vx_cleanup(); diff --git a/modules/core/src/split.cpp b/modules/core/src/split.cpp index 6f7b61a..78d8daa 100644 --- a/modules/core/src/split.cpp +++ b/modules/core/src/split.cpp @@ -9,23 +9,46 @@ namespace cv { namespace hal { #if CV_SIMD +// see the comments for vecmerge_ in merge.cpp template<typename T, typename VecT> static void vecsplit_( const T* src, T** dst, int len, int cn ) { - int i; + const int VECSZ = VecT::nlanes; + int i, i0 = 0; T* dst0 = dst[0]; T* dst1 = dst[1]; - const int VECSZ = VecT::nlanes; + int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T))); + int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T))); +
int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0; + int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0; + + hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE; + if( (r0|r1|r2|r3) != 0 ) + { + mode = hal::STORE_UNALIGNED; + if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ ) + i0 = VECSZ - (r0 / cn); + } + if( cn == 2 ) { for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a, b; v_load_deinterleave(src + i*cn, a, b); - v_store(dst0 + i, a); - v_store(dst1 + i, b); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } else if( cn == 3 ) @@ -33,12 +56,21 @@ vecsplit_( const T* src, T** dst, int len, int cn ) T* dst2 = dst[2]; for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a, b, c; v_load_deinterleave(src + i*cn, a, b, c); - v_store(dst0 + i, a); - v_store(dst1 + i, b); - v_store(dst2 + i, c); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + v_store(dst2 + i, c, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } else @@ -48,13 +80,22 @@ vecsplit_( const T* src, T** dst, int len, int cn ) T* dst3 = dst[3]; for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a, b, c, d; v_load_deinterleave(src + i*cn, a, b, c, d); - v_store(dst0 + i, a); - v_store(dst1 + i, b); - v_store(dst2 + i, c); - v_store(dst3 + i, d); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + v_store(dst2 + i, c, mode); + v_store(dst3 + i, d, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } vx_cleanup();
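
The comment block added to merge.cpp above is the heart of the patch, so here is a minimal, self-contained sketch (not part of the patch) of the same prefix/main/tail idea applied to a plain single-channel float copy. The helper name copy_f32_nocache is hypothetical; the sketch only assumes the hal::StoreMode enum and the v_store(ptr, vec, mode) overload introduced by this patch, and it uses a deliberately conservative len > 2*VECSZ guard so that the prefix, the non-temporal main part and the tail never interact.

```cpp
#include <opencv2/core/hal/intrin.hpp>
#include <algorithm>

// Copy len floats from src to dst, using non-temporal ("no cache") stores for
// the aligned middle of the destination, in the spirit of vecmerge_/vecsplit_.
static void copy_f32_nocache(const float* src, float* dst, int len)
{
    using namespace cv;
#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    if( len >= VECSZ )
    {
        int i0 = 0;
        // byte offset of dst from the previous VECSZ*sizeof(float) boundary
        int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(float)));
        hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
        if( r != 0 )
        {
            mode = hal::STORE_UNALIGNED;             // prefix [0:i0) uses plain stores
            if( r % sizeof(float) == 0 && len > 2*VECSZ )
                i0 = VECSZ - (int)(r/sizeof(float)); // first element with an aligned address
        }
        for( int i = 0; i < len; i += VECSZ )
        {
            if( i > len - VECSZ )
            {
                i = len - VECSZ;                     // tail: redo the last full vector
                mode = hal::STORE_UNALIGNED;
            }
            v_store(dst + i, vx_load(src + i), mode);
            if( i < i0 )
            {
                i = i0 - VECSZ;                      // next iteration starts exactly at i0
                mode = hal::STORE_ALIGNED_NOCACHE;
            }
        }
        vx_cleanup();
        return;
    }
#endif
    std::copy(src, src + len, dst);                  // scalar fallback for short arrays
}
```

split() and merge() do essentially the same thing per channel, except that the stores are (de)interleaved and, in vecsplit_, the alignment remainder has to be identical for all destination pointers, which is what the r0|r1|r2|r3 check above verifies.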
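For completeness, a caller-side view of the extended interface: every v_store/v_store_interleave overload above now takes an optional hal::StoreMode that defaults to STORE_UNALIGNED, so existing callers compile unchanged, while STORE_ALIGNED_NOCACHE maps to the streaming stores (_mm_stream_ps/_mm_stream_si128 on SSE, _mm256_stream_* on AVX) and the VSX backend simply falls back to ordinary aligned stores. The snippet below is only an illustrative sketch, not code from the patch; the function name and buffer are made up, and the destination is over-aligned to 64 bytes so the example stays valid for any vector width.

```cpp
#include <opencv2/core/hal/intrin.hpp>

static void demo_store_modes()
{
    using namespace cv;
#if CV_SIMD
    // Streaming stores require the pointer to be aligned to the vector size
    // (16 bytes for SSE, 32 for AVX); 64 covers every currently supported width.
    float CV_DECL_ALIGNED(64) dst[2*v_float32::nlanes];

    v_float32 a = vx_setall_f32(1.f), b = vx_setall_f32(2.f);

    v_store_interleave(dst, a, b);                              // default: unaligned, old behaviour
    v_store_interleave(dst, a, b, hal::STORE_ALIGNED);          // aligned store
    v_store_interleave(dst, a, b, hal::STORE_ALIGNED_NOCACHE);  // non-temporal store, bypasses the cache

    vx_cleanup();
#endif
}
```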