From: Vadim Pisarevsky
Date: Thu, 26 Jul 2018 09:04:28 +0000 (+0300)
Subject: further improvements in split & merge; started using non-temporal store instructions...
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~597^2~22
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=43820d89b475dd32d11b441eaeef998dcd530752;p=platform%2Fupstream%2Fopencv.git

further improvements in split & merge; started using non-temporal store instructions (#12063)

* 1. changed static const __m128/256 to const __m128/256 to avoid weird instructions and calls inserted by the compiler.
  2. added universal intrinsics that wrap MOVNTPS and other such non-temporal ("no cache") store instructions; v_store_interleave() and v_store() got overloaded variants that take the respective store-mode flag.
  3. rewrote split & merge to use the "no cache" store instructions. This resulted in a dramatic performance improvement when processing big arrays.
* hopefully, fixed some test failures where 4-channel v_store_interleave() is used
* added the missing implementations of the new universal intrinsics (v_store_aligned_nocache() etc.)
* fixed a silly typo in the new intrinsics in intrin_vsx.hpp
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
* still trying to fix VSX compiler errors
---

diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 031f8f3..9569e61 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -60,6 +60,17 @@
 // access from within opencv code more accessible
 namespace cv {
 
+namespace hal {
+
+enum StoreMode
+{
+    STORE_UNALIGNED = 0,
+    STORE_ALIGNED = 1,
+    STORE_ALIGNED_NOCACHE = 2
+};
+
+}
+
 template<typename _Tp> struct V_TypeTraits
 {
 };
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 4ea66f5..5c2d0b6 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -304,6 +304,17 @@ inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1
     { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
     inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
     { _mm256_store_si256((__m256i*)ptr, a.val); } \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+    { _mm256_stream_si256((__m256i*)ptr, a.val); } \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm256_storeu_si256((__m256i*)ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+            _mm256_stream_si256((__m256i*)ptr, a.val); \
+        else \
+            _mm256_store_si256((__m256i*)ptr, a.val); \
+    } \
     inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
     { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
     inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
@@ -338,6 +349,17 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
     { _mm256_storeu_##suffix(ptr, a.val); } \
     inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
     { _mm256_store_##suffix(ptr, a.val); } \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+    { _mm256_stream_##suffix(ptr, a.val); } \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm256_storeu_##suffix(ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
+            _mm256_stream_##suffix(ptr, a.val); \
+        else \
+
_mm256_store_##suffix(ptr, a.val); \ + } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -1616,7 +1638,7 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); - static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); __m256i p0 = _mm256_shuffle_epi8(ab0, sh); __m256i p1 = _mm256_shuffle_epi8(ab1, sh); @@ -1633,7 +1655,7 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); - static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); __m256i p0 = _mm256_shuffle_epi8(ab0, sh); __m256i p1 = _mm256_shuffle_epi8(ab1, sh); @@ -1683,16 +1705,16 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); - static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); - static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1); __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0); __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1); - static const __m256i + const __m256i sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13), sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, @@ -1717,18 +1739,18 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); - static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); - static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1); __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0); - static const __m256i 
sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); - static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, + const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); - static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); b0 = _mm256_shuffle_epi8(b0, sh_b); g0 = _mm256_shuffle_epi8(g0, sh_g); @@ -1785,7 +1807,7 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64)); __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96)); - static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); __m256i p0 = _mm256_shuffle_epi8(bgr0, sh); @@ -1820,7 +1842,7 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48)); - static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); __m256i p0 = _mm256_shuffle_epi8(bgr0, sh); __m256i p1 = _mm256_shuffle_epi8(bgr1, sh); @@ -1901,7 +1923,8 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g ///////////////////////////// store interleave ///////////////////////////////////// -inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y ) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val); __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val); @@ -1909,11 +1932,25 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x3 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, xy0); - _mm256_storeu_si256((__m256i*)(ptr + 32), xy1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, xy0); + _mm256_stream_si256((__m256i*)(ptr + 32), xy1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, xy0); + _mm256_store_si256((__m256i*)(ptr + 32), xy1); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 32), xy1); + } } -inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y ) +inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val); __m256i xy_h = 
_mm256_unpackhi_epi16(x.val, y.val); @@ -1921,11 +1958,25 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint1 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, xy0); - _mm256_storeu_si256((__m256i*)(ptr + 16), xy1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, xy0); + _mm256_stream_si256((__m256i*)(ptr + 16), xy1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, xy0); + _mm256_store_si256((__m256i*)(ptr + 16), xy1); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 16), xy1); + } } -inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y ) +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val); __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val); @@ -1933,11 +1984,25 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, xy0); - _mm256_storeu_si256((__m256i*)(ptr + 8), xy1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, xy0); + _mm256_stream_si256((__m256i*)(ptr + 8), xy1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, xy0); + _mm256_store_si256((__m256i*)(ptr + 8), xy1); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 8), xy1); + } } -inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y ) +inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val); __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val); @@ -1945,19 +2010,33 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, xy0); - _mm256_storeu_si256((__m256i*)(ptr + 4), xy1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, xy0); + _mm256_stream_si256((__m256i*)(ptr + 4), xy1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, xy0); + _mm256_store_si256((__m256i*)(ptr + 4), xy1); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 4), xy1); + } } -inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r ) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { - static const __m256i sh_b = _mm256_setr_epi8( + const __m256i sh_b = _mm256_setr_epi8( 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); - static const __m256i sh_g = _mm256_setr_epi8( + const __m256i sh_g = _mm256_setr_epi8( 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); - static const __m256i sh_r = _mm256_setr_epi8( + 
const __m256i sh_r = _mm256_setr_epi8( 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); @@ -1965,9 +2044,9 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x3 __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g); __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r); - static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); - static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); @@ -1978,20 +2057,36 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x3 __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16); __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgr0); - _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1); - _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 32), bgr1); + _mm256_stream_si256((__m256i*)(ptr + 64), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgr0); + _mm256_store_si256((__m256i*)(ptr + 32), bgr1); + _mm256_store_si256((__m256i*)(ptr + 64), bgr2); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1); + _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2); + } } -inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r ) +inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { - static const __m256i sh_b = _mm256_setr_epi8( + const __m256i sh_b = _mm256_setr_epi8( 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); - static const __m256i sh_g = _mm256_setr_epi8( + const __m256i sh_g = _mm256_setr_epi8( 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); - static const __m256i sh_r = _mm256_setr_epi8( + const __m256i sh_r = _mm256_setr_epi8( 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); @@ -1999,9 +2094,9 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint1 __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g); __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r); - static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); - static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); @@ -2012,12 +2107,28 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint1 
//__m256i bgr1 = p1; __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgr0); - _mm256_storeu_si256((__m256i*)(ptr + 16), p1); - _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 16), p1); + _mm256_stream_si256((__m256i*)(ptr + 32), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgr0); + _mm256_store_si256((__m256i*)(ptr + 16), p1); + _mm256_store_si256((__m256i*)(ptr + 32), bgr2); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 16), p1); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2); + } } -inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r ) +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c); __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1); @@ -2031,12 +2142,28 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint //__m256i bgr1 = p2; __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgr0); - _mm256_storeu_si256((__m256i*)(ptr + 8), p2); - _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 8), p2); + _mm256_stream_si256((__m256i*)(ptr + 16), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgr0); + _mm256_store_si256((__m256i*)(ptr + 8), p2); + _mm256_store_si256((__m256i*)(ptr + 16), bgr2); + } + else + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 8), p2); + _mm256_stream_si256((__m256i*)(ptr + 16), bgr2); + } } -inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r ) +inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val); __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val); @@ -2046,12 +2173,29 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64 __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f); __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgr0); - _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1); - _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgr0); + _mm256_stream_si256((__m256i*)(ptr + 4), bgr1); + _mm256_stream_si256((__m256i*)(ptr + 8), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgr0); + _mm256_store_si256((__m256i*)(ptr + 4), bgr1); + _mm256_store_si256((__m256i*)(ptr + 8), bgr2); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2); + } } -inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a ) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, + const v_uint8x32& r, const v_uint8x32& a, + 
hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val); __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val); @@ -2068,14 +2212,32 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x3 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgra0); - _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1); - _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2); - _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgra0); + _mm256_stream_si256((__m256i*)(ptr + 32), bgra1); + _mm256_stream_si256((__m256i*)(ptr + 64), bgra2); + _mm256_stream_si256((__m256i*)(ptr + 96), bgra3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgra0); + _mm256_store_si256((__m256i*)(ptr + 32), bgra1); + _mm256_store_si256((__m256i*)(ptr + 64), bgra2); + _mm256_store_si256((__m256i*)(ptr + 96), bgra3); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3); + } } inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, - const v_uint16x16& r, const v_uint16x16& a ) + const v_uint16x16& r, const v_uint16x16& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val); __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val); @@ -2092,14 +2254,32 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint1 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgra0); - _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1); - _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2); - _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgra0); + _mm256_stream_si256((__m256i*)(ptr + 16), bgra1); + _mm256_stream_si256((__m256i*)(ptr + 32), bgra2); + _mm256_stream_si256((__m256i*)(ptr + 48), bgra3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgra0); + _mm256_store_si256((__m256i*)(ptr + 16), bgra1); + _mm256_store_si256((__m256i*)(ptr + 32), bgra2); + _mm256_store_si256((__m256i*)(ptr + 48), bgra3); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3); + } } inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, - const v_uint32x8& r, const v_uint32x8& a ) + const v_uint32x8& r, const v_uint32x8& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val); __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val); @@ -2116,14 +2296,32 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgra0); - _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1); - _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2); - _mm256_storeu_si256((__m256i*)(ptr + 
24), bgra3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgra0); + _mm256_stream_si256((__m256i*)(ptr + 8), bgra1); + _mm256_stream_si256((__m256i*)(ptr + 16), bgra2); + _mm256_stream_si256((__m256i*)(ptr + 24), bgra3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgra0); + _mm256_store_si256((__m256i*)(ptr + 8), bgra1); + _mm256_store_si256((__m256i*)(ptr + 16), bgra2); + _mm256_store_si256((__m256i*)(ptr + 24), bgra3); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3); + } } inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, - const v_uint64x4& r, const v_uint64x4& a ) + const v_uint64x4& r, const v_uint64x4& a, + hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val); __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val); @@ -2135,10 +2333,27 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64 __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16); __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16); - _mm256_storeu_si256((__m256i*)ptr, bgra0); - _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1); - _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2); - _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm256_stream_si256((__m256i*)ptr, bgra0); + _mm256_stream_si256((__m256i*)(ptr + 4), bgra1); + _mm256_stream_si256((__m256i*)(ptr + 8), bgra2); + _mm256_stream_si256((__m256i*)(ptr + 12), bgra3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm256_store_si256((__m256i*)ptr, bgra0); + _mm256_store_si256((__m256i*)(ptr + 4), bgra1); + _mm256_store_si256((__m256i*)(ptr + 8), bgra2); + _mm256_store_si256((__m256i*)(ptr + 12), bgra3); + } + else + { + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3); + } } #define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ @@ -2166,27 +2381,30 @@ inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpv c0 = v_reinterpret_as_##suffix0(c1); \ d0 = v_reinterpret_as_##suffix0(d1); \ } \ -inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ - v_store_interleave((_Tp1*)ptr, a1, b1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ } \ -inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ - v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ } \ inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ - const _Tpvec0& c0, const _Tpvec0& d0 ) \ + const _Tpvec0& c0, const _Tpvec0& 
d0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ - v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ } OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 1f5f531..61d58db 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1319,7 +1319,8 @@ Scheme: For all types except 64-bit. */ template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b) + const v_reg<_Tp, n>& b, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) { int i, i2; for( i = i2 = 0; i < n; i++, i2 += 2 ) @@ -1339,7 +1340,8 @@ Scheme: For all types except 64-bit. */ template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, - const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) + const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) { int i, i3; for( i = i3 = 0; i < n; i++, i3 += 3 ) @@ -1360,7 +1362,8 @@ Scheme: For all types except 64-bit. */ template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, - const v_reg<_Tp, n>& d) + const v_reg<_Tp, n>& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) { int i, i4; for( i = i4 = 0; i < n; i++, i4 += 4 ) @@ -1430,6 +1433,20 @@ inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) ptr[i] = a.s[i]; } +template +inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + +template +inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) +{ + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + /** @brief Combine vector from first elements of two vectors Scheme: diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index d806730..b601e3e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -864,6 +864,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { vst1q_##suffix(ptr, a.val); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ { vst1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vst1q_##suffix(ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vst1q_##suffix(ptr, a.val); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -1292,14 +1296,16 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ c.val = v.val[2]; \ d.val = v.val[3]; \ } \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ _Tpvec##x2_t v; \ v.val[0] = a.val; \ v.val[1] = b.val; \ vst2q_##suffix(ptr, v); \ } \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \ 
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ _Tpvec##x3_t v; \ v.val[0] = a.val; \ @@ -1308,7 +1314,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& vst3q_##suffix(ptr, v); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ - const v_##_Tpvec& c, const v_##_Tpvec& d) \ + const v_##_Tpvec& c, const v_##_Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ { \ _Tpvec##x4_t v; \ v.val[0] = a.val; \ @@ -1360,7 +1367,8 @@ inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \ d = v_##tp##x2(vcombine_##suffix(d0, d1)); \ } \ \ -inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \ +inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ @@ -1369,7 +1377,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& } \ \ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \ - const v_##tp##x2& b, const v_##tp##x2& c ) \ + const v_##tp##x2& b, const v_##tp##x2& c, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ @@ -1380,7 +1389,8 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \ } \ \ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \ - const v_##tp##x2& c, const v_##tp##x2& d ) \ + const v_##tp##x2& c, const v_##tp##x2& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 4971c77..6e07940 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -788,7 +788,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x) inline v_float32x4 v_invsqrt(const v_float32x4& x) { - static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); + const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); __m128 t = x.val; __m128 h = _mm_mul_ps(t, _0_5); t = _mm_rsqrt_ps(t); @@ -801,7 +801,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x) inline v_float64x2 v_invsqrt(const v_float64x2& x) { - static const __m128d v_1 = _mm_set1_pd(1.); + const __m128d v_1 = _mm_set1_pd(1.); return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); } @@ -1261,6 +1261,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { _mm_storeu_si128((__m128i*)ptr, a.val); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ { _mm_store_si128((__m128i*)ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ _mm_stream_si128((__m128i*)ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ +{ \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm_storeu_si128((__m128i*)ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm_stream_si128((__m128i*)ptr, a.val); \ + else \ + _mm_store_si128((__m128i*)ptr, a.val); \ +} \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { _mm_storel_epi64((__m128i*)ptr, a.val); } \ inline void 
v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -1292,6 +1303,17 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { _mm_storeu_##suffix(ptr, a.val); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ { _mm_store_##suffix(ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ _mm_stream_##suffix(ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ +{ \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm_storeu_##suffix(ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm_stream_##suffix(ptr, a.val); \ + else \ + _mm_store_##suffix(ptr, a.val); \ +} \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -1671,17 +1693,17 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) { #if CV_SSE4_1 - static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); - static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); __m128i s0 = _mm_loadu_si128((const __m128i*)ptr); __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1); __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1); __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1); - static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13); - static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14); - static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13); + const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14); + const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); a0 = _mm_shuffle_epi8(a0, sh_b); b0 = _mm_shuffle_epi8(b0, sh_g); c0 = _mm_shuffle_epi8(c0, sh_r); @@ -1689,9 +1711,9 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, b.val = b0; c.val = c0; #elif CV_SSSE3 - static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14); - static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); - static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6); + const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14); + const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); + const __m128i m2 = _mm_alignr_epi8(m0, m0, 6); __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); @@ -1784,9 +1806,9 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24); __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24); - static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); - static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); - static const 
__m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); a0 = _mm_shuffle_epi8(a0, sh_a); b0 = _mm_shuffle_epi8(b0, sh_b); c0 = _mm_shuffle_epi8(c0, sh_c); @@ -1955,55 +1977,61 @@ inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, // store interleave -inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b) +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { __m128i v0 = _mm_unpacklo_epi8(a.val, b.val); __m128i v1 = _mm_unpackhi_epi8(a.val, b.val); - _mm_storeu_si128((__m128i*)(ptr), v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 16), v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 16), v1); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + } } inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, - const v_uint8x16& c ) + const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED) { #if CV_SSE4_1 - static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); - static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); - static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); - static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); - static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0); __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0); __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0); - - _mm_storeu_si128((__m128i*)(ptr), v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); - _mm_storeu_si128((__m128i*)(ptr + 32), v2); #elif CV_SSSE3 - static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5); - static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10); - static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15); + const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5); + const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10); + const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15); __m128i t0 = 
_mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5); t0 = _mm_alignr_epi8(c.val, t0, 5); - __m128i s0 = _mm_shuffle_epi8(t0, m0); + __m128i v0 = _mm_shuffle_epi8(t0, m0); __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6); t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5); - __m128i s1 = _mm_shuffle_epi8(t1, m1); + __m128i v1 = _mm_shuffle_epi8(t1, m1); __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11); t2 = _mm_alignr_epi8(t2, a.val, 11); - __m128i s2 = _mm_shuffle_epi8(t2, m2); - - _mm_storeu_si128((__m128i*)ptr, s0); - _mm_storeu_si128((__m128i*)(ptr + 16), s1); - _mm_storeu_si128((__m128i*)(ptr + 32), s2); + __m128i v2 = _mm_shuffle_epi8(t2, m2); #else __m128i z = _mm_setzero_si128(); __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val); @@ -2042,15 +2070,31 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10)); __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6)); __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2)); - - _mm_storeu_si128((__m128i*)(ptr), v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); - _mm_storeu_si128((__m128i*)(ptr + 32), v2); #endif + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 16), v1); + _mm_stream_si128((__m128i*)(ptr + 32), v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 16), v1); + _mm_store_si128((__m128i*)(ptr + 32), v2); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 32), v2); + } } inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, - const v_uint8x16& c, const v_uint8x16& d) + const v_uint8x16& c, const v_uint8x16& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { // a0 a1 a2 a3 .... // b0 b1 b2 b3 .... @@ -2062,33 +2106,64 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ... __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ... - __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ... - __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ... + __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ... + __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ... __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ... 
- _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 16), v2); - _mm_storeu_si128((__m128i*)(ptr + 32), v1); - _mm_storeu_si128((__m128i*)(ptr + 48), v3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 16), v1); + _mm_stream_si128((__m128i*)(ptr + 32), v2); + _mm_stream_si128((__m128i*)(ptr + 48), v3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 16), v1); + _mm_store_si128((__m128i*)(ptr + 32), v2); + _mm_store_si128((__m128i*)(ptr + 48), v3); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 32), v2); + _mm_storeu_si128((__m128i*)(ptr + 48), v3); + } } -inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b ) +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0, t1; - t0 = _mm_unpacklo_epi16(a.val, b.val); - t1 = _mm_unpackhi_epi16(a.val, b.val); - _mm_storeu_si128((__m128i*)(ptr), t0); - _mm_storeu_si128((__m128i*)(ptr + 8), t1); + __m128i v0 = _mm_unpacklo_epi16(a.val, b.val); + __m128i v1 = _mm_unpackhi_epi16(a.val, b.val); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 8), v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 8), v1); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + } } inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, - const v_uint16x8& b, - const v_uint16x8& c ) + const v_uint16x8& b, const v_uint16x8& c, + hal::StoreMode mode = hal::STORE_UNALIGNED) { #if CV_SSE4_1 - static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); - static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); - static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); @@ -2096,10 +2171,6 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24); __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24); __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24); - - _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 8), v1); - _mm_storeu_si128((__m128i*)(ptr + 16), v2); #else __m128i z = _mm_setzero_si128(); __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val); @@ -2128,15 +2199,30 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10)); __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6)); __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2)); - - _mm_storeu_si128((__m128i*)(ptr), v0); - _mm_storeu_si128((__m128i*)(ptr + 8), 
v1); - _mm_storeu_si128((__m128i*)(ptr + 16), v2); #endif + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 8), v1); + _mm_stream_si128((__m128i*)(ptr + 16), v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 8), v1); + _mm_store_si128((__m128i*)(ptr + 16), v2); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); + } } inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, - const v_uint16x8& c, const v_uint16x8& d) + const v_uint16x8& c, const v_uint16x8& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { // a0 a1 a2 a3 .... // b0 b1 b2 b3 .... @@ -2148,27 +2234,58 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16 __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ... __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ... - __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ... - __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ... + __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ... + __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ... __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ... - _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 8), v2); - _mm_storeu_si128((__m128i*)(ptr + 16), v1); - _mm_storeu_si128((__m128i*)(ptr + 24), v3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 8), v1); + _mm_stream_si128((__m128i*)(ptr + 16), v2); + _mm_stream_si128((__m128i*)(ptr + 24), v3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 8), v1); + _mm_store_si128((__m128i*)(ptr + 16), v2); + _mm_store_si128((__m128i*)(ptr + 24), v3); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); + _mm_storeu_si128((__m128i*)(ptr + 24), v3); + } } -inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b ) +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0 = _mm_unpacklo_epi32(a.val, b.val); - __m128i t1 = _mm_unpackhi_epi32(a.val, b.val); + __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); + __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); - _mm_storeu_si128((__m128i*)ptr, t0); - _mm_storeu_si128((__m128i*)(ptr + 4), t1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 4), v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 4), v1); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 4), v1); + } } inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, - const v_uint32x4& c ) + const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED) { v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3; v_transpose4x4(a, b, c, z, u0, u1, u2, u3); @@ -2177,35 +2294,82 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8)); __m128i v2 = 
_mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4)); - _mm_storeu_si128((__m128i*)ptr, v0); - _mm_storeu_si128((__m128i*)(ptr + 4), v1); - _mm_storeu_si128((__m128i*)(ptr + 8), v2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 4), v1); + _mm_stream_si128((__m128i*)(ptr + 8), v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 4), v1); + _mm_store_si128((__m128i*)(ptr + 8), v2); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 4), v1); + _mm_storeu_si128((__m128i*)(ptr + 8), v2); + } } inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, - const v_uint32x4& c, const v_uint32x4& d) + const v_uint32x4& c, const v_uint32x4& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - v_uint32x4 t0, t1, t2, t3; - v_transpose4x4(a, b, c, d, t0, t1, t2, t3); - v_store(ptr, t0); - v_store(ptr + 4, t1); - v_store(ptr + 8, t2); - v_store(ptr + 12, t3); + v_uint32x4 v0, v1, v2, v3; + v_transpose4x4(a, b, c, d, v0, v1, v2, v3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0.val); + _mm_stream_si128((__m128i*)(ptr + 4), v1.val); + _mm_stream_si128((__m128i*)(ptr + 8), v2.val); + _mm_stream_si128((__m128i*)(ptr + 12), v3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0.val); + _mm_store_si128((__m128i*)(ptr + 4), v1.val); + _mm_store_si128((__m128i*)(ptr + 8), v2.val); + _mm_store_si128((__m128i*)(ptr + 12), v3.val); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0.val); + _mm_storeu_si128((__m128i*)(ptr + 4), v1.val); + _mm_storeu_si128((__m128i*)(ptr + 8), v2.val); + _mm_storeu_si128((__m128i*)(ptr + 12), v3.val); + } } // 2-channel, float only -inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b) +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - // a0 a1 a2 a3 ... - // b0 b1 b2 b3 ... 
- __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1 - __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3 + __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1 + __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3 - _mm_storeu_ps(ptr, u0); - _mm_storeu_ps((ptr + 4), u1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_ps(ptr, v0); + _mm_stream_ps(ptr + 4, v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_ps(ptr, v0); + _mm_store_ps(ptr + 4, v1); + } + else + { + _mm_storeu_ps(ptr, v0); + _mm_storeu_ps(ptr + 4, v1); + } } -inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED) { __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0)); __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0)); @@ -2217,13 +2381,29 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32 __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3)); __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0)); - _mm_storeu_ps(ptr + 0, v0); - _mm_storeu_ps(ptr + 4, v1); - _mm_storeu_ps(ptr + 8, v2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_ps(ptr, v0); + _mm_stream_ps(ptr + 4, v1); + _mm_stream_ps(ptr + 8, v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_ps(ptr, v0); + _mm_store_ps(ptr + 4, v1); + _mm_store_ps(ptr + 8, v2); + } + else + { + _mm_storeu_ps(ptr, v0); + _mm_storeu_ps(ptr + 4, v1); + _mm_storeu_ps(ptr + 8, v2); + } } inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, - const v_float32x4& c, const v_float32x4& d) + const v_float32x4& c, const v_float32x4& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { __m128 u0 = _mm_unpacklo_ps(a.val, c.val); __m128 u1 = _mm_unpacklo_ps(b.val, d.val); @@ -2234,43 +2414,109 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32 __m128 v1 = _mm_unpackhi_ps(u0, u1); __m128 v3 = _mm_unpackhi_ps(u2, u3); - _mm_storeu_ps(ptr + 0, v0); - _mm_storeu_ps(ptr + 4, v1); - _mm_storeu_ps(ptr + 8, v2); - _mm_storeu_ps(ptr + 12, v3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_ps(ptr, v0); + _mm_stream_ps(ptr + 4, v1); + _mm_stream_ps(ptr + 8, v2); + _mm_stream_ps(ptr + 12, v3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_ps(ptr, v0); + _mm_store_ps(ptr + 4, v1); + _mm_store_ps(ptr + 8, v2); + _mm_store_ps(ptr + 12, v3); + } + else + { + _mm_storeu_ps(ptr, v0); + _mm_storeu_ps(ptr + 4, v1); + _mm_storeu_ps(ptr + 8, v2); + _mm_storeu_ps(ptr + 12, v3); + } } -inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b) +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); - __m128i t1 = _mm_unpackhi_epi64(a.val, b.val); + __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i v1 = _mm_unpackhi_epi64(a.val, b.val); - _mm_storeu_si128((__m128i*)ptr, t0); - _mm_storeu_si128((__m128i*)(ptr + 2), t1); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 2), v1); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 2), v1); + } + else + { + 
_mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 2), v1); + } } -inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c) +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); - __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val)); - __m128i t2 = _mm_unpackhi_epi64(b.val, c.val); + __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val)); + __m128i v2 = _mm_unpackhi_epi64(b.val, c.val); - _mm_storeu_si128((__m128i*)ptr, t0); - _mm_storeu_si128((__m128i*)(ptr + 2), t1); - _mm_storeu_si128((__m128i*)(ptr + 4), t2); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 2), v1); + _mm_stream_si128((__m128i*)(ptr + 4), v2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 2), v1); + _mm_store_si128((__m128i*)(ptr + 4), v2); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 2), v1); + _mm_storeu_si128((__m128i*)(ptr + 4), v2); + } } -inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d) +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, const v_uint64x2& d, + hal::StoreMode mode = hal::STORE_UNALIGNED) { - __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); - __m128i t1 = _mm_unpacklo_epi64(c.val, d.val); - __m128i t2 = _mm_unpackhi_epi64(a.val, b.val); - __m128i t3 = _mm_unpackhi_epi64(c.val, d.val); + __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i v1 = _mm_unpacklo_epi64(c.val, d.val); + __m128i v2 = _mm_unpackhi_epi64(a.val, b.val); + __m128i v3 = _mm_unpackhi_epi64(c.val, d.val); - _mm_storeu_si128((__m128i*)ptr, t0); - _mm_storeu_si128((__m128i*)(ptr + 2), t1); - _mm_storeu_si128((__m128i*)(ptr + 4), t2); - _mm_storeu_si128((__m128i*)(ptr + 6), t3); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm_stream_si128((__m128i*)(ptr), v0); + _mm_stream_si128((__m128i*)(ptr + 2), v1); + _mm_stream_si128((__m128i*)(ptr + 4), v2); + _mm_stream_si128((__m128i*)(ptr + 6), v3); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm_store_si128((__m128i*)(ptr), v0); + _mm_store_si128((__m128i*)(ptr + 2), v1); + _mm_store_si128((__m128i*)(ptr + 4), v2); + _mm_store_si128((__m128i*)(ptr + 6), v3); + } + else + { + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 2), v1); + _mm_storeu_si128((__m128i*)(ptr + 4), v2); + _mm_storeu_si128((__m128i*)(ptr + 6), v3); + } } #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ @@ -2298,27 +2544,30 @@ inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpv c0 = v_reinterpret_as_##suffix0(c1); \ d0 = v_reinterpret_as_##suffix0(d1); \ } \ -inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode = hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ - v_store_interleave((_Tp1*)ptr, a1, b1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ } \ 
-inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ - v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ } \ inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ - const _Tpvec0& c0, const _Tpvec0& d0 ) \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode mode = hal::STORE_UNALIGNED ) \ { \ _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ - v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ } OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 9ad8234..52bc2cc 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -249,6 +249,10 @@ inline void v_store(_Tp* ptr, const _Tpvec& a) \ { st(a.val, 0, ptr); } \ inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \ { st_a(a.val, 0, ptr); } \ +inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \ +{ st_a(a.val, 0, ptr); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ +{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { vec_st_l8(a.val, ptr); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ @@ -281,13 +285,16 @@ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \ inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \ _Tpvec& c, _Tpvec& d) \ { vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \ -inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \ +inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { vec_st_interleave(a.val, b.val, ptr); } \ inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \ - const _Tpvec& b, const _Tpvec& c) \ + const _Tpvec& b, const _Tpvec& c, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { vec_st_interleave(a.val, b.val, c.val, ptr); } \ inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \ - const _Tpvec& c, const _Tpvec& d) \ + const _Tpvec& c, const _Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { vec_st_interleave(a.val, b.val, c.val, d.val, ptr); } OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16) diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index 354cc00..b158103 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -515,17 +515,17 @@ void exp32f( const float *_x, float *y, int n ) #if CV_SIMD const int VECSZ = v_float32::nlanes; - static const v_float32 vprescale = vx_setall_f32((float)exp_prescale); - static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); - static const v_float32 vminval = vx_setall_f32(minval); - 
static const v_float32 vmaxval = vx_setall_f32(maxval); + const v_float32 vprescale = vx_setall_f32((float)exp_prescale); + const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); + const v_float32 vminval = vx_setall_f32(minval); + const v_float32 vmaxval = vx_setall_f32(maxval); - static const v_float32 vA1 = vx_setall_f32((float)A1); - static const v_float32 vA2 = vx_setall_f32((float)A2); - static const v_float32 vA3 = vx_setall_f32((float)A3); - static const v_float32 vA4 = vx_setall_f32((float)A4); + const v_float32 vA1 = vx_setall_f32((float)A1); + const v_float32 vA2 = vx_setall_f32((float)A2); + const v_float32 vA3 = vx_setall_f32((float)A3); + const v_float32 vA4 = vx_setall_f32((float)A4); - static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); bool y_aligned = (size_t)(void*)y % 32 == 0; for( ; i < n; i += VECSZ*2 ) @@ -627,18 +627,18 @@ void exp64f( const double *_x, double *y, int n ) #if CV_SIMD_64F const int VECSZ = v_float64::nlanes; - static const v_float64 vprescale = vx_setall_f64(exp_prescale); - static const v_float64 vpostscale = vx_setall_f64(exp_postscale); - static const v_float64 vminval = vx_setall_f64(minval); - static const v_float64 vmaxval = vx_setall_f64(maxval); - - static const v_float64 vA1 = vx_setall_f64(A1); - static const v_float64 vA2 = vx_setall_f64(A2); - static const v_float64 vA3 = vx_setall_f64(A3); - static const v_float64 vA4 = vx_setall_f64(A4); - static const v_float64 vA5 = vx_setall_f64(A5); - - static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + const v_float64 vprescale = vx_setall_f64(exp_prescale); + const v_float64 vpostscale = vx_setall_f64(exp_postscale); + const v_float64 vminval = vx_setall_f64(minval); + const v_float64 vmaxval = vx_setall_f64(maxval); + + const v_float64 vA1 = vx_setall_f64(A1); + const v_float64 vA2 = vx_setall_f64(A2); + const v_float64 vA3 = vx_setall_f64(A3); + const v_float64 vA4 = vx_setall_f64(A4); + const v_float64 vA5 = vx_setall_f64(A5); + + const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); bool y_aligned = (size_t)(void*)y % 32 == 0; for( ; i < n; i += VECSZ*2 ) @@ -1024,13 +1024,13 @@ void log32f( const float *_x, float *y, int n ) #if CV_SIMD const int VECSZ = v_float32::nlanes; - static const v_float32 vln2 = vx_setall_f32((float)ln_2); - static const v_float32 v1 = vx_setall_f32(1.f); - static const v_float32 vshift = vx_setall_f32(-1.f/512); + const v_float32 vln2 = vx_setall_f32((float)ln_2); + const v_float32 v1 = vx_setall_f32(1.f); + const v_float32 vshift = vx_setall_f32(-1.f/512); - static const v_float32 vA0 = vx_setall_f32(A0); - static const v_float32 vA1 = vx_setall_f32(A1); - static const v_float32 vA2 = vx_setall_f32(A2); + const v_float32 vA0 = vx_setall_f32(A0); + const v_float32 vA1 = vx_setall_f32(A1); + const v_float32 vA2 = vx_setall_f32(A2); for( ; i < n; i += VECSZ ) { @@ -1097,9 +1097,9 @@ void log64f( const double *x, double *y, int n ) #if CV_SIMD_64F const int VECSZ = v_float64::nlanes; - static const v_float64 vln2 = vx_setall_f64(ln_2); + const v_float64 vln2 = vx_setall_f64(ln_2); - static const v_float64 + const v_float64 vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1), vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3), vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5), diff --git a/modules/core/src/merge.cpp b/modules/core/src/merge.cpp index a57d3bb..9c52f0e 100644 --- a/modules/core/src/merge.cpp +++ b/modules/core/src/merge.cpp @@ -9,21 +9,58 @@ namespace cv { namespace hal { #if 
CV_SIMD +/* + The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following: + on IA there are instructions movntps and such to which + v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped. + Those instructions write directly into memory w/o touching the cache, + which results in dramatic speed improvements, especially on + large arrays (FullHD, 4K etc.). + + Those intrinsics require the destination address to be aligned + by 16/32 bytes (with SSE2 and AVX2, respectively). + So we potentially split the processing into 3 stages: + 1) the optional prefix part [0:i0), where we use simple unaligned stores. + 2) the optional main part [i0:len - VECSZ], where we use "nocache" mode. + But in some cases we have to use unaligned stores in this part. + 3) the optional suffix part (the tail) (len - VECSZ:len), where we switch back to "unaligned" mode + to process the remaining (fewer than VECSZ) elements. + In principle there can be very poorly aligned data where there is no main part. + In that case we set i0=0 and use unaligned stores for the whole array. +*/ template<typename T, typename VecT> static void vecmerge_( const T** src, T* dst, int len, int cn ) { - int i; + const int VECSZ = VecT::nlanes; + int i, i0 = 0; const T* src0 = src[0]; const T* src1 = src[1]; - const int VECSZ = VecT::nlanes; + int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T))); + hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE; + if( r != 0 ) + { + mode = hal::STORE_UNALIGNED; + if( r % cn == 0 && len > VECSZ ) + i0 = VECSZ - (r / cn); + } + if( cn == 2 ) { for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a = vx_load(src0 + i), b = vx_load(src1 + i); - v_store_interleave(dst + i*cn, a, b); + v_store_interleave(dst + i*cn, a, b, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } else if( cn == 3 ) @@ -31,9 +68,18 @@ vecmerge_( const T** src, T* dst, int len, int cn ) const T* src2 = src[2]; for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i); - v_store_interleave(dst + i*cn, a, b, c); + v_store_interleave(dst + i*cn, a, b, c, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } else @@ -43,10 +89,19 @@ vecmerge_( const T** src, T* dst, int len, int cn ) const T* src3 = src[3]; for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a = vx_load(src0 + i), b = vx_load(src1 + i); VecT c = vx_load(src2 + i), d = vx_load(src3 + i); - v_store_interleave(dst + i*cn, a, b, c, d); + v_store_interleave(dst + i*cn, a, b, c, d, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } vx_cleanup(); diff --git a/modules/core/src/split.cpp b/modules/core/src/split.cpp index 6f7b61a..78d8daa 100644 --- a/modules/core/src/split.cpp +++ b/modules/core/src/split.cpp @@ -9,23 +9,46 @@ namespace cv { namespace hal { #if CV_SIMD +// see the comments for vecmerge_ in merge.cpp template<typename T, typename VecT> static void vecsplit_( const T* src, T** dst, int len, int cn ) { - int i; + const int VECSZ = VecT::nlanes; + int i, i0 = 0; T* dst0 = dst[0]; T* dst1 = dst[1]; - const int VECSZ = VecT::nlanes; + int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T))); + int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T))); +
int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0; + int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0; + + hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE; + if( (r0|r1|r2|r3) != 0 ) + { + mode = hal::STORE_UNALIGNED; + if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ ) + i0 = VECSZ - (r0 / cn); + } + if( cn == 2 ) { for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a, b; v_load_deinterleave(src + i*cn, a, b); - v_store(dst0 + i, a); - v_store(dst1 + i, b); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } else if( cn == 3 ) @@ -33,12 +56,21 @@ vecsplit_( const T* src, T** dst, int len, int cn ) T* dst2 = dst[2]; for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a, b, c; v_load_deinterleave(src + i*cn, a, b, c); - v_store(dst0 + i, a); - v_store(dst1 + i, b); - v_store(dst2 + i, c); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + v_store(dst2 + i, c, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } else @@ -48,13 +80,22 @@ vecsplit_( const T* src, T** dst, int len, int cn ) T* dst3 = dst[3]; for( i = 0; i < len; i += VECSZ ) { - i = std::min( len - VECSZ, i ); + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } VecT a, b, c, d; v_load_deinterleave(src + i*cn, a, b, c, d); - v_store(dst0 + i, a); - v_store(dst1 + i, b); - v_store(dst2 + i, c); - v_store(dst3 + i, d); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + v_store(dst2 + i, c, mode); + v_store(dst3 + i, d, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } } } vx_cleanup();
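
The comment block added to merge.cpp above is the heart of the patch, so here is a minimal, self-contained sketch (not part of the patch) of the same prefix/main/tail idea applied to a plain single-channel float copy. The helper name copy_f32_nocache is hypothetical; the sketch only assumes the hal::StoreMode enum and the v_store(ptr, vec, mode) overload introduced by this patch, and it uses a deliberately conservative len > 2*VECSZ guard so that the prefix, the non-temporal main part and the tail never interact.

```cpp
#include <opencv2/core/hal/intrin.hpp>
#include <algorithm>

// Copy len floats from src to dst, using non-temporal ("no cache") stores for
// the aligned middle of the destination, in the spirit of vecmerge_/vecsplit_.
static void copy_f32_nocache(const float* src, float* dst, int len)
{
    using namespace cv;
#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    if( len >= VECSZ )
    {
        int i0 = 0;
        // byte offset of dst from the previous VECSZ*sizeof(float) boundary
        int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(float)));
        hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
        if( r != 0 )
        {
            mode = hal::STORE_UNALIGNED;             // prefix [0:i0) uses plain stores
            if( r % sizeof(float) == 0 && len > 2*VECSZ )
                i0 = VECSZ - (int)(r/sizeof(float)); // first element with an aligned address
        }
        for( int i = 0; i < len; i += VECSZ )
        {
            if( i > len - VECSZ )
            {
                i = len - VECSZ;                     // tail: redo the last full vector
                mode = hal::STORE_UNALIGNED;
            }
            v_store(dst + i, vx_load(src + i), mode);
            if( i < i0 )
            {
                i = i0 - VECSZ;                      // next iteration starts exactly at i0
                mode = hal::STORE_ALIGNED_NOCACHE;
            }
        }
        vx_cleanup();
        return;
    }
#endif
    std::copy(src, src + len, dst);                  // scalar fallback for short arrays
}
```

split() and merge() do essentially the same thing per channel, except that the stores are (de)interleaved and, in vecsplit_, the alignment remainder has to be identical for all destination pointers, which is what the r0|r1|r2|r3 check above verifies.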
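For completeness, a caller-side view of the extended interface: every v_store/v_store_interleave overload above now takes an optional hal::StoreMode that defaults to STORE_UNALIGNED, so existing callers compile unchanged, while STORE_ALIGNED_NOCACHE maps to the streaming stores (_mm_stream_ps/_mm_stream_si128 on SSE, _mm256_stream_* on AVX) and the VSX backend simply falls back to ordinary aligned stores. The snippet below is only an illustrative sketch, not code from the patch; the function name and buffer are made up, and the destination is over-aligned to 64 bytes so the example stays valid for any vector width.

```cpp
#include <opencv2/core/hal/intrin.hpp>

static void demo_store_modes()
{
    using namespace cv;
#if CV_SIMD
    // Streaming stores require the pointer to be aligned to the vector size
    // (16 bytes for SSE, 32 for AVX); 64 covers every currently supported width.
    float CV_DECL_ALIGNED(64) dst[2*v_float32::nlanes];

    v_float32 a = vx_setall_f32(1.f), b = vx_setall_f32(2.f);

    v_store_interleave(dst, a, b);                              // default: unaligned, old behaviour
    v_store_interleave(dst, a, b, hal::STORE_ALIGNED);          // aligned store
    v_store_interleave(dst, a, b, hal::STORE_ALIGNED_NOCACHE);  // non-temporal store, bypasses the cache

    vx_cleanup();
#endif
}
```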