From 43d925609613f272bb5924d4e948e509c56ce2e2 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov <terfendail@mediana.jetos.com>
Date: Wed, 5 Sep 2018 16:17:43 +0300
Subject: [PATCH] Replaced core module calls to universal intrinsics with wide
 universal intrinsics

---
 modules/core/src/copy.cpp      |  30 +++----
 modules/core/src/lapack.cpp    |  89 ++++++++++----------
 modules/core/src/mathfuncs.cpp | 121 +++++++++++++--------------
 modules/core/src/matmul.cpp    | 180 ++++++++++++++++++-----------------------
 4 files changed, 204 insertions(+), 216 deletions(-)
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 38264cc..98ab15d 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -90,20 +90,21 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
         const uchar* src = (const uchar*)_src;
         uchar* dst = (uchar*)_dst;
         int x = 0;
-        #if CV_SIMD128
+        #if CV_SIMD
         {
-            v_uint8x16 v_zero = v_setzero_u8();
+            v_uint8 v_zero = vx_setzero_u8();
 
-            for( ; x <= size.width - 16; x += 16 )
+            for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
             {
-                v_uint8x16 v_src   = v_load(src  + x),
-                           v_dst   = v_load(dst  + x),
-                           v_nmask = v_load(mask + x) == v_zero;
+                v_uint8 v_src   = vx_load(src  + x),
+                        v_dst   = vx_load(dst  + x),
+                        v_nmask = vx_load(mask + x) == v_zero;
 
                 v_dst = v_select(v_nmask, v_dst, v_src);
                 v_store(dst + x, v_dst);
             }
         }
+        vx_cleanup();
         #endif
         for( ; x < size.width; x++ )
             if( mask[x] )
@@ -121,25 +122,26 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
         const ushort* src = (const ushort*)_src;
         ushort* dst = (ushort*)_dst;
         int x = 0;
-        #if CV_SIMD128
+        #if CV_SIMD
         {
-            v_uint8x16 v_zero = v_setzero_u8();
+            v_uint8 v_zero = vx_setzero_u8();
 
-            for( ; x <= size.width - 16; x += 16 )
+            for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
             {
-                v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
-                           v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);
+                v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes),
+                         v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes);
 
-                v_uint8x16 v_nmask1, v_nmask2;
-                v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
+                v_uint8 v_nmask1, v_nmask2;
+                v_uint8 v_nmask = vx_load(mask + x) == v_zero;
                 v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
 
                 v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
                 v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
                 v_store(dst + x, v_dst1);
-                v_store(dst + x + 8, v_dst2);
+                v_store(dst + x + v_uint16::nlanes, v_dst2);
             }
         }
+        vx_cleanup();
         #endif
         for( ; x < size.width; x++ )
             if( mask[x] )
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index 3fe2d77..649f6ba 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -277,40 +277,42 @@ template<typename T> struct VBLAS
     int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
 };
 
-#if CV_SIMD128
+#if CV_SIMD
 template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
 {
-    if( n < 8 )
+    if( n < 2*v_float32::nlanes )
         return 0;
     int k = 0;
-    v_float32x4 s0 = v_setzero_f32();
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 s0 = vx_setzero_f32();
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);
 
         s0 += a0 * b0;
     }
     *result = v_reduce_sum(s0);
+    vx_cleanup();
     return k;
 }
 
 
 template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, float s) const
 {
-    if( n < 4 )
+    if( n < v_float32::nlanes)
         return 0;
     int k = 0;
-    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
-        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
-        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);
+        v_float32 t0 = (a0 * c4) + (b0 * s4);
+        v_float32 t1 = (b0 * c4) - (a0 * s4);
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
+    vx_cleanup();
     return k;
 }
 
@@ -318,17 +320,17 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
 template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c, float s,
                                              float* anorm, float* bnorm) const
 {
-    if( n < 4 )
+    if( n < v_float32::nlanes)
         return 0;
     int k = 0;
-    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
-    v_float32x4 sa = v_setzero_f32(), sb = v_setzero_f32();
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
+    v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32();
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
-        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
-        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);
+        v_float32 t0 = (a0 * c4) + (b0 * s4);
+        v_float32 t1 = (b0 * c4) - (a0 * s4);
         v_store(a + k, t0);
         v_store(b + k, t1);
         sa += t0 + t0;
@@ -336,26 +338,34 @@ template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c,
     }
     *anorm = v_reduce_sum(sa);
     *bnorm = v_reduce_sum(sb);
+    vx_cleanup();
     return k;
 }
 
-#if CV_SIMD128_64F
+#if CV_SIMD_64F
 template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
 {
-    if( n < 4 )
+    if( n < 2*v_float64::nlanes )
         return 0;
     int k = 0;
-    v_float64x2 s0 = v_setzero_f64();
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 s0 = vx_setzero_f64();
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);
 
         s0 += a0 * b0;
     }
-    double sbuf[2];
+    // v_float64 is no longer fixed at two lanes, so size the spill buffer by
+    // the lane count and add up every lane; a two-element buffer would be
+    // overrun by v_store and the upper lanes would be dropped from the sum.
+    double sbuf[v_float64::nlanes];
     v_store(sbuf, s0);
-    *result = sbuf[0] + sbuf[1];
+    double sum = sbuf[0];
+    for( int j = 1; j < v_float64::nlanes; j++ )
+        sum += sbuf[j];
+    *result = sum;
     vx_cleanup();
     return k;
 }
 
@@ -363,16 +367,17 @@ template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n
 template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double c, double s) const
 {
     int k = 0;
-    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
-        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
-        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);
+        v_float64 t0 = (a0 * c2) + (b0 * s2);
+        v_float64 t1 = (b0 * c2) - (a0 * s2);
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
+    vx_cleanup();
     return k;
 }
 
@@ -381,14 +386,14 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
                                               double* anorm, double* bnorm) const
 {
     int k = 0;
-    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
-    v_float64x2 sa = v_setzero_f64(), sb = v_setzero_f64();
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
+    v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64();
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
-        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
-        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);
+        v_float64 t0 = (a0 * c2) + (b0 * s2);
+        v_float64 t1 = (b0 * c2) - (a0 * s2);
         v_store(a + k, t0);
         v_store(b + k, t1);
         sa += t0 * t0;
@@ -401,8 +406,9 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
     *bnorm = bbuf[0] + bbuf[1];
+    vx_cleanup();
     return k;
 }
-#endif //CV_SIMD128_64F
-#endif //CV_SIMD128
+#endif //CV_SIMD_64F
+#endif //CV_SIMD
 
 template<typename _Tp> void
 JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index e8067b5..ba7f688 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -606,17 +606,15 @@ void polarToCart( InputArray src1, InputArray src2,
                 {
                     k = 0;
 
-#if CV_SIMD128
-                    if( hasSIMD128() )
+#if CV_SIMD
+                    int cWidth = v_float32::nlanes;
+                    for( ; k <= len - cWidth; k += cWidth )
                     {
-                        int cWidth = v_float32x4::nlanes;
-                        for( ; k <= len - cWidth; k += cWidth )
-                        {
-                            v_float32x4 v_m = v_load(mag + k);
-                            v_store(x + k, v_load(x + k) * v_m);
-                            v_store(y + k, v_load(y + k) * v_m);
-                        }
+                        v_float32 v_m = vx_load(mag + k);
+                        v_store(x + k, vx_load(x + k) * v_m);
+                        v_store(y + k, vx_load(y + k) * v_m);
                     }
+                    vx_cleanup();
 #endif
 
                     for( ; k < len; k++ )
@@ -735,7 +733,7 @@ struct iPow_SIMD
     }
 };
 
-#if CV_SIMD128
+#if CV_SIMD
 
 template <>
 struct iPow_SIMD<uchar, int>
@@ -743,13 +741,13 @@ struct iPow_SIMD<uchar, int>
     int operator() ( const uchar * src, uchar * dst, int len, int power )
     {
         int i = 0;
-        v_uint32x4 v_1 = v_setall_u32(1u);
+        v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
         {
-            v_uint32x4 v_a1 = v_1, v_a2 = v_1;
-            v_uint16x8 v = v_load_expand(src + i);
-            v_uint32x4 v_b1, v_b2;
+            v_uint32 v_a1 = v_1, v_a2 = v_1;
+            v_uint16 v = vx_load_expand(src + i);
+            v_uint32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
             int p = power;
 
@@ -771,6 +769,7 @@ struct iPow_SIMD<uchar, int>
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -782,13 +781,13 @@ struct iPow_SIMD<schar, int>
     int operator() ( const schar * src, schar * dst, int len, int power)
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int16x8 v = v_load_expand(src + i);
-            v_int32x4 v_b1, v_b2;
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int16 v = vx_load_expand(src + i);
+            v_int32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
             int p = power;
 
@@ -810,6 +809,7 @@ struct iPow_SIMD<schar, int>
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -821,13 +821,13 @@ struct iPow_SIMD<ushort, int>
     int operator() ( const ushort * src, ushort * dst, int len, int power)
     {
         int i = 0;
-        v_uint32x4 v_1 = v_setall_u32(1u);
+        v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
         {
-            v_uint32x4 v_a1 = v_1, v_a2 = v_1;
-            v_uint16x8 v = v_load(src + i);
-            v_uint32x4 v_b1, v_b2;
+            v_uint32 v_a1 = v_1, v_a2 = v_1;
+            v_uint16 v = vx_load(src + i);
+            v_uint32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
             int p = power;
 
@@ -849,6 +849,7 @@ struct iPow_SIMD<ushort, int>
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -860,13 +861,13 @@ struct iPow_SIMD<short, int>
     int operator() ( const short * src, short * dst, int len, int power)
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int16x8 v = v_load(src + i);
-            v_int32x4 v_b1, v_b2;
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int16 v = vx_load(src + i);
+            v_int32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
             int p = power;
 
@@ -888,6 +889,7 @@ struct iPow_SIMD<short, int>
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -899,12 +901,12 @@ struct iPow_SIMD<int, int>
     int operator() ( const int * src, int * dst, int len, int power)
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);
             int p = power;
 
             while( p > 1 )
@@ -923,8 +925,9 @@ struct iPow_SIMD<int, int>
             v_a2 *= v_b2;
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + 4, v_a2);
+            v_store(dst + i + v_int32::nlanes, v_a2);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -936,12 +939,12 @@ struct iPow_SIMD<float, float>
     int operator() ( const float * src, float * dst, int len, int power)
     {
         int i = 0;
-        v_float32x4 v_1 = v_setall_f32(1.f);
+        v_float32 v_1 = vx_setall_f32(1.f);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
         {
-            v_float32x4 v_a1 = v_1, v_a2 = v_1;
-            v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+            v_float32 v_a1 = v_1, v_a2 = v_1;
+            v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);
             int p = std::abs(power);
             if( power < 0 )
             {
@@ -965,26 +968,27 @@ struct iPow_SIMD<float, float>
             v_a2 *= v_b2;
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + 4, v_a2);
+            v_store(dst + i + v_float32::nlanes, v_a2);
         }
+        vx_cleanup();
 
         return i;
     }
 };
 
-#if CV_SIMD128_64F
+#if CV_SIMD_64F
 template <>
 struct iPow_SIMD<double, double>
 {
     int operator() ( const double * src, double * dst, int len, int power)
     {
         int i = 0;
-        v_float64x2 v_1 = v_setall_f64(1.);
+        v_float64 v_1 = vx_setall_f64(1.);
 
-        for ( ; i <= len - 4; i += 4)
+        for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
         {
-            v_float64x2 v_a1 = v_1, v_a2 = v_1;
-            v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2);
+            v_float64 v_a1 = v_1, v_a2 = v_1;
+            v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);
             int p = std::abs(power);
             if( power < 0 )
             {
@@ -1008,8 +1012,9 @@ struct iPow_SIMD<double, double>
             v_a2 *= v_b2;
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + 2, v_a2);
+            v_store(dst + i + v_float64::nlanes, v_a2);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -1594,9 +1599,9 @@ void patchNaNs( InputOutputArray _a, double _val )
     Cv32suf val;
     val.f = (float)_val;
 
-#if CV_SIMD128
-    v_int32x4 v_mask1 = v_setall_s32(0x7fffffff), v_mask2 = v_setall_s32(0x7f800000);
-    v_int32x4 v_val = v_setall_s32(val.i);
+#if CV_SIMD
+    v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
+    v_int32 v_val = vx_setall_s32(val.i);
 #endif
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -1604,18 +1609,16 @@ void patchNaNs( InputOutputArray _a, double _val )
         int* tptr = ptrs[0];
         size_t j = 0;
 
-#if CV_SIMD128
-        if( hasSIMD128() )
+#if CV_SIMD
+        size_t cWidth = (size_t)v_int32::nlanes;
+        for ( ; j + cWidth <= len; j += cWidth)
         {
-            size_t cWidth = (size_t)v_int32x4::nlanes;
-            for ( ; j + cWidth <= len; j += cWidth)
-            {
-                v_int32x4 v_src = v_load(tptr + j);
-                v_int32x4 v_cmp_mask = v_mask2 < (v_src & v_mask1);
-                v_int32x4 v_dst = v_select(v_cmp_mask, v_val, v_src);
-                v_store(tptr + j, v_dst);
-            }
+            v_int32 v_src = vx_load(tptr + j);
+            v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
+            v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
+            v_store(tptr + j, v_dst);
         }
+        vx_cleanup();
 #endif
 
         for( ; j < len; j++ )
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index 7cd89c6..4f85e06 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -2310,18 +2310,12 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
 {
     float alpha = *_alpha;
     int i = 0;
-#if CV_SIMD128
-    if (hasSIMD128())
-    {
-        v_float32x4 v_alpha = v_setall_f32(alpha);
-        const int cWidth = v_float32x4::nlanes;
-        for (; i <= len - cWidth; i += cWidth)
-        {
-            v_float32x4 v_src1 = v_load(src1 + i);
-            v_float32x4 v_src2 = v_load(src2 + i);
-            v_store(dst + i, (v_src1 * v_alpha) + v_src2);
-        }
-    }
+#if CV_SIMD
+    v_float32 v_alpha = vx_setall_f32(alpha);
+    const int cWidth = v_float32::nlanes;
+    for (; i <= len - cWidth; i += cWidth)
+        v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
+    vx_cleanup();
 #endif
     for (; i < len; i++)
         dst[i] = src1[i] * alpha + src2[i];
@@ -2333,22 +2327,12 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
 {
     double alpha = *_alpha;
     int i = 0;
-#if CV_SIMD128_64F
-    if (hasSIMD128())
-    {
-        v_float64x2 a2 = v_setall_f64(alpha);
-        const int cWidth = v_float64x2::nlanes;
-        for (; i <= len - cWidth * 2; i += cWidth * 2)
-        {
-            v_float64x2 x0, x1, y0, y1, t0, t1;
-            x0 = v_load(src1 + i); x1 = v_load(src1 + i + cWidth);
-            y0 = v_load(src2 + i); y1 = v_load(src2 + i + cWidth);
-            t0 = x0 * a2 + y0;
-            t1 = x1 * a2 + y1;
-            v_store(dst + i, t0);
-            v_store(dst + i + cWidth, t1);
-        }
-    }
+#if CV_SIMD_64F
+    v_float64 a2 = vx_setall_f64(alpha);
+    const int cWidth = v_float64::nlanes;
+    for (; i <= len - cWidth; i += cWidth)
+        v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
+    vx_cleanup();
 #endif
     for (; i < len; i++)
         dst[i] = src1[i] * alpha + src2[i];
@@ -3025,42 +3009,40 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
 #endif
     int i = 0;
 
-#if CV_SIMD128
-    if (hasSIMD128())
+#if CV_SIMD
+    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;
+
+    while (i < len0)
     {
-        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
+        blockSize = std::min(len0 - i, blockSize0);
+        v_int32 v_sum = vx_setzero_s32();
+        const int cWidth = v_uint16::nlanes;
 
-        while (i < len0)
+        int j = 0;
+        for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_int32x4 v_sum = v_setzero_s32();
-            const int cWidth = v_uint16x8::nlanes;
+            v_uint16 v_src10, v_src20, v_src11, v_src21;
+            v_expand(vx_load(src1 + j), v_src10, v_src11);
+            v_expand(vx_load(src2 + j), v_src20, v_src21);
 
-            int j = 0;
-            for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
-            {
-                v_uint16x8 v_src10, v_src20, v_src11, v_src21;
-                v_expand(v_load(src1 + j), v_src10, v_src11);
-                v_expand(v_load(src2 + j), v_src20, v_src21);
-
-                v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
-                v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
-            }
-
-            for (; j <= blockSize - cWidth; j += cWidth)
-            {
-                v_int16x8 v_src10 = v_reinterpret_as_s16(v_load_expand(src1 + j));
-                v_int16x8 v_src20 = v_reinterpret_as_s16(v_load_expand(src2 + j));
+            v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
+            v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
+        }
 
-                v_sum += v_dotprod(v_src10, v_src20);
-            }
-            r += (double)v_reduce_sum(v_sum);
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
+            v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
 
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
+            v_sum += v_dotprod(v_src10, v_src20);
         }
+        r += (double)v_reduce_sum(v_sum);
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #elif CV_NEON
     if( cv::checkHardwareSupport(CV_CPU_NEON) )
     {
@@ -3113,42 +3095,40 @@ static double dotProd_8s(const schar* src1, const schar* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD128
-    if (hasSIMD128())
+#if CV_SIMD
+    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;
+
+    while (i < len0)
     {
-        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
+        blockSize = std::min(len0 - i, blockSize0);
+        v_int32 v_sum = vx_setzero_s32();
+        const int cWidth = v_int16::nlanes;
 
-        while (i < len0)
+        int j = 0;
+        for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_int32x4 v_sum = v_setzero_s32();
-            const int cWidth = v_int16x8::nlanes;
-
-            int j = 0;
-            for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
-            {
-                v_int16x8 v_src10, v_src20, v_src11, v_src21;
-                v_expand(v_load(src1 + j), v_src10, v_src11);
-                v_expand(v_load(src2 + j), v_src20, v_src21);
+            v_int16 v_src10, v_src20, v_src11, v_src21;
+            v_expand(vx_load(src1 + j), v_src10, v_src11);
+            v_expand(vx_load(src2 + j), v_src20, v_src21);
 
-                v_sum += v_dotprod(v_src10, v_src20);
-                v_sum += v_dotprod(v_src11, v_src21);
-            }
-
-            for (; j <= blockSize - cWidth; j += cWidth)
-            {
-                v_int16x8 v_src10 = v_load_expand(src1 + j);
-                v_int16x8 v_src20 = v_load_expand(src2 + j);
+            v_sum += v_dotprod(v_src10, v_src20);
+            v_sum += v_dotprod(v_src11, v_src21);
+        }
 
-                v_sum += v_dotprod(v_src10, v_src20);
-            }
-            r += (double)v_reduce_sum(v_sum);
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_int16 v_src10 = vx_load_expand(src1 + j);
+            v_int16 v_src20 = vx_load_expand(src2 + j);
 
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
+            v_sum += v_dotprod(v_src10, v_src20);
         }
+        r += (double)v_reduce_sum(v_sum);
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #elif CV_NEON
     if( cv::checkHardwareSupport(CV_CPU_NEON) )
     {
@@ -3232,28 +3212,26 @@ static double dotProd_32f(const float* src1, const float* src2, int len)
 #endif
     int i = 0;
 
-#if CV_SIMD128
-    if (hasSIMD128())
-    {
-        int len0 = len & -4, blockSize0 = (1 << 13), blockSize;
+#if CV_SIMD
+    int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
 
-        while (i < len0)
-        {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_float32x4 v_sum = v_setzero_f32();
+    while (i < len0)
+    {
+        blockSize = std::min(len0 - i, blockSize0);
+        v_float32 v_sum = vx_setzero_f32();
 
-            int j = 0;
-            int cWidth = v_float32x4::nlanes;
-            for (; j <= blockSize - cWidth; j += cWidth)
-                v_sum = v_muladd(v_load(src1 + j), v_load(src2 + j), v_sum);
+        int j = 0;
+        int cWidth = v_float32::nlanes;
+        for (; j <= blockSize - cWidth; j += cWidth)
+            v_sum = v_muladd(vx_load(src1 + j), vx_load(src2 + j), v_sum);
 
-            r += v_reduce_sum(v_sum);
+        r += v_reduce_sum(v_sum);
 
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #endif
     return r + dotProd_(src1, src2, len - i);
 }
-- 
2.7.4