From ca1a0a1108d9f4e894bf747cd5f0ee2b36cfc25c Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima
Date: Fri, 15 Dec 2017 06:00:35 +0900
Subject: [PATCH] core: remove raw SSE2/NEON implementation from convert.cpp (#9831)

* remove raw SSE2/NEON implementation from convert.cpp
* remove raw implementation from Cvt_SIMD
* remove raw implementation from cvtScale_SIMD
* remove raw implementation from cvtScaleAbs_SIMD
* remove duplicated implementation cvt_
* remove duplicated implementation cvtScale_
* add "from double" version of Cvt_SIMD
* modify the condition of test ConvertScaleAbs
* Update convert.cpp
  fixed crash in cvtScaleAbs(8s=>8u)
* fixed compile error on Win32
* fixed several test failures because of accuracy loss in cvtScale(int=>int)
* fixed NEON implementation of v_cvt_f64(int=>double) intrinsic
* another attempt to fix test failures
* keep trying to fix the test failures and just introduced compile warnings
* fixed one remaining test (subtractScalar)
---
 modules/core/perf/perf_arithm.cpp |   24 +-
 modules/core/src/convert.cpp      | 3613 +++++++++++++------------------
 modules/core/test/test_arithm.cpp |    5 +
 3 files changed, 1297 insertions(+), 2345 deletions(-)

diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp
index c6c2a1b..310cc0b 100644
--- a/modules/core/perf/perf_arithm.cpp
+++ b/modules/core/perf/perf_arithm.cpp
@@ -80,16 +80,18 @@ PERF_TEST_P(Size_MatType, absdiff, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: absdiff can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() absdiff(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, absdiffScalar, TYPICAL_MATS_CORE_ARITHM)
@@ -102,16 +104,18 @@ PERF_TEST_P(Size_MatType, absdiffScalar, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: absdiff can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() absdiff(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, add, TYPICAL_MATS_CORE_ARITHM)
@@ -125,16 +129,18 @@ PERF_TEST_P(Size_MatType, add, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
     declare.time(50);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: add can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() add(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, addScalar, TYPICAL_MATS_CORE_ARITHM)
@@ -147,16 +153,18 @@ PERF_TEST_P(Size_MatType, addScalar, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: add can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() add(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, subtract, TYPICAL_MATS_CORE_ARITHM)
@@ -169,16 +177,18 @@ PERF_TEST_P(Size_MatType, subtract, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b, WARMUP_RNG).out(c);
+    double eps = 1e-8;
     if (CV_MAT_DEPTH(type) == CV_32S)
     {
         //see ticket 1529: subtract can be without saturation on 32S
         a /= 2;
         b /= 2;
+        eps = 1;
     }
     TEST_CYCLE() subtract(a, b, c);
-    SANITY_CHECK(c, 1e-8);
+    SANITY_CHECK(c, eps);
 }
 PERF_TEST_P(Size_MatType, subtractScalar, TYPICAL_MATS_CORE_ARITHM)
@@ -191,16 +201,18 @@ PERF_TEST_P(Size_MatType, subtractScalar, TYPICAL_MATS_CORE_ARITHM)
     declare.in(a, b,
WARMUP_RNG).out(c); + double eps = 1e-8; if (CV_MAT_DEPTH(type) == CV_32S) { //see ticket 1529: subtract can be without saturation on 32S a /= 2; b /= 2; + eps = 1; } TEST_CYCLE() subtract(a, b, c); - SANITY_CHECK(c, 1e-8); + SANITY_CHECK(c, eps); } PERF_TEST_P(Size_MatType, multiply, TYPICAL_MATS_CORE_ARITHM) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index dfc2f2d..0a2b158 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -960,41 +960,77 @@ struct cvtScaleAbs_SIMD } }; -#if CV_SSE2 +#if CV_SIMD128 + +static inline void v_load_expand_from_u8_f32(const uchar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + v_uint32x4 v_src0, v_src1; + v_expand(v_load_expand(src), v_src0, v_src1); + + a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); + b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); +} + +static inline void v_load_expand_from_s8_f32(const schar* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + v_int32x4 v_src0, v_src1; + v_expand(v_load_expand(src), v_src0, v_src1); + + a = v_shift + v_scale * v_cvt_f32(v_src0); + b = v_shift + v_scale * v_cvt_f32(v_src1); +} + +static inline void v_load_expand_from_u16_f32(const ushort* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + v_uint32x4 v_src0, v_src1; + v_expand(v_load(src), v_src0, v_src1); + + a = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src0)); + b = v_shift + v_scale * v_cvt_f32(v_reinterpret_as_s32(v_src1)); +} + +static inline void v_load_expand_from_s16_f32(const short* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + v_int32x4 v_src0, v_src1; + v_expand(v_load(src), v_src0, v_src1); + + a = v_shift + v_scale * v_cvt_f32(v_src0); + b = v_shift + v_scale * v_cvt_f32(v_src1); +} + +static inline void v_load_expand_from_s32_f32(const int* src, const v_float32x4 &v_scale, const v_float32x4 &v_shift, v_float32x4 &a, v_float32x4 &b) +{ + a = v_shift + v_scale * v_cvt_f32(v_load(src)); + b = v_shift + v_scale * v_cvt_f32(v_load(src + v_int32x4::nlanes)); +} template <> struct cvtScaleAbs_SIMD { int operator () (const uchar * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 16; x += 16) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); - __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift); - v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3); - __m128 v_dst4 = 
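(Illustrative aside, not part of the patch.) The new v_load_expand_from_{u8,s8,u16,s16,s32}_f32 helpers above widen eight narrow source values into two v_float32x4 registers and fold the scale/shift into the load. For the uchar case they behave like this scalar sketch (the function name is hypothetical):

    static void scale_shift_u8_scalar(const uchar* src, float scale, float shift,
                                      float a[4], float b[4])
    {
        for (int i = 0; i < 4; i++)
            a[i] = shift + scale * (float)src[i];      // lanes of the first register
        for (int i = 0; i < 4; i++)
            b[i] = shift + scale * (float)src[i + 4];  // lanes of the second register
    }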
_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift); - v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4); - - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), - _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4))); - _mm_storeu_si128((__m128i *)(dst + x), v_dst_i); + v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); + v_load_expand_from_u8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); + v_dst_0 = v_abs(v_dst_0); + v_dst_1 = v_abs(v_dst_1); + v_dst_2 = v_abs(v_dst_2); + v_dst_3 = v_abs(v_dst_3); + + v_int16x8 v_dsti_0 = v_pack(v_round(v_dst_0), v_round(v_dst_1)); + v_int16x8 v_dsti_1 = v_pack(v_round(v_dst_2), v_round(v_dst_3)); + v_store(dst + x, v_pack_u(v_dsti_0, v_dsti_1)); } } - return x; } }; @@ -1003,40 +1039,29 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const schar * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 16; x += 16) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth*2; x += cWidth*2) { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8), - v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( - _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( - _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); - __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( - _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); - v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3); - __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( - _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); - v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4); - - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), - _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4))); - _mm_storeu_si128((__m128i *)(dst + x), v_dst_i); + v_float32x4 v_dst_0, v_dst_1, v_dst_2, v_dst_3; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_dst_0, v_dst_1); + v_load_expand_from_s8_f32(src + x + cWidth, v_scale, v_shift, v_dst_2, v_dst_3); + v_dst_0 = v_abs(v_dst_0); + v_dst_1 = v_abs(v_dst_1); + v_dst_2 = v_abs(v_dst_2); + v_dst_3 = v_abs(v_dst_3); + + v_uint16x8 v_dsti_0 = v_pack_u(v_round(v_dst_0), v_round(v_dst_1)); + v_uint16x8 v_dsti_1 = v_pack_u(v_round(v_dst_2), v_round(v_dst_3)); + v_store(dst + x, v_pack(v_dsti_0, v_dsti_1)); } } - return x; } }; @@ -1045,29 +1070,25 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const ushort * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = 
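(Illustrative aside.) The cvtScaleAbs_SIMD kernels are the vectorized core used by cv::convertScaleAbs, i.e. dst(i) = saturate_cast<uchar>(|alpha*src(i) + beta|). A minimal usage sketch, with values chosen only to show the saturation:

    cv::Mat src = (cv::Mat_<short>(1, 4) << -300, -1, 0, 100);
    cv::Mat dst;
    cv::convertScaleAbs(src, dst, /*alpha=*/1.0, /*beta=*/0.0);
    // dst is CV_8U and holds {255, 1, 0, 100}; |-300| saturates to 255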
_mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 8; x += 8) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); - - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i); - _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); + v_float32x4 v_dst0, v_dst1; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); + v_dst0 = v_abs(v_dst0); + v_dst1 = v_abs(v_dst1); + + v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); + v_pack_u_store(dst + x, v_dst); } } - return x; } }; @@ -1076,29 +1097,25 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const short * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 8; x += 8) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); - - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i); - _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); + v_float32x4 v_dst0, v_dst1; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_dst0, v_dst1); + v_dst0 = v_abs(v_dst0); + v_dst1 = v_abs(v_dst1); + + v_int16x8 v_dst = v_pack(v_round(v_dst0), v_round(v_dst1)); + v_pack_u_store(dst + x, v_dst); } } - return x; } }; @@ -1107,25 +1124,22 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const int * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + const int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); + v_float32x4 v_dst_0 = v_cvt_f32(v_load(src + x)) * v_scale; + v_dst_0 = v_abs(v_dst_0 + v_shift); - for ( ; x <= width - 8; x += 4) - { - __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, 
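(Illustrative aside.) All of the packing intrinsics these kernels rely on saturate rather than truncate; a small self-contained check of that behavior (requires opencv2/core/hal/intrin.hpp, values are arbitrary):

    v_int32x4  a = v_setall_s32(70000), b = v_setall_s32(-70000);
    v_int16x8  p  = v_pack(a, b);    // signed saturation:   32767 and -32768
    v_uint16x8 pu = v_pack_u(a, b);  // unsigned saturation: 65535 and 0
    uchar out8[8];
    v_pack_u_store(out8, p);         // 16-bit -> 8-bit unsigned: 255 and 0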
v_dst1), v_dst1); + v_float32x4 v_dst_1 = v_cvt_f32(v_load(src + x + cWidth)) * v_scale; + v_dst_1 = v_abs(v_dst_1 + v_shift); - __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i); - _mm_storel_epi64((__m128i *)(dst + x), v_dst_i); - } + v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); + v_pack_u_store(dst + x, v_dst); } return x; @@ -1136,273 +1150,59 @@ template <> struct cvtScaleAbs_SIMD { int operator () (const float * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - - if (USE_SSE2) + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); + v_float32x4 v_dst_0 = v_load(src + x) * v_scale; + v_dst_0 = v_abs(v_dst_0 + v_shift); - for ( ; x <= width - 8; x += 4) - { - __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift); - v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst); + v_float32x4 v_dst_1 = v_load(src + x + cWidth) * v_scale; + v_dst_1 = v_abs(v_dst_1 + v_shift); - __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i)); - } + v_int16x8 v_dst = v_pack(v_round(v_dst_0), v_round(v_dst_1)); + v_pack_u_store(dst + x, v_dst); } - return x; } }; +#if CV_SIMD128_64F template <> struct cvtScaleAbs_SIMD { int operator () (const double * src, uchar * dst, int width, - float scale, float shift) const + float scale, float shift) const { int x = 0; - if (USE_SSE2) + if (hasSIMD128()) { - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), - v_zero_f = _mm_setzero_ps(); - __m128i v_zero_i = _mm_setzero_si128(); - - for ( ; x <= width - 8; x += 8) + v_float32x4 v_scale = v_setall_f32(scale); + v_float32x4 v_shift = v_setall_f32(shift); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) { - __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - - __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift); - v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); + v_float32x4 v_src1, v_src2, v_dummy; + v_recombine(v_cvt_f32(v_load(src + x)), v_cvt_f32(v_load(src + x + cWidth)), v_src1, v_dummy); + v_recombine(v_cvt_f32(v_load(src + x + cWidth * 2)), v_cvt_f32(v_load(src + x + cWidth * 3)), v_src2, v_dummy); - __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift); - v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); + v_float32x4 v_dst1 = v_abs((v_src1 * v_scale) + v_shift); + v_float32x4 v_dst2 = v_abs((v_src2 * v_scale) + v_shift); - __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), - _mm_cvtps_epi32(v_dst2)); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i)); + v_int16x8 v_dst_i = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_pack_u_store(dst + x, v_dst_i); } } return x; } }; - -#elif CV_NEON - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const uchar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width 
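(Illustrative aside.) On targets with CV_SIMD128_64F, v_cvt_f32 of a v_float64x2 fills only the two low lanes of the result, which is why the double input path uses v_recombine to gather four converted values into one v_float32x4 before scaling. One iteration of that kernel matches this scalar sketch (reusing the kernel's own src, dst, x, scale and shift):

    for (int i = 0; i < 8; i++)           // cWidth * 4 == 8 doubles per iteration
    {
        float t = (float)src[x + i];      // narrow to f32 first, as the vector code does
        dst[x + i] = cv::saturate_cast<uchar>(std::abs(t * scale + shift));
    }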
- 16; x += 16) - { - uint8x16_t v_src = vld1q_u8(src + x); - uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src)); - - uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half)); - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - - v_quat = vmovl_u16(vget_high_u16(v_half)); - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - - v_half = vmovl_u8(vget_high_u8(v_src)); - - v_quat = vmovl_u16(vget_low_u16(v_half)); - float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); - v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift)); - - v_quat = vmovl_u16(vget_high_u16(v_half)); - float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale); - v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift)); - - uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); - uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_3))); - - vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1))); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const schar * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width - 16; x += 16) - { - int8x16_t v_src = vld1q_s8(src + x); - int16x8_t v_half = vmovl_s8(vget_low_s8(v_src)); - - int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half)); - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - - v_quat = vmovl_s16(vget_high_s16(v_half)); - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - - v_half = vmovl_s8(vget_high_s8(v_src)); - - v_quat = vmovl_s16(vget_low_s16(v_half)); - float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); - v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift)); - - v_quat = vmovl_s16(vget_high_s16(v_half)); - float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale); - v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift)); - - uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); - uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_3))); - - vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1))); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src = vld1q_u16(src + x); - - uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src)); - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - - v_half = vmovl_u16(vget_high_u16(v_src)); - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); - - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const short * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = 
vdupq_n_f32(shift); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vld1q_s16(src + x); - - int32x4_t v_half = vmovl_s16(vget_low_s16(v_src)); - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - - v_half = vmovl_s16(vget_high_s16(v_src)); - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst_1))); - - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const int * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)); - - float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)); - - uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - - return x; - } -}; - -template <> -struct cvtScaleAbs_SIMD -{ - int operator () (const float * src, uchar * dst, int width, - float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift); - - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale); - v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); - uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)); - - float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale); - v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); - uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)); - - uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - - return x; - } -}; +#endif // CV_SIMD128_64F #endif @@ -1445,7 +1245,7 @@ struct cvtScale_SIMD } }; -#if CV_SSE2 +#if CV_SIMD128 // from uchar @@ -1455,27 +1255,19 @@ struct cvtScale_SIMD int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -1486,81 +1278,69 @@ struct cvtScale_SIMD int operator () (const 
uchar * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_u8u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const uchar * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -1571,26 +1351,19 @@ struct cvtScale_SIMD int operator () (const uchar * src, int * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for 
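(Illustrative aside.) The cvtScale_SIMD specializations are the inner loops behind cv::Mat::convertTo when a scale and offset are supplied, i.e. dst(i) = saturate_cast<DT>(alpha*src(i) + beta). A minimal usage sketch with illustrative values:

    cv::Mat src(1, 8, CV_8UC1, cv::Scalar(200));
    cv::Mat dst;
    src.convertTo(dst, CV_16S, /*alpha=*/2.0, /*beta=*/-100.0);
    // every element becomes saturate_cast<short>(200*2.0 - 100.0) == 300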
( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_store(dst + x, v_round(v_src1)); + v_store(dst + x + cWidth, v_round(v_src2)); + } } - return x; } }; @@ -1601,60 +1374,19 @@ struct cvtScale_SIMD int operator () (const uchar * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const uchar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); - - __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); - v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x + 4, v_dst_0); - _mm_storeu_pd(dst + x + 6, v_dst_1); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; @@ -1667,27 +1399,19 @@ struct cvtScale_SIMD int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = 
_mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -1698,81 +1422,69 @@ struct cvtScale_SIMD int operator () (const schar * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s8u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const schar * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src 
+ x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -1783,26 +1495,19 @@ struct cvtScale_SIMD int operator () (const schar * src, int * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_store(dst + x, v_round(v_src1)); + v_store(dst + x + cWidth, v_round(v_src2)); + } } - return x; } }; @@ -1813,61 +1518,19 @@ struct cvtScale_SIMD int operator () (const schar * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))); - v_src = _mm_srai_epi16(v_src, 8); - - __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); - __m128d 
v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s8_f32(src + x, v_scale, v_shift, v_src1, v_src2); - v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); - v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x + 4, v_dst_0); - _mm_storeu_pd(dst + x + 6, v_dst_1); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; @@ -1880,27 +1543,19 @@ struct cvtScale_SIMD int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -1911,81 +1566,69 @@ struct cvtScale_SIMD int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - 
int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_u16u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const ushort * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -1996,26 +1639,19 @@ struct cvtScale_SIMD int operator () (const ushort * src, int * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_store(dst + x, v_round(v_src1)); + v_store(dst + x + cWidth, v_round(v_src2)); + } } - return x; } }; @@ -2026,60 +1662,19 @@ struct cvtScale_SIMD int operator () (const ushort * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = 
_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - - __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_u16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); - v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x + 4, v_dst_0); - _mm_storeu_pd(dst + x + 6, v_dst_1); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; @@ -2092,27 +1687,19 @@ struct cvtScale_SIMD int operator () (const short * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -2123,111 +1710,69 @@ struct cvtScale_SIMD int operator () (const short * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 
16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const short * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s16u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } - - bool haveSSE; -}; - #endif - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const short * src, int * dst, int width, float scale, float shift) const + int operator () (const short * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, 
v_src2); - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -2238,60 +1783,19 @@ struct cvtScale_SIMD int operator () (const short * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); - - _mm_storeu_ps(dst + x, v_dst_0); - _mm_storeu_ps(dst + x + 4, v_dst_1); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, double * dst, int width, double scale, double shift) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - - __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s16_f32(src + x, v_scale, v_shift, v_src1, v_src2); - v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); - v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); - v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); - _mm_storeu_pd(dst + x + 4, v_dst_0); - _mm_storeu_pd(dst + x + 6, v_dst_1); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; @@ -2304,26 +1808,19 @@ struct cvtScale_SIMD int operator () (const int * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i 
*)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_pack_u_store(dst + x, v_dst); + } } - return x; } }; @@ -2334,108 +1831,92 @@ struct cvtScale_SIMD int operator () (const int * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const int * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_s32u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const int * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); - - v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_src1, v_src2; + v_load_expand_from_s32_f32(src + x, v_scale, v_shift, v_src1, v_src2); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_src1), v_round(v_src2)); + v_store(dst + x, v_dst); + } } - return x; } }; +#if CV_SIMD128_64F template <> struct cvtScale_SIMD { int operator () (const int * src, int * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = 
_mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - v_src = _mm_srli_si128(v_src, 8); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1))); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; + v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); + v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); + v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); + } } - return x; } }; @@ -2446,84 +1927,44 @@ struct cvtScale_SIMD int operator () (const int * src, float * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - v_src = _mm_srli_si128(v_src, 8); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0), - _mm_cvtpd_ps(v_dst_1))); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + double v_srcbuf[] = { (double)src[x], (double)src[x+1], (double)src[x+2], (double)src[x+3] }; + v_float64x2 v_src1 = v_shift + v_scale * v_load(v_srcbuf); + v_float64x2 v_src2 = v_shift + v_scale * v_load(v_srcbuf + 2); + v_store(dst + x, v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2))); + } } - return x; } }; +#endif //CV_SIMD128_64F + +// from float template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const int * src, double * dst, int width, double scale, double shift) const + int operator () (const float * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); - - v_src = _mm_srli_si128(v_src, 8); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); + v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_pack_u_store(dst + x, v_dst); + } } - - return x; - } -}; - -// from float - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, uchar * dst, int width, float scale, float shift) const - { - 
int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); - } - return x; } }; @@ -2534,78 +1975,69 @@ struct cvtScale_SIMD int operator () (const float * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); + v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const float * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_f32u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; - } +#endif + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); + v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - bool haveSSE; + v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); + v_store(dst + x, v_dst); + } + } + return x; + } }; -#endif - template <> struct cvtScale_SIMD { int operator () (const float * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_dst1 = v_shift + v_scale * v_load(src + x); + v_float32x4 v_dst2 = v_shift + v_scale * v_load(src + x + cWidth); - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i 
*)(dst + x), v_dst); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -2616,24 +2048,13 @@ struct cvtScale_SIMD int operator () (const float * src, int * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_loadu_ps(src + x + 4); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); - _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_round(v_load(src + x) * v_scale + v_shift)); } - return x; } }; @@ -2644,49 +2065,55 @@ struct cvtScale_SIMD int operator () (const float * src, float * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128 v_src = _mm_loadu_ps(src + x); - __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - _mm_storeu_ps(dst + x, v_dst); + v_float32x4 v_shift = v_setall_f32(shift), v_scale = v_setall_f32(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_load(src + x) * v_scale + v_shift); } - return x; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, double * dst, int width, double scale, double shift) const - { - int x = 0; +#if CV_SIMD128_64F - if (!USE_SSE2) - return x; +static inline void v_load_scale_shift(const double* src, const v_float64x2& v_scale, const v_float64x2 &v_shift, v_float32x4& v_dst1, v_float32x4 &v_dst2) +{ + int cWidth = v_float64x2::nlanes; + v_float64x2 v_src1 = v_shift + v_scale * v_load(src); + v_float64x2 v_src2 = v_shift + v_scale * v_load(src + cWidth); + v_float64x2 v_src3 = v_shift + v_scale * v_load(src + cWidth * 2); + v_float64x2 v_src4 = v_shift + v_scale * v_load(src + cWidth * 3); + v_dst1 = v_combine_low(v_cvt_f32(v_src1), v_cvt_f32(v_src2)); + v_dst2 = v_combine_low(v_cvt_f32(v_src3), v_cvt_f32(v_src4)); +} - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); +static inline void v_store_scale_shift_s32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_int32x4 &v1, const v_int32x4 &v2) +{ + v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); + v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); + v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); + v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - for ( ; x <= width - 4; x += 4) - { - __m128 v_src = _mm_loadu_ps(src + x); - __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); - v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); - __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); + v_store(dst, v_dst1); + v_store(dst + v_float64x2::nlanes, v_dst2); + v_store(dst + v_float64x2::nlanes * 2, v_dst3); + v_store(dst + v_float64x2::nlanes * 3, v_dst4); +} - _mm_storeu_pd(dst + x, v_dst_0); - _mm_storeu_pd(dst + x + 2, v_dst_1); - } 
+static inline void v_store_scale_shift_f32_to_f64(double *dst, const v_float64x2 &v_scale, const v_float64x2 &v_shift, const v_float32x4 &v1, const v_float32x4 &v2) +{ + v_float64x2 v_dst1 = v_shift + v_scale * v_cvt_f64(v1); + v_float64x2 v_dst2 = v_shift + v_scale * v_cvt_f64_high(v1); + v_float64x2 v_dst3 = v_shift + v_scale * v_cvt_f64(v2); + v_float64x2 v_dst4 = v_shift + v_scale * v_cvt_f64_high(v2); - return x; - } -}; + v_store(dst, v_dst1); + v_store(dst + v_float64x2::nlanes, v_dst2); + v_store(dst + v_float64x2::nlanes * 2, v_dst3); + v_store(dst + v_float64x2::nlanes * 3, v_dst4); +} // from double @@ -2696,28 +2123,17 @@ struct cvtScale_SIMD int operator () (const double * src, uchar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_dst1, v_dst2; + v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); + v_pack_u_store(dst + x, v_pack(v_round(v_dst1), v_round(v_dst2))); + } } - return x; } }; @@ -2728,82 +2144,66 @@ struct cvtScale_SIMD int operator () (const double * src, schar * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_dst1, v_dst2; + v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_pack_store(dst + x, v_dst); + } } - return x; } }; -#if CV_TRY_SSE4_1 - template <> struct cvtScale_SIMD { - cvtScale_SIMD() - { - haveSSE = CV_CPU_HAS_SUPPORT_SSE4_1; - } - int operator () (const double * src, ushort * dst, int width, float scale, float shift) const { - if (haveSSE) + int x = 0; +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::cvtScale_SIMD_f64u16f32_SSE41(src, dst, width, scale, shift); - else - return 0; +#endif + if 
(hasSIMD128()) + { + v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_dst1, v_dst2; + v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); + v_uint16x8 v_dst = v_pack_u(v_round(v_dst1), v_round(v_dst2)); + v_store(dst + x, v_dst); + } + } + return x; } - - bool haveSSE; }; -#endif - template <> struct cvtScale_SIMD { int operator () (const double * src, short * dst, int width, float scale, float shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); - __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), - _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); - __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), - _mm_cvtps_epi32(v_dst_1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + v_float64x2 v_shift = v_setall_f64((double)shift), v_scale = v_setall_f64((double)scale); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_float32x4 v_dst1, v_dst2; + v_load_scale_shift(src + x, v_scale, v_shift, v_dst1, v_dst2); + v_int16x8 v_dst = v_pack(v_round(v_dst1), v_round(v_dst2)); + v_store(dst + x, v_dst); + } } - return x; } }; @@ -2814,26 +2214,18 @@ struct cvtScale_SIMD int operator () (const double * src, int * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128d v_src = _mm_loadu_pd(src + x); - __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - - v_src = _mm_loadu_pd(src + x + 2); - __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - - __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)), - _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1))); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); + v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); - _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); + v_store(dst + x, v_combine_low(v_round(v_src1), v_round(v_src2))); + } } - return x; } }; @@ -2844,1597 +2236,1182 @@ struct cvtScale_SIMD int operator () (const double * src, float * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128d v_src = _mm_loadu_pd(src + x); - __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - - v_src = _mm_loadu_pd(src + x + 2); - __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - - __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0), - _mm_cvtpd_ps(v_dst1)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float64x2 v_src1 = v_shift + v_scale * v_load(src + 
x); + v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); + v_float32x4 v_dst1 = v_cvt_f32(v_src1); + v_float32x4 v_dst2 = v_cvt_f32(v_src2); - _mm_storeu_ps(dst + x, v_dst); + v_store(dst + x, v_combine_low(v_dst1, v_dst2)); + } } - return x; } }; +// to double + template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const double * src, double * dst, int width, double scale, double shift) const + int operator () (const uchar * src, double * dst, int width, double scale, double shift) const { int x = 0; - - if (!USE_SSE2) - return x; - - __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); - - for ( ; x <= width - 2; x += 2) + if (hasSIMD128()) { - __m128d v_src = _mm_loadu_pd(src + x); - __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); - _mm_storeu_pd(dst + x, v_dst); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_uint32x4 v_src1, v_src2; + v_expand(v_load_expand(src + x), v_src1, v_src2); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift + , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); + } } - return x; } }; -#elif CV_NEON - -// from uchar - template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const + int operator () (const schar * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1, v_src2; + v_expand(v_load_expand(src + x), v_src1, v_src2); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const + int operator () (const ushort * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_uint32x4 v_src1, v_src2; + v_expand(v_load(src + x), v_src1, v_src2); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, 
v_shift + , v_reinterpret_as_s32(v_src1), v_reinterpret_as_s32(v_src2)); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const + int operator () (const short * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1, v_src2; + v_expand(v_load(src + x), v_src1, v_src2); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, short * dst, int width, float scale, float shift) const + int operator () (const int * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_load(src + x); + v_int32x4 v_src2 = v_load(src + x + cWidth); + v_store_scale_shift_s32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, int * dst, int width, float scale, float shift) const + int operator () (const float * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src1 = v_load(src + x); + v_float32x4 v_src2 = v_load(src + x + cWidth); + v_store_scale_shift_f32_to_f64(dst + x, v_scale, v_shift, v_src1, v_src2); + } } - return x; } }; template <> -struct 
cvtScale_SIMD +struct cvtScale_SIMD { - int operator () (const uchar * src, float * dst, int width, float scale, float shift) const + int operator () (const double * src, double * dst, int width, double scale, double shift) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); - vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); + v_float64x2 v_shift = v_setall_f64(shift), v_scale = v_setall_f64(scale); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float64x2 v_src1 = v_shift + v_scale * v_load(src + x); + v_float64x2 v_src2 = v_shift + v_scale * v_load(src + x + cWidth); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; +#endif +#endif -// from schar - -template <> -struct cvtScale_SIMD +template static void +cvtScale_( const T* src, size_t sstep, + DT* dst, size_t dstep, Size size, + WT scale, WT shift ) { - int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + cvtScale_SIMD vop; + + for( ; size.height--; src += sstep, dst += dstep ) { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + int x = vop(src, dst, size.width, scale, shift); - for ( ; x <= width - 8; x += 8) + #if CV_ENABLE_UNROLLED + for( ; x <= size.width - 4; x += 4 ) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + DT t0, t1; + t0 = saturate_cast
<DT>(src[x]*scale + shift); + t1 = saturate_cast
<DT>(src[x+1]*scale + shift); + dst[x] = t0; dst[x+1] = t1; + t0 = saturate_cast
<DT>(src[x+2]*scale + shift); + t1 = saturate_cast
<DT>(src[x+3]*scale + shift); + dst[x+2] = t0; dst[x+3] = t1; + } + #endif - return x; + for( ; x < size.width; x++ ) + dst[x] = saturate_cast<DT>
(src[x]*scale + shift); } -}; +} -template <> -struct cvtScale_SIMD +template<> void +cvtScale_( const short* src, size_t sstep, + int* dst, size_t dstep, Size size, + float scale, float shift ) { - int operator () (const schar * src, schar * dst, int width, float scale, float shift) const + sstep /= sizeof(src[0]); + dstep /= sizeof(dst[0]); + + for( ; size.height--; src += sstep, dst += dstep ) { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + #if CV_TRY_AVX2 + if (CV_CPU_HAS_SUPPORT_AVX2) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); + opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width); + continue; } + #endif + #if CV_SIMD128 + if (hasSIMD128()) + { + v_float32x4 v_shift = v_setall_f32(shift); + v_float32x4 v_scale = v_setall_f32(scale); + int cWidth = v_int32x4::nlanes; + for (; x <= size.width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_float32x4 v_tmp1 = v_cvt_f32(v_src1); + v_float32x4 v_tmp2 = v_cvt_f32(v_src2); - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, short * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } + v_tmp1 = v_tmp1 * v_scale + v_shift; + v_tmp2 = v_tmp2 * v_scale + v_shift; - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - 
vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); - } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const schar * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); - vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); - } - - return x; - } -}; - -// from ushort - -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + v_store(dst + x, v_round(v_tmp1)); + v_store(dst + x + cWidth, v_round(v_tmp2)); + } } + #endif - return x; + for(; x < size.width; x++ ) + dst[x] = saturate_cast(src[x]*scale + shift); } -}; +} -template <> -struct cvtScale_SIMD +template +struct Cvt_SIMD { - int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const + int operator() (const T *, DT *, int) const { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - - return x; + return 0; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - - return x; - } -}; +#if CV_SIMD128 +// from uchar template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const ushort * src, short * dst, int width, float scale, float shift) const + int operator() (const uchar * src, schar * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = 
vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x)); + v_store_low(dst + x, v_pack(v_src, v_src)); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const ushort * src, int * dst, int width, float scale, float shift) const + int operator() (const uchar * src, ushort * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - - vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_load_expand(src + x)); } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const ushort * src, float * dst, int width, float scale, float shift) const + int operator() (const uchar * src, short * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); - vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_int16x8 v_src = v_reinterpret_as_s16(v_load_expand(src + x)); + v_store(dst + x, v_src); + } } - return x; } }; -// from short - template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const short * src, uchar * dst, int width, float scale, float shift) const + int operator() (const uchar * src, int * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src = v_load_expand(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_reinterpret_as_s32(v_src1)); + v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const short * src, schar * dst, int width, float scale, float shift) const + int operator() (const uchar * src, float * 
dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src = v_load_expand(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); + } } - return x; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const short * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - - return x; - } -}; +// from schar template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const short * src, float * dst, int width, float scale, float shift) const + int operator() (const schar * src, uchar * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); - vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_pack_u_store(dst + x, v_load_expand(src + x)); } return x; } }; -// from int - template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const int * src, uchar * dst, int width, float scale, float shift) const + int operator() (const schar * src, short * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_load_expand(src + x)); } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const int * src, schar * dst, int width, float scale, float shift) const + int operator() (const schar * src, ushort * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), 
v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_int16x8 v_src = v_load_expand(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_pack_u(v_src1, v_src2)); + } } - return x; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const int * src, ushort * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - - return x; - } -}; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const int * src, short * dst, int width, float scale, float shift) const + int operator() (const schar * src, int * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load_expand(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_src1); + v_store(dst + x + cWidth, v_src2); + } } - return x; } }; -// from float - template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const float * src, uchar * dst, int width, float scale, float shift) const + int operator() (const schar * src, float * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load_expand(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f32(v_src1)); + v_store(dst + x + cWidth, v_cvt_f32(v_src2)); + } } - return x; } }; -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, schar * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x 
<= width - 8; x += 8) - { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - - return x; - } -}; +// from ushort template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const float * src, ushort * dst, int width, float scale, float shift) const + int operator() (const ushort * src, uchar * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack(v_src1, v_src2)); + } } - return x; } }; template <> -struct cvtScale_SIMD +struct Cvt_SIMD { - int operator () (const float * src, short * dst, int width, float scale, float shift) const + int operator() (const ushort * src, schar * dst, int width) const { int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); - float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_uint32x4 v_dst10, v_dst11, v_dst20, v_dst21; + v_expand(v_src1, v_dst10, v_dst11); + v_expand(v_src2, v_dst20, v_dst21); + + v_store(dst + x, v_pack( + v_pack(v_reinterpret_as_s32(v_dst10), v_reinterpret_as_s32(v_dst11)), + v_pack(v_reinterpret_as_s32(v_dst20), v_reinterpret_as_s32(v_dst21)))); + } } - - return x; - } -}; - -template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, int * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 4; x += 4) - vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift))); - return x; } }; template <> -struct cvtScale_SIMD -{ - int operator () (const float * src, float * dst, int width, float scale, float shift) const - { - int x = 0; - float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); - - for ( ; x <= width - 4; x += 4) - vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)); - - return x; - } -}; - -#endif - -template static void -cvtScale_( const T* src, size_t sstep, - DT* dst, size_t dstep, Size size, - WT scale, WT shift ) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - cvtScale_SIMD vop; - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = vop(src, dst, size.width, scale, shift); - - #if CV_ENABLE_UNROLLED - 
for( ; x <= size.width - 4; x += 4 ) - { - DT t0, t1; - t0 = saturate_cast
<DT>(src[x]*scale + shift); - t1 = saturate_cast
<DT>(src[x+1]*scale + shift); - dst[x] = t0; dst[x+1] = t1; - t0 = saturate_cast
<DT>(src[x+2]*scale + shift); - t1 = saturate_cast
<DT>(src[x+3]*scale + shift); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - - for( ; x < size.width; x++ ) - dst[x] = saturate_cast<DT>
(src[x]*scale + shift); - } -} - -//vz optimized template specialization -template<> void -cvtScale_( const short* src, size_t sstep, - short* dst, size_t dstep, Size size, - float scale, float shift ) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - #if CV_SSE2 - if(USE_SSE2) - { - __m128 scale128 = _mm_set1_ps (scale); - __m128 shift128 = _mm_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) - { - __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); - __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); - __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); - __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); - rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); - rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); - r0 = _mm_cvtps_epi32(rf0); - r1 = _mm_cvtps_epi32(rf1); - r0 = _mm_packs_epi32(r0, r1); - _mm_storeu_si128((__m128i*)(dst + x), r0); - } - } - #elif CV_NEON - float32x4_t v_shift = vdupq_n_f32(shift); - for(; x <= size.width - 8; x += 8 ) - { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))); - float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))); - - v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift); - v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)), - vqmovn_s32(cv_vrndq_s32_f32(v_tmp2)))); - } - #endif - - for(; x < size.width; x++ ) - dst[x] = saturate_cast(src[x]*scale + shift); - } -} - -template<> void -cvtScale_( const short* src, size_t sstep, - int* dst, size_t dstep, Size size, - float scale, float shift ) +struct Cvt_SIMD { - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) + int operator() (const ushort * src, short * dst, int width) const { int x = 0; - - #if CV_TRY_AVX2 - if (CV_CPU_HAS_SUPPORT_AVX2) - { - opt_AVX2::cvtScale_s16s32f32Line_AVX2(src, dst, scale, shift, size.width); - continue; - } - #endif - #if CV_SSE2 - if (USE_SSE2)//~5X + if (hasSIMD128()) { - __m128 scale128 = _mm_set1_ps (scale); - __m128 shift128 = _mm_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) + int cWidth = v_uint16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) { - __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x)); - - __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); - __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16)); - rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); - rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); - - _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0)); - _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1)); + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_dst0, v_dst1; + v_expand(v_src, v_dst0, v_dst1); + v_store(dst + x, v_pack(v_reinterpret_as_s32(v_dst0), v_reinterpret_as_s32(v_dst1))); } } - #elif CV_NEON - float32x4_t v_shift = vdupq_n_f32(shift); - for(; x <= size.width - 8; x += 8 ) - { - int16x8_t v_src = vld1q_s16(src + x); - float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))); - float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))); - - v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift); - v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift); - - vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1)); - vst1q_s32(dst + x + 4, 
cv_vrndq_s32_f32(v_tmp2)); - } - #endif - - for(; x < size.width; x++ ) - dst[x] = saturate_cast(src[x]*scale + shift); - } -} - -template -struct Cvt_SIMD -{ - int operator() (const T *, DT *, int) const - { - return 0; - } -}; - -#if CV_SSE2 - -// from double - -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, uchar * dst, int width) const - { - int x = 0; - - if (!USE_SSE2) - return x; - - for ( ; x <= width - 8; x += 8) - { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst)); - } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const double * src, schar * dst, int width) const + int operator() (const ushort * src, int * dst, int width) const { int x = 0; - - if (!USE_SSE2) - return x; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst)); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_reinterpret_as_s32(v_src1)); + v_store(dst + x + cWidth, v_reinterpret_as_s32(v_src2)); + } } - return x; } }; -#if CV_TRY_SSE4_1 - -template <> -struct Cvt_SIMD -{ - bool haveSIMD; - Cvt_SIMD() { haveSIMD = CV_CPU_HAS_SUPPORT_SSE4_1; } - - int operator() (const double * src, ushort * dst, int width) const - { - if (haveSIMD) - return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width); - else - return 0; - } -}; - -#endif // CV_SSE4_1 - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const double * src, short * dst, int width) const + int operator() (const ushort * src, float * dst, int width) const { int x = 0; - - if (!USE_SSE2) - return x; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); - __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); - - v_src0 = _mm_movelh_ps(v_src0, v_src1); - v_src1 = _mm_movelh_ps(v_src2, v_src3); - - __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - _mm_storeu_si128((__m128i *)(dst + x), v_dst); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f32(v_reinterpret_as_s32(v_src1))); + v_store(dst + x + cWidth, v_cvt_f32(v_reinterpret_as_s32(v_src2))); + } } - return x; } }; -template <> -struct Cvt_SIMD -{ - int operator() (const double * src, int * dst, int width) 
const - { - int x = 0; - - if (!USE_SSE2) - return x; - for ( ; x <= width - 4; x += 4) - { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - v_src0 = _mm_movelh_ps(v_src0, v_src1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0)); - } - - return x; - } -}; +// from short template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const double * src, float * dst, int width) const + int operator() (const short * src, uchar * dst, int width) const { int x = 0; - - if (!USE_SSE2) - return x; - - for ( ; x <= width - 4; x += 4) + if (hasSIMD128()) { - __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); - __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); - - _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1)); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack_u(v_src1, v_src2)); + } } - - return x; - } -}; - - -#elif CV_NEON - -// from uchar - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, schar * dst, int width) const - { - int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))))); - - return x; - } -}; - - -template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, ushort * dst, int width) const - { - int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x))); - return x; } }; template <> -struct Cvt_SIMD -{ - int operator() (const uchar * src, short * dst, int width) const - { - int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))); - - return x; - } -}; - -template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const uchar * src, int * dst, int width) const + int operator() (const short * src, schar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)))); - vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)))); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack(v_src1, v_src2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const uchar * src, float * dst, int width) const + int operator() (const short * src, ushort * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); - vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src)))); - vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src)))); + int cWidth = v_int16x8::nlanes; + for (; x <= width - cWidth; x += cWidth) + { + v_int16x8 v_src = v_load(src + x); + v_int32x4 v_dst1, v_dst2; + v_expand(v_src, v_dst1, v_dst2); + v_store(dst + x, v_pack_u(v_dst1, v_dst2)); + } } - return x; } }; -// from schar - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, uchar * dst, int width) const + int operator() (const short * src, int * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x)))); - + if (hasSIMD128()) + { + int cWidth = 
v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load(src + x); + v_int32x4 v_dst1, v_dst2; + v_expand(v_src, v_dst1, v_dst2); + v_store(dst + x, v_dst1); + v_store(dst + x + cWidth, v_dst2); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, short * dst, int width) const + int operator() (const short * src, float * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) - vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x))); - + if (hasSIMD128()) + { + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int16x8 v_src = v_load(src + x); + v_int32x4 v_dst1, v_dst2; + v_expand(v_src, v_dst1, v_dst2); + v_store(dst + x, v_cvt_f32(v_dst1)); + v_store(dst + x + cWidth, v_cvt_f32(v_dst2)); + } + } return x; } }; +// from int + template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, ushort * dst, int width) const + int operator() (const int * src, uchar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))), - vqmovun_s32(vmovl_s16(vget_high_s16(v_src))))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); + v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2); + v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4); + v_store(dst + x, v_pack(v_dst1, v_dst2)); + } } - return x; } }; - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, int * dst, int width) const + int operator() (const int * src, schar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src))); - vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); + v_int16x8 v_dst1 = v_pack(v_src1, v_src2); + v_int16x8 v_dst2 = v_pack(v_src3, v_src4); + v_store(dst + x, v_pack(v_dst1, v_dst2)); + } } - return x; } }; + template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const schar * src, float * dst, int width) const + int operator() (const int * src, ushort * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); - vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)))); - vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack_u(v_src1, v_src2)); + } } - return x; } }; -// from ushort - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, uchar * dst, int width) const + int operator() (const int * src, short * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = 
vld1q_u16(src + x + 8); - vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); + v_store(dst + x, v_pack(v_src1, v_src2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, schar * dst, int width) const + int operator() (const int * src, float * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8); - int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1))); - int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1))); - int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2))); - int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2))); - - vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))), - vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21))))); + int cWidth = v_int32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_cvt_f32(v_load(src + x))); } - return x; } }; +// from float + template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, short * dst, int width) const + int operator() (const float * src, uchar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))); - int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1 = v_round(v_load(src + x)); + v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); + v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2)); + v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); + v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2); + v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4); + v_store(dst + x, v_pack(v_dst1, v_dst2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, int * dst, int width) const + int operator() (const float * src, schar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)))); - vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int32x4 v_src1 = v_round(v_load(src + x)); + v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); + v_int32x4 v_src3 = v_round(v_load(src + x + cWidth * 2)); + v_int32x4 v_src4 = v_round(v_load(src + x + cWidth * 3)); + v_int16x8 v_dst1 = v_pack(v_src1, v_src2); + v_int16x8 v_dst2 = v_pack(v_src3, v_src4); + v_store(dst + x, v_pack(v_dst1, v_dst2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const ushort * src, float * dst, int width) const + int operator() (const float * src, ushort * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint16x8_t v_src = vld1q_u16(src + x); - vst1q_f32(dst + x, 
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src)))); - vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src)))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_round(v_load(src + x)); + v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); + v_store(dst + x, v_pack_u(v_src1, v_src2)); + } } - return x; } }; -// from short - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, uchar * dst, int width) const + int operator() (const float * src, short * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8); - vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src1 = v_round(v_load(src + x)); + v_int32x4 v_src2 = v_round(v_load(src + x + cWidth)); + v_store(dst + x, v_pack(v_src1, v_src2)); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, schar * dst, int width) const + int operator() (const float * src, int * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8); - vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2))); + int cWidth = v_float32x4::nlanes; + for (; x <= width - cWidth; x += cWidth) + v_store(dst + x, v_round(v_load(src + x))); } - return x; } }; +#if CV_SIMD128_64F +// from double template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, ushort * dst, int width) const + int operator() (const double * src, uchar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src))); - uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src))); - vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); + v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + + v_src0 = v_combine_low(v_src0, v_src1); + v_src1 = v_combine_low(v_src2, v_src3); + v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); + v_pack_u_store(dst + x, v_dst); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, int * dst, int width) const + int operator() (const double * src, schar * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src))); - vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src))); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); + v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + + v_src0 = v_combine_low(v_src0, v_src1); + v_src1 = v_combine_low(v_src2, v_src3); + v_int16x8 v_dst = v_pack(v_round(v_src0), 
v_round(v_src1)); + v_store_low(dst + x, v_pack(v_dst, v_dst)); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const short * src, float * dst, int width) const + int operator() (const double * src, ushort * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) +#if CV_TRY_SSE4_1 + if (CV_CPU_HAS_SUPPORT_SSE4_1) + return opt_SSE4_1::Cvt_SIMD_f64u16_SSE41(src, dst, width); +#endif + if (hasSIMD128()) { - int16x8_t v_src = vld1q_s16(src + x); - vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)))); - vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)))); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); + v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + v_src0 = v_combine_low(v_src0, v_src1); + v_src1 = v_combine_low(v_src2, v_src3); + + v_uint16x8 v_dst = v_pack_u(v_round(v_src0), v_round(v_src1)); + v_store(dst + x, v_dst); + } + } return x; } }; -// from int - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, uchar * dst, int width) const + int operator() (const double * src, short * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); - int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12); - uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2))); - uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4))); - vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_float32x4 v_src2 = v_cvt_f32(v_load(src + x + cWidth * 2)); + v_float32x4 v_src3 = v_cvt_f32(v_load(src + x + cWidth * 3)); + v_src0 = v_combine_low(v_src0, v_src1); + v_src1 = v_combine_low(v_src2, v_src3); + + v_int16x8 v_dst = v_pack(v_round(v_src0), v_round(v_src1)); + v_store(dst + x, v_dst); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, schar * dst, int width) const + int operator() (const double * src, int * dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); - int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12); - int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); - int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4))); - vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_store(dst + x, v_round(v_combine_low(v_src0, v_src1))); + } + } return x; } }; - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, ushort * dst, int width) const + int operator() (const double * src, float * dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int32x4_t v_src1 = 
vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); - vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2))); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src0 = v_cvt_f32(v_load(src + x)); + v_float32x4 v_src1 = v_cvt_f32(v_load(src + x + cWidth)); + v_store(dst + x, v_combine_low(v_src0, v_src1)); + } + } return x; } }; +// to double + template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, short * dst, int width) const + int operator() (const uchar* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4); - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_uint16x8 v_src = v_load_expand(src + x); + v_uint32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src1))); + v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src1))); + v_store(dst + x + cWidth * 2, v_cvt_f64(v_reinterpret_as_s32(v_src2))); + v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_reinterpret_as_s32(v_src2))); + } } - return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const int * src, float * dst, int width) const + int operator() (const schar* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 4; x += 4) - vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x))); - + if (hasSIMD128()) + { + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 4; x += cWidth * 4) + { + v_int16x8 v_src = v_load_expand(src + x); + v_int32x4 v_src1, v_src2; + v_expand(v_src, v_src1, v_src2); + v_store(dst + x, v_cvt_f64(v_src1)); + v_store(dst + x + cWidth, v_cvt_f64_high(v_src1)); + v_store(dst + x + cWidth * 2, v_cvt_f64(v_src2)); + v_store(dst + x + cWidth * 3, v_cvt_f64_high(v_src2)); + } + } return x; } }; -// from float - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const float * src, uchar * dst, int width) const + int operator() (const ushort* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x)); - uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4)); - uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8)); - uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12)); - uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2))); - uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4))); - vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_uint32x4 v_src = v_load_expand(src + x); + v_store(dst + x, v_cvt_f64(v_reinterpret_as_s32(v_src))); + v_store(dst + x + cWidth, v_cvt_f64_high(v_reinterpret_as_s32(v_src))); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const float * src, schar * dst, int width) const + int operator() (const short* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 16; x += 16) + if (hasSIMD128()) { - int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x)); - int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4)); - int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 
8)); - int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12)); - int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2))); - int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4))); - vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2)); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src = v_load_expand(src + x); + v_store(dst + x, v_cvt_f64(v_src)); + v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); + } + } return x; } }; - template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const float * src, ushort * dst, int width) const + int operator() (const int* src, double* dst, int width) const { int x = 0; - - for ( ; x <= width - 8; x += 8) + if (hasSIMD128()) { - uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x)); - uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4)); - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2))); - } + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_int32x4 v_src = v_load(src + x); + v_store(dst + x, v_cvt_f64(v_src)); + v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); + } + } return x; } }; template <> -struct Cvt_SIMD +struct Cvt_SIMD { - int operator() (const float * src, int * dst, int width) const + int operator() (const float* src, double* dst, int width) const { int x = 0; + if (hasSIMD128()) + { + int cWidth = v_float64x2::nlanes; + for (; x <= width - cWidth * 2; x += cWidth * 2) + { + v_float32x4 v_src = v_load(src + x); - for ( ; x <= width - 4; x += 4) - vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x))); - + v_store(dst + x, v_cvt_f64(v_src)); + v_store(dst + x + cWidth, v_cvt_f64_high(v_src)); + } + } return x; } }; - -#endif +#endif // CV_SIMD128_64F +#endif // CV_SIMD128 // template for FP16 HW conversion function template static void @@ -4618,48 +3595,6 @@ cvt_( const T* src, size_t sstep, } } -//vz optimized template specialization, test Core_ConvertScale/ElemWiseTest -template<> void -cvt_( const float* src, size_t sstep, - short* dst, size_t dstep, Size size ) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - #if CV_SSE2 - if(USE_SSE2) - { - for( ; x <= size.width - 8; x += 8 ) - { - __m128 src128 = _mm_loadu_ps (src + x); - __m128i src_int128 = _mm_cvtps_epi32 (src128); - - src128 = _mm_loadu_ps (src + x + 4); - __m128i src1_int128 = _mm_cvtps_epi32 (src128); - - src1_int128 = _mm_packs_epi32(src_int128, src1_int128); - _mm_storeu_si128((__m128i*)(dst + x),src1_int128); - } - } - #elif CV_NEON - for( ; x <= size.width - 8; x += 8 ) - { - float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)), - vqmovn_s32(cv_vrndq_s32_f32(v_src2))); - vst1q_s16(dst + x, v_dst); - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast(src[x]); - } - -} - - template static void cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size ) { diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index ae5e4a7..c5b5ede 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -806,6 +806,11 @@ struct ConvertScaleAbsOp : public BaseElemWiseOp { cvtest::add(src[0], alpha, Mat(), 0, Scalar::all(gamma[0]), dst, CV_8UC(src[0].channels()), true); } + int getRandomType(RNG& rng) + { + return cvtest::randomType(rng, 
_OutputArray::DEPTH_MASK_ALL, 1,
+                                   ninputs > 1 ? ARITHM_MAX_CHANNELS : 4);
+    }
     double getMaxErr(int)
     {
         return 1;
-- 
2.7.4
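
The new Cvt_SIMD specializations above all share one shape: when 128-bit universal intrinsics are available, the loop converts as many whole registers as fit into width and returns x, and the caller's generic scalar loop finishes the tail with saturate_cast. The following sketch restates that pattern for the short -> uchar case as a self-contained function. It is illustrative only: the function name cvt_s16_to_u8_demo and the explicit cv:: qualifications are not part of this patch, and it assumes the universal-intrinsics header opencv2/core/hal/intrin.hpp.

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Demo of the Cvt_SIMD pattern: SIMD body plus scalar tail (saturating s16 -> u8).
static void cvt_s16_to_u8_demo(const short* src, uchar* dst, int width)
{
    int x = 0;
#if CV_SIMD128
    if (cv::hasSIMD128())
    {
        const int cWidth = cv::v_int16x8::nlanes;        // 8 shorts per 128-bit register
        for (; x <= width - cWidth * 2; x += cWidth * 2) // two registers per iteration
        {
            cv::v_int16x8 v_src1 = cv::v_load(src + x);
            cv::v_int16x8 v_src2 = cv::v_load(src + x + cWidth);
            cv::v_store(dst + x, cv::v_pack_u(v_src1, v_src2)); // saturating narrow
        }
    }
#endif
    for (; x < width; x++)                               // scalar tail for the remainder
        dst[x] = cv::saturate_cast<uchar>(src[x]);
}

Because v_pack_u saturates the same way as saturate_cast<uchar>, the SIMD body and the scalar tail produce identical results.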
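
In the CV_SIMD128_64F block, a v_float64x2 holds only two lanes, so the double -> integer specializations above first convert two registers to float32, merge the halves with v_combine_low, and only then round and pack. A minimal sketch of the double -> int case, under the same assumptions as the previous example (demo-only function name, cv:: prefixes added so it compiles on its own):

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Demo of the f64 -> s32 pattern: two v_float64x2 loads become one v_int32x4 store.
static void cvt_f64_to_s32_demo(const double* src, int* dst, int width)
{
    int x = 0;
#if CV_SIMD128 && CV_SIMD128_64F
    if (cv::hasSIMD128())
    {
        const int cWidth = cv::v_float64x2::nlanes;      // 2 doubles per register
        for (; x <= width - cWidth * 2; x += cWidth * 2) // 4 doubles per iteration
        {
            // v_cvt_f32(v_float64x2) fills only the low two float lanes,
            // so the two partial results are glued together with v_combine_low.
            cv::v_float32x4 v_lo = cv::v_cvt_f32(cv::v_load(src + x));
            cv::v_float32x4 v_hi = cv::v_cvt_f32(cv::v_load(src + x + cWidth));
            cv::v_store(dst + x, cv::v_round(cv::v_combine_low(v_lo, v_hi)));
        }
    }
#endif
    for (; x < width; x++)
        dst[x] = cv::saturate_cast<int>(src[x]);
}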
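
Widening to double runs the other way: each v_int32x4 (or expanded smaller type) is split into its low and high pairs with v_cvt_f64 and v_cvt_f64_high and stored as two v_float64x2 results, as in the "to double" specializations above. A sketch of the int -> double case, again illustrative rather than part of the patch:

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Demo of the s32 -> f64 pattern: one v_int32x4 load becomes two v_float64x2 stores.
static void cvt_s32_to_f64_demo(const int* src, double* dst, int width)
{
    int x = 0;
#if CV_SIMD128 && CV_SIMD128_64F
    if (cv::hasSIMD128())
    {
        const int cWidth = cv::v_float64x2::nlanes;      // 2 doubles per register
        for (; x <= width - cWidth * 2; x += cWidth * 2) // one int register per iteration
        {
            cv::v_int32x4 v_src = cv::v_load(src + x);
            cv::v_store(dst + x, cv::v_cvt_f64(v_src));               // lanes 0..1
            cv::v_store(dst + x + cWidth, cv::v_cvt_f64_high(v_src)); // lanes 2..3
        }
    }
#endif
    for (; x < width; x++)
        dst[x] = (double)src[x];
}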