imgproc:simd Enable VSX and wide universal intrinsics for accumulate operations

author Sayed Adel <seiko@imavr.com>

Tue, 2 Oct 2018 18:44:03 +0000 (20:44 +0200)

committer Sayed Adel <seiko@imavr.com>

Thu, 11 Oct 2018 02:37:12 +0000 (04:37 +0200)
author Sayed Adel <seiko@imavr.com>
Tue, 2 Oct 2018 18:44:03 +0000 (20:44 +0200)
committer Sayed Adel <seiko@imavr.com>
Thu, 11 Oct 2018 02:37:12 +0000 (04:37 +0200)
diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt

index 5cfb616..1caadbb 100644 (file)
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -1,3 +1,3 @@
  set(the_description "Image Processing")
-ocv_add_dispatched_file(accum SSE2 AVX NEON)
+ocv_add_dispatched_file(accum SSE4_1 AVX AVX2)
  ocv_define_module(imgproc opencv_core WRAP java python js)
diff --git a/modules/imgproc/perf/perf_accumulate.cpp b/modules/imgproc/perf/perf_accumulate.cpp

index f9cd80a..c52b31e 100644 (file)
--- a/modules/imgproc/perf/perf_accumulate.cpp
+++ b/modules/imgproc/perf/perf_accumulate.cpp
@@ -5,94 +5,102 @@
  
  namespace opencv_test {
  
-#ifdef HAVE_OPENVX
-PERF_TEST_P(Size_MatType, Accumulate,
-    testing::Combine(
-        testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-        testing::Values(CV_16SC1, CV_32FC1)
-    )
-)
-#else
-PERF_TEST_P( Size_MatType, Accumulate,
-             testing::Combine(
-                 testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-                 testing::Values(CV_32FC1)
-             )
-           )
-#endif
-{
-    Size sz = get<0>(GetParam());
-    int dstType = get<1>(GetParam());
-
-    Mat src(sz, CV_8UC1);
-    Mat dst(sz, dstType);
-
-    declare.time(100);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    TEST_CYCLE() accumulate(src, dst);
-
-    SANITY_CHECK_NOTHING();
-}
-
-#ifdef HAVE_OPENVX
-PERF_TEST_P(Size_MatType, AccumulateSquare,
-    testing::Combine(
-        testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-        testing::Values(CV_16SC1, CV_32FC1)
-    )
-)
-#else
-PERF_TEST_P( Size_MatType, AccumulateSquare,
-             testing::Combine(
-                 testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-                 testing::Values(CV_32FC1)
-             )
-           )
-#endif
-{
-    Size sz = get<0>(GetParam());
-    int dstType = get<1>(GetParam());
-
-    Mat src(sz, CV_8UC1);
-    Mat dst(sz, dstType);
-
-    declare.time(100);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    TEST_CYCLE() accumulateSquare(src, dst);
-
-    SANITY_CHECK_NOTHING();
-}
-
-#ifdef HAVE_OPENVX
-PERF_TEST_P(Size_MatType, AccumulateWeighted,
-    testing::Combine(
-        testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-        testing::Values(CV_8UC1, CV_32FC1)
-    )
-)
-#else
-PERF_TEST_P( Size_MatType, AccumulateWeighted,
-             testing::Combine(
-                 testing::Values(::perf::szODD, ::perf::szQVGA, ::perf::szVGA, ::perf::sz1080p),
-                 testing::Values(CV_32FC1)
-             )
-           )
-#endif
-{
-    Size sz = get<0>(GetParam());
-    int dstType = get<1>(GetParam());
-
-    Mat src(sz, CV_8UC1);
-    Mat dst(sz, dstType);
-
-    declare.time(100);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    TEST_CYCLE() accumulateWeighted(src, dst, 0.314);
-
-    SANITY_CHECK_NOTHING();
-}
+typedef Size_MatType Accumulate;
+
+#define MAT_TYPES_ACCUMLATE CV_8UC1, CV_16UC1, CV_32FC1
+#define MAT_TYPES_ACCUMLATE_C MAT_TYPES_ACCUMLATE, CV_8UC3, CV_16UC3, CV_32FC3
+#define MAT_TYPES_ACCUMLATE_D MAT_TYPES_ACCUMLATE, CV_64FC1
+#define MAT_TYPES_ACCUMLATE_D_C MAT_TYPES_ACCUMLATE_C, CV_64FC1, CV_64FC1
+
+#define PERF_ACCUMULATE_INIT(_FLTC)                    \
+    const Size srcSize = get<0>(GetParam());           \
+    const int srcType = get<1>(GetParam());            \
+    const int dstType = _FLTC(CV_MAT_CN(srcType));     \
+    Mat src1(srcSize, srcType), dst(srcSize, dstType); \
+    declare.in(src1, dst, WARMUP_RNG).out(dst);
+
+#define PERF_ACCUMULATE_MASK_INIT(_FLTC) \
+    PERF_ACCUMULATE_INIT(_FLTC)          \
+    Mat mask(srcSize, CV_8UC1);          \
+    declare.in(mask, WARMUP_RNG);
+
+#define PERF_TEST_P_ACCUMULATE(_NAME, _TYPES, _INIT, _FUN)           \
+    PERF_TEST_P(Accumulate, _NAME,                                   \
+        testing::Combine(                                            \
+            testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD),  \
+            testing::Values(_TYPES)                                  \
+        )                                                            \
+    )                                                                \
+    {                                                                \
+        _INIT                                                        \
+        TEST_CYCLE() _FUN;                                           \
+        SANITY_CHECK_NOTHING();                                      \
+    }
+
+/////////////////////////////////// Accumulate ///////////////////////////////////
+
+PERF_TEST_P_ACCUMULATE(Accumulate, MAT_TYPES_ACCUMLATE,
+        PERF_ACCUMULATE_INIT(CV_32FC), accumulate(src1, dst))
+
+PERF_TEST_P_ACCUMULATE(AccumulateMask, MAT_TYPES_ACCUMLATE_C,
+    PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulate(src1, dst, mask))
+
+PERF_TEST_P_ACCUMULATE(AccumulateDouble, MAT_TYPES_ACCUMLATE_D,
+    PERF_ACCUMULATE_INIT(CV_64FC), accumulate(src1, dst))
+
+PERF_TEST_P_ACCUMULATE(AccumulateDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
+    PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulate(src1, dst, mask))
+
+///////////////////////////// AccumulateSquare ///////////////////////////////////
+
+PERF_TEST_P_ACCUMULATE(Square, MAT_TYPES_ACCUMLATE,
+    PERF_ACCUMULATE_INIT(CV_32FC), accumulateSquare(src1, dst))
+
+PERF_TEST_P_ACCUMULATE(SquareMask, MAT_TYPES_ACCUMLATE_C,
+    PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateSquare(src1, dst, mask))
+
+PERF_TEST_P_ACCUMULATE(SquareDouble, MAT_TYPES_ACCUMLATE_D,
+    PERF_ACCUMULATE_INIT(CV_64FC), accumulateSquare(src1, dst))
+
+PERF_TEST_P_ACCUMULATE(SquareDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
+    PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateSquare(src1, dst, mask))
+
+///////////////////////////// AccumulateProduct ///////////////////////////////////
+
+#define PERF_ACCUMULATE_INIT_2(_FLTC) \
+    PERF_ACCUMULATE_INIT(_FLTC)       \
+    Mat src2(srcSize, srcType);       \
+    declare.in(src2);
+
+#define PERF_ACCUMULATE_MASK_INIT_2(_FLTC) \
+    PERF_ACCUMULATE_MASK_INIT(_FLTC)       \
+    Mat src2(srcSize, srcType);            \
+    declare.in(src2);
+
+PERF_TEST_P_ACCUMULATE(Product, MAT_TYPES_ACCUMLATE,
+    PERF_ACCUMULATE_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst))
+
+PERF_TEST_P_ACCUMULATE(ProductMask, MAT_TYPES_ACCUMLATE_C,
+    PERF_ACCUMULATE_MASK_INIT_2(CV_32FC), accumulateProduct(src1, src2, dst, mask))
+
+PERF_TEST_P_ACCUMULATE(ProductDouble, MAT_TYPES_ACCUMLATE_D,
+    PERF_ACCUMULATE_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst))
+
+PERF_TEST_P_ACCUMULATE(ProductDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
+    PERF_ACCUMULATE_MASK_INIT_2(CV_64FC), accumulateProduct(src1, src2, dst, mask))
+
+///////////////////////////// AccumulateWeighted ///////////////////////////////////
+
+PERF_TEST_P_ACCUMULATE(Weighted, MAT_TYPES_ACCUMLATE,
+    PERF_ACCUMULATE_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123))
+
+PERF_TEST_P_ACCUMULATE(WeightedMask, MAT_TYPES_ACCUMLATE_C,
+    PERF_ACCUMULATE_MASK_INIT(CV_32FC), accumulateWeighted(src1, dst, 0.123, mask))
+
+PERF_TEST_P_ACCUMULATE(WeightedDouble, MAT_TYPES_ACCUMLATE_D,
+    PERF_ACCUMULATE_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456))
+
+PERF_TEST_P_ACCUMULATE(WeightedDoubleMask, MAT_TYPES_ACCUMLATE_D_C,
+    PERF_ACCUMULATE_MASK_INIT(CV_64FC), accumulateWeighted(src1, dst, 0.123456, mask))
  
  } // namespace
diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp

index 583f247..7bca93d 100644 (file)
--- a/modules/imgproc/src/accum.simd.hpp
+++ b/modules/imgproc/src/accum.simd.hpp
@@ -8,63 +8,43 @@
  void acc_##suffix(const type* src, acctype* dst, \
                           const uchar* mask, int len, int cn) \
  { \
-    CV_CPU_CALL_NEON(acc_simd_, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_SSE2(acc_simd_, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_BASELINE(acc_general_, (src, dst, mask, len, cn)); \
+    CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn),  CV_CPU_DISPATCH_MODES_ALL); \
  } \
  void accSqr_##suffix(const type* src, acctype* dst, \
                              const uchar* mask, int len, int cn) \
  { \
-    CV_CPU_CALL_NEON(accSqr_simd_, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_SSE2(accSqr_simd_, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_BASELINE(accSqr_general_, (src, dst, mask, len, cn)); \
+    CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn),  CV_CPU_DISPATCH_MODES_ALL); \
  } \
  void accProd_##suffix(const type* src1, const type* src2, \
                               acctype* dst, const uchar* mask, int len, int cn) \
  { \
-    CV_CPU_CALL_NEON(accProd_simd_, (src1, src2, dst, mask, len, cn)); \
-    CV_CPU_CALL_SSE2(accProd_simd_, (src1, src2, dst, mask, len, cn)); \
-    CV_CPU_CALL_BASELINE(accProd_general_, (src1, src2, dst, mask, len, cn)); \
+    CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn),  CV_CPU_DISPATCH_MODES_ALL); \
  } \
  void accW_##suffix(const type* src, acctype* dst, \
                            const uchar* mask, int len, int cn, double alpha) \
  { \
-    CV_CPU_CALL_NEON(accW_simd_, (src, dst, mask, len, cn, alpha)); \
-    CV_CPU_CALL_SSE2(accW_simd_, (src, dst, mask, len, cn, alpha)); \
-    CV_CPU_CALL_BASELINE(accW_general_, (src, dst, mask, len, cn, alpha)); \
+    CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha),  CV_CPU_DISPATCH_MODES_ALL); \
  }
  #define DEF_ACC_FLT_FUNCS(suffix, type, acctype) \
  void acc_##suffix(const type* src, acctype* dst, \
                           const uchar* mask, int len, int cn) \
  { \
-    CV_CPU_CALL_AVX(acc_avx_##suffix, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_NEON(acc_simd_, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_SSE2(acc_simd_, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_BASELINE(acc_general_, (src, dst, mask, len, cn)); \
+    CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn),  CV_CPU_DISPATCH_MODES_ALL); \
  } \
  void accSqr_##suffix(const type* src, acctype* dst, \
                              const uchar* mask, int len, int cn) \
  { \
-    CV_CPU_CALL_AVX(accSqr_avx_##suffix, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_NEON(accSqr_simd_, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_SSE2(accSqr_simd_, (src, dst, mask, len, cn)); \
-    CV_CPU_CALL_BASELINE(accSqr_general_, (src, dst, mask, len, cn)); \
+    CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn),  CV_CPU_DISPATCH_MODES_ALL); \
  } \
  void accProd_##suffix(const type* src1, const type* src2, \
                               acctype* dst, const uchar* mask, int len, int cn) \
  { \
-    CV_CPU_CALL_AVX(accProd_avx_##suffix, (src1, src2, dst, mask, len, cn)); \
-    CV_CPU_CALL_NEON(accProd_simd_, (src1, src2, dst, mask, len, cn)); \
-    CV_CPU_CALL_SSE2(accProd_simd_, (src1, src2, dst, mask, len, cn)); \
-    CV_CPU_CALL_BASELINE(accProd_general_, (src1, src2, dst, mask, len, cn)); \
+    CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn),  CV_CPU_DISPATCH_MODES_ALL); \
  } \
  void accW_##suffix(const type* src, acctype* dst, \
                            const uchar* mask, int len, int cn, double alpha) \
  { \
-    CV_CPU_CALL_AVX(accW_avx_##suffix, (src, dst, mask, len, cn, alpha)); \
-    CV_CPU_CALL_NEON(accW_simd_, (src, dst, mask, len, cn, alpha)); \
-    CV_CPU_CALL_SSE2(accW_simd_, (src, dst, mask, len, cn, alpha)); \
-    CV_CPU_CALL_BASELINE(accW_general_, (src, dst, mask, len, cn, alpha)); \
+    CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha),  CV_CPU_DISPATCH_MODES_ALL); \
  }
  #define DECLARATE_ACC_FUNCS(suffix, type, acctype) \
  void acc_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \
@@ -114,22 +94,8 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn
  void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha);
  void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha);
  
-// accumulate series optimized by AVX
-void acc_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn);
-void acc_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn);
-void acc_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn);
-void accSqr_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn);
-void accSqr_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn);
-void accSqr_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn);
-void accProd_avx_32f(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn);
-void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn);
-void accProd_avx_64f(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn);
-void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha);
-void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha);
-void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha);
-
  #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-
+// todo: remove AVX branch after support it by universal intrinsics
  template <typename T, typename AT>
  void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 )
  {
@@ -171,7 +137,11 @@ void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int
              }
          }
      }
-
+#if CV_AVX && !CV_AVX2
+    _mm256_zeroupper();
+#elif CV_SIMD
+    vx_cleanup();
+#endif
  }
  
  template<typename T, typename AT> void
@@ -215,6 +185,11 @@ accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int
              }
          }
      }
+#if CV_AVX && !CV_AVX2
+    _mm256_zeroupper();
+#elif CV_SIMD
+    vx_cleanup();
+#endif
  }
  
  template<typename T, typename AT> void
@@ -259,6 +234,11 @@ accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int
              }
          }
      }
+#if CV_AVX && !CV_AVX2
+    _mm256_zeroupper();
+#elif CV_SIMD
+    vx_cleanup();
+#endif
  }
  
  template<typename T, typename AT> void
@@ -303,77 +283,81 @@ accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double
              }
          }
      }
+#if CV_AVX && !CV_AVX2
+    _mm256_zeroupper();
+#elif CV_SIMD
+    vx_cleanup();
+#endif
  }
-
-#if CV_SIMD128
-
  void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 16;
+#if CV_SIMD
+    const int cVectorWidth = v_uint8::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint8x16 v_src  = v_load(src + x);
-            v_uint16x8 v_src0, v_src1;
+            v_uint8 v_src  = vx_load(src + x);
+            v_uint16 v_src0, v_src1;
              v_expand(v_src, v_src0, v_src1);
  
-            v_uint32x4 v_src00, v_src01, v_src10, v_src11;
+            v_uint32 v_src00, v_src01, v_src10, v_src11;
              v_expand(v_src0, v_src00, v_src01);
              v_expand(v_src1, v_src10, v_src11);
  
-            v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-            v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-            v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
+            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
+            v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
+            v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
          }
      }
      else
      {
-        v_uint8x16 v_0 = v_setall_u8(0);
+        v_uint8 v_0 = vx_setall_u8(0);
          if (cn == 1)
          {
              for ( ; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint8x16 v_mask = v_load(mask + x);
+                v_uint8 v_mask = vx_load(mask + x);
                  v_mask = ~(v_0 == v_mask);
-                v_uint8x16 v_src = v_load(src + x);
+                v_uint8 v_src = vx_load(src + x);
                  v_src = v_src & v_mask;
-                v_uint16x8 v_src0, v_src1;
+                v_uint16 v_src0, v_src1;
                  v_expand(v_src, v_src0, v_src1);
  
-                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
+                v_uint32 v_src00, v_src01, v_src10, v_src11;
                  v_expand(v_src0, v_src00, v_src01);
                  v_expand(v_src1, v_src10, v_src11);
  
-                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
+                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
+                v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
+                v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
              }
          }
          else if (cn == 3)
          {
              for ( ; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint8x16 v_mask = v_load(mask + x);
+                v_uint8 v_mask = vx_load(mask + x);
                  v_mask = ~(v_0 == v_mask);
-                v_uint8x16 v_src0, v_src1, v_src2;
+                v_uint8 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);
                  v_src0 = v_src0 & v_mask;
                  v_src1 = v_src1 & v_mask;
                  v_src2 = v_src2 & v_mask;
-                v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                  v_expand(v_src0, v_src00, v_src01);
                  v_expand(v_src1, v_src10, v_src11);
                  v_expand(v_src2, v_src20, v_src21);
  
-                v_uint32x4 v_src000, v_src001, v_src010, v_src011;
-                v_uint32x4 v_src100, v_src101, v_src110, v_src111;
-                v_uint32x4 v_src200, v_src201, v_src210, v_src211;
+                v_uint32 v_src000, v_src001, v_src010, v_src011;
+                v_uint32 v_src100, v_src101, v_src110, v_src111;
+                v_uint32 v_src200, v_src201, v_src210, v_src211;
                  v_expand(v_src00, v_src000, v_src001);
                  v_expand(v_src01, v_src010, v_src011);
                  v_expand(v_src10, v_src100, v_src101);
@@ -381,135 +365,169 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
                  v_expand(v_src20, v_src200, v_src201);
                  v_expand(v_src21, v_src210, v_src211);
  
-                v_float32x4 v_dst000, v_dst001, v_dst010, v_dst011;
-                v_float32x4 v_dst100, v_dst101, v_dst110, v_dst111;
-                v_float32x4 v_dst200, v_dst201, v_dst210, v_dst211;
+                v_float32 v_dst000, v_dst001, v_dst010, v_dst011;
+                v_float32 v_dst100, v_dst101, v_dst110, v_dst111;
+                v_float32 v_dst200, v_dst201, v_dst210, v_dst211;
                  v_load_deinterleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);
-                v_load_deinterleave(dst + ((x + 4) * cn), v_dst001, v_dst101, v_dst201);
-                v_load_deinterleave(dst + ((x + 8) * cn), v_dst010, v_dst110, v_dst210);
-                v_load_deinterleave(dst + ((x + 12) * cn), v_dst011, v_dst111, v_dst211);
-
-                v_store_interleave(dst + (x * cn), v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200)));
-                v_store_interleave(dst + ((x + 4) * cn), v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201)));
-                v_store_interleave(dst + ((x + 8) * cn), v_dst010 + v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_dst110 + v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_dst210 + v_cvt_f32(v_reinterpret_as_s32(v_src210)));
-                v_store_interleave(dst + ((x + 12) * cn), v_dst011 + v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_dst111 + v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_dst211 + v_cvt_f32(v_reinterpret_as_s32(v_src211)));
+                v_load_deinterleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);
+                v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210);
+                v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);
+
+                v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));
+                v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));
+                v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));
+                v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));
+                v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));
+                v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));
+                v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));
+                v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));
+                v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));
+                v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));
+                v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));
+                v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));
+
+                v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);
+                v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);
+                v_store_interleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210);
+                v_store_interleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);
              }
          }
      }
-
+#endif // CV_SIMD
      acc_general_(src, dst, mask, len, cn, x);
  }
  
  void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_src = v_load(src + x);
-            v_uint32x4 v_src0, v_src1;
+            v_uint16 v_src = vx_load(src + x);
+            v_uint32 v_src0, v_src1;
              v_expand(v_src, v_src0, v_src1);
  
-            v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
+            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
+            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
          }
      }
      else
      {
          if (cn == 1)
          {
-            v_uint16x8 v_0 = v_setall_u16(0);
+            v_uint16 v_0 = vx_setall_u16(0);
              for ( ; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_src = v_load(src + x);
+                v_uint16 v_src = vx_load(src + x);
                  v_src = v_src & v_mask;
-                v_uint32x4 v_src0, v_src1;
+                v_uint32 v_src0, v_src1;
                  v_expand(v_src, v_src0, v_src1);
  
-                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
-                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
+                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
+                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
              }
          }
          else if (cn == 3)
          {
-            v_uint16x8 v_0 = v_setall_u16(0);
+            v_uint16 v_0 = vx_setall_u16(0);
              for ( ; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_src0, v_src1, v_src2;
+                v_uint16 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
                  v_src0 = v_src0 & v_mask;
                  v_src1 = v_src1 & v_mask;
                  v_src2 = v_src2 & v_mask;
-                v_uint32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                  v_expand(v_src0, v_src00, v_src01);
                  v_expand(v_src1, v_src10, v_src11);
                  v_expand(v_src2, v_src20, v_src21);
  
-                v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
-                v_store_interleave(dst + x * cn, v_dst00 + v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_dst10 + v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_dst20 + v_cvt_f32(v_reinterpret_as_s32(v_src20)));
-                v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_dst11 + v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_dst21 + v_cvt_f32(v_reinterpret_as_s32(v_src21)));
+                v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00));
+                v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01));
+                v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10));
+                v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11));
+                v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20));
+                v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21));
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
              }
          }
      }
-
+#endif // CV_SIMD
      acc_general_(src, dst, mask, len, cn, x);
  }
-
+// todo: remove AVX branch after support it by universal intrinsics
  void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for (; x <= size - 8 ; x += 8)
+        {
+            __m256 v_src = _mm256_loadu_ps(src + x);
+            __m256 v_dst = _mm256_loadu_ps(dst + x);
+            v_dst = _mm256_add_ps(v_src, v_dst);
+            _mm256_storeu_ps(dst + x, v_dst);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_store(dst + x, v_load(dst + x) + v_load(src + x));
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src + x + 4));
+            v_store(dst + x, vx_load(dst + x) + vx_load(src + x));
+            v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step));
          }
+        #endif // CV_AVX && !CV_AVX2
      }
      else
      {
-        v_float32x4 v_0 = v_setzero_f32();
+        v_float32 v_0 = vx_setzero_f32();
          if (cn == 1)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint16x8 v_masku16 = v_load_expand(mask + x);
-                v_uint32x4 v_masku320, v_masku321;
+                v_uint16 v_masku16 = vx_load_expand(mask + x);
+                v_uint32 v_masku320, v_masku321;
                  v_expand(v_masku16, v_masku320, v_masku321);
-                v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
-                v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
+                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
  
-                v_store(dst + x, v_load(dst + x) + (v_load(src + x) & v_mask0));
-                v_store(dst + x + 4, v_load(dst + x + 4) + (v_load(src + x + 4) & v_mask1));
+                v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0));
+                v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1));
              }
          }
          else if (cn == 3)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint16x8 v_masku16 = v_load_expand(mask + x);
-                v_uint32x4 v_masku320, v_masku321;
+                v_uint16 v_masku16 = vx_load_expand(mask + x);
+                v_uint32 v_masku320, v_masku321;
                  v_expand(v_masku16, v_masku320, v_masku321);
-                v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
-                v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
+                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
  
-                v_float32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                  v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
-                v_load_deinterleave(src + (x + 4) * cn, v_src01, v_src11, v_src21);
+                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
                  v_src00 = v_src00 & v_mask0;
                  v_src01 = v_src01 & v_mask1;
                  v_src10 = v_src10 & v_mask0;
@@ -517,55 +535,56 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
                  v_src20 = v_src20 & v_mask0;
                  v_src21 = v_src21 & v_mask1;
  
-                v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
                  v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
              }
          }
      }
-
+#endif // CV_SIMD
      acc_general_(src, dst, mask, len, cn, x);
  }
  
-#if CV_SIMD128_64F
  void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 16;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_uint8::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint8x16 v_src  = v_load(src + x);
-            v_uint16x8 v_int0, v_int1;
+            v_uint8 v_src  = vx_load(src + x);
+            v_uint16 v_int0, v_int1;
              v_expand(v_src, v_int0, v_int1);
  
-            v_uint32x4 v_int00, v_int01, v_int10, v_int11;
+            v_uint32 v_int00, v_int01, v_int10, v_int11;
              v_expand(v_int0, v_int00, v_int01);
              v_expand(v_int1, v_int10, v_int11);
  
-            v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
-            v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
-            v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
-            v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
-            v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
-            v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
-            v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
-            v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
-
-            v_float64x2 v_dst0 = v_load(dst + x);
-            v_float64x2 v_dst1 = v_load(dst + x + 2);
-            v_float64x2 v_dst2 = v_load(dst + x + 4);
-            v_float64x2 v_dst3 = v_load(dst + x + 6);
-            v_float64x2 v_dst4 = v_load(dst + x + 8);
-            v_float64x2 v_dst5 = v_load(dst + x + 10);
-            v_float64x2 v_dst6 = v_load(dst + x + 12);
-            v_float64x2 v_dst7 = v_load(dst + x + 14);
+            v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
+            v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
+            v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
+            v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
+            v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
+            v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
+            v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
+            v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
+
+            v_float64 v_dst0 = vx_load(dst + x);
+            v_float64 v_dst1 = vx_load(dst + x + step);
+            v_float64 v_dst2 = vx_load(dst + x + step * 2);
+            v_float64 v_dst3 = vx_load(dst + x + step * 3);
+            v_float64 v_dst4 = vx_load(dst + x + step * 4);
+            v_float64 v_dst5 = vx_load(dst + x + step * 5);
+            v_float64 v_dst6 = vx_load(dst + x + step * 6);
+            v_float64 v_dst7 = vx_load(dst + x + step * 7);
  
              v_dst0 = v_dst0 + v_src0;
              v_dst1 = v_dst1 + v_src1;
@@ -577,50 +596,50 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
              v_dst7 = v_dst7 + v_src7;
  
              v_store(dst + x, v_dst0);
-            v_store(dst + x + 2, v_dst1);
-            v_store(dst + x + 4, v_dst2);
-            v_store(dst + x + 6, v_dst3);
-            v_store(dst + x + 8, v_dst4);
-            v_store(dst + x + 10, v_dst5);
-            v_store(dst + x + 12, v_dst6);
-            v_store(dst + x + 14, v_dst7);
+            v_store(dst + x + step, v_dst1);
+            v_store(dst + x + step * 2, v_dst2);
+            v_store(dst + x + step * 3, v_dst3);
+            v_store(dst + x + step * 4, v_dst4);
+            v_store(dst + x + step * 5, v_dst5);
+            v_store(dst + x + step * 6, v_dst6);
+            v_store(dst + x + step * 7, v_dst7);
          }
      }
      else
      {
-        v_uint8x16 v_0 = v_setall_u8(0);
+        v_uint8 v_0 = vx_setall_u8(0);
          if (cn == 1)
          {
              for ( ; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint8x16 v_mask = v_load(mask + x);
+                v_uint8 v_mask = vx_load(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint8x16 v_src  = v_load(src + x);
+                v_uint8 v_src  = vx_load(src + x);
                  v_src = v_src & v_mask;
-                v_uint16x8 v_int0, v_int1;
+                v_uint16 v_int0, v_int1;
                  v_expand(v_src, v_int0, v_int1);
  
-                v_uint32x4 v_int00, v_int01, v_int10, v_int11;
+                v_uint32 v_int00, v_int01, v_int10, v_int11;
                  v_expand(v_int0, v_int00, v_int01);
                  v_expand(v_int1, v_int10, v_int11);
  
-                v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
-                v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
-                v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
-                v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
-                v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
-                v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
-                v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
-                v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
-
-                v_float64x2 v_dst0 = v_load(dst + x);
-                v_float64x2 v_dst1 = v_load(dst + x + 2);
-                v_float64x2 v_dst2 = v_load(dst + x + 4);
-                v_float64x2 v_dst3 = v_load(dst + x + 6);
-                v_float64x2 v_dst4 = v_load(dst + x + 8);
-                v_float64x2 v_dst5 = v_load(dst + x + 10);
-                v_float64x2 v_dst6 = v_load(dst + x + 12);
-                v_float64x2 v_dst7 = v_load(dst + x + 14);
+                v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
+                v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
+                v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
+                v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
+                v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
+                v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
+                v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
+                v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
+
+                v_float64 v_dst0 = vx_load(dst + x);
+                v_float64 v_dst1 = vx_load(dst + x + step);
+                v_float64 v_dst2 = vx_load(dst + x + step * 2);
+                v_float64 v_dst3 = vx_load(dst + x + step * 3);
+                v_float64 v_dst4 = vx_load(dst + x + step * 4);
+                v_float64 v_dst5 = vx_load(dst + x + step * 5);
+                v_float64 v_dst6 = vx_load(dst + x + step * 6);
+                v_float64 v_dst7 = vx_load(dst + x + step * 7);
  
                  v_dst0 = v_dst0 + v_src0;
                  v_dst1 = v_dst1 + v_src1;
@@ -632,34 +651,34 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
                  v_dst7 = v_dst7 + v_src7;
  
                  v_store(dst + x, v_dst0);
-                v_store(dst + x + 2, v_dst1);
-                v_store(dst + x + 4, v_dst2);
-                v_store(dst + x + 6, v_dst3);
-                v_store(dst + x + 8, v_dst4);
-                v_store(dst + x + 10, v_dst5);
-                v_store(dst + x + 12, v_dst6);
-                v_store(dst + x + 14, v_dst7);
+                v_store(dst + x + step, v_dst1);
+                v_store(dst + x + step * 2, v_dst2);
+                v_store(dst + x + step * 3, v_dst3);
+                v_store(dst + x + step * 4, v_dst4);
+                v_store(dst + x + step * 5, v_dst5);
+                v_store(dst + x + step * 6, v_dst6);
+                v_store(dst + x + step * 7, v_dst7);
              }
          }
          else if (cn == 3)
          {
              for ( ; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint8x16 v_mask = v_load(mask + x);
+                v_uint8 v_mask = vx_load(mask + x);
                  v_mask = ~(v_0 == v_mask);
-                v_uint8x16 v_src0, v_src1, v_src2;
+                v_uint8 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);
                  v_src0 = v_src0 & v_mask;
                  v_src1 = v_src1 & v_mask;
                  v_src2 = v_src2 & v_mask;
-                v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                  v_expand(v_src0, v_src00, v_src01);
                  v_expand(v_src1, v_src10, v_src11);
                  v_expand(v_src2, v_src20, v_src21);
  
-                v_uint32x4 v_src000, v_src001, v_src010, v_src011;
-                v_uint32x4 v_src100, v_src101, v_src110, v_src111;
-                v_uint32x4 v_src200, v_src201, v_src210, v_src211;
+                v_uint32 v_src000, v_src001, v_src010, v_src011;
+                v_uint32 v_src100, v_src101, v_src110, v_src111;
+                v_uint32 v_src200, v_src201, v_src210, v_src211;
                  v_expand(v_src00, v_src000, v_src001);
                  v_expand(v_src01, v_src010, v_src011);
                  v_expand(v_src10, v_src100, v_src101);
@@ -667,9 +686,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
                  v_expand(v_src20, v_src200, v_src201);
                  v_expand(v_src21, v_src210, v_src211);
  
-                v_float64x2 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111;
-                v_float64x2 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111;
-                v_float64x2 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111;
+                v_float64 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111;
+                v_float64 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111;
+                v_float64 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111;
                  v_src0000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src000)));
                  v_src0001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src000)));
                  v_src0010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src001)));
@@ -695,56 +714,58 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
                  v_src2110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src211)));
                  v_src2111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src211)));
  
-                v_float64x2 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111;
-                v_float64x2 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111;
-                v_float64x2 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111;
+                v_float64 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111;
+                v_float64 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111;
+                v_float64 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111;
                  v_load_deinterleave(dst + (x * cn), v_dst0000, v_dst1000, v_dst2000);
-                v_load_deinterleave(dst + ((x + 2) * cn), v_dst0001, v_dst1001, v_dst2001);
-                v_load_deinterleave(dst + ((x + 4) * cn), v_dst0010, v_dst1010, v_dst2010);
-                v_load_deinterleave(dst + ((x + 6) * cn), v_dst0011, v_dst1011, v_dst2011);
-                v_load_deinterleave(dst + ((x + 8) * cn), v_dst0100, v_dst1100, v_dst2100);
-                v_load_deinterleave(dst + ((x + 10) * cn), v_dst0101, v_dst1101, v_dst2101);
-                v_load_deinterleave(dst + ((x + 12) * cn), v_dst0110, v_dst1110, v_dst2110);
-                v_load_deinterleave(dst + ((x + 14) * cn), v_dst0111, v_dst1111, v_dst2111);
+                v_load_deinterleave(dst + ((x + step) * cn), v_dst0001, v_dst1001, v_dst2001);
+                v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst0010, v_dst1010, v_dst2010);
+                v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst0011, v_dst1011, v_dst2011);
+                v_load_deinterleave(dst + ((x + step * 4) * cn), v_dst0100, v_dst1100, v_dst2100);
+                v_load_deinterleave(dst + ((x + step * 5) * cn), v_dst0101, v_dst1101, v_dst2101);
+                v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, v_dst2110);
+                v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111);
  
                  v_store_interleave(dst + (x * cn), v_dst0000 + v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000);
-                v_store_interleave(dst + ((x + 2) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001);
-                v_store_interleave(dst + ((x + 4) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010);
-                v_store_interleave(dst + ((x + 6) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011);
-                v_store_interleave(dst + ((x + 8) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100);
-                v_store_interleave(dst + ((x + 10) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101);
-                v_store_interleave(dst + ((x + 12) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110);
-                v_store_interleave(dst + ((x + 14) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111);
+                v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001);
+                v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010);
+                v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011);
+                v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100);
+                v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101);
+                v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110);
+                v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      acc_general_(src, dst, mask, len, cn, x);
  }
  
  void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_src  = v_load(src + x);
-            v_uint32x4 v_int0, v_int1;
+            v_uint16 v_src  = vx_load(src + x);
+            v_uint32 v_int0, v_int1;
              v_expand(v_src, v_int0, v_int1);
  
-            v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
-            v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
-            v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
-            v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
+            v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
+            v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
+            v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
+            v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
  
-            v_float64x2 v_dst0 = v_load(dst + x);
-            v_float64x2 v_dst1 = v_load(dst + x + 2);
-            v_float64x2 v_dst2 = v_load(dst + x + 4);
-            v_float64x2 v_dst3 = v_load(dst + x + 6);
+            v_float64 v_dst0 = vx_load(dst + x);
+            v_float64 v_dst1 = vx_load(dst + x + step);
+            v_float64 v_dst2 = vx_load(dst + x + step * 2);
+            v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
              v_dst0 = v_dst0 + v_src0;
              v_dst1 = v_dst1 + v_src1;
@@ -752,34 +773,34 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
              v_dst3 = v_dst3 + v_src3;
  
              v_store(dst + x, v_dst0);
-            v_store(dst + x + 2, v_dst1);
-            v_store(dst + x + 4, v_dst2);
-            v_store(dst + x + 6, v_dst3);
+            v_store(dst + x + step, v_dst1);
+            v_store(dst + x + step * 2, v_dst2);
+            v_store(dst + x + step * 3, v_dst3);
          }
      }
      else
      {
-        v_uint16x8 v_0 = v_setzero_u16();
+        v_uint16 v_0 = vx_setzero_u16();
          if (cn == 1)
          {
              for ( ; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_src  = v_load(src + x);
+                v_uint16 v_src  = vx_load(src + x);
                  v_src = v_src & v_mask;
-                v_uint32x4 v_int0, v_int1;
+                v_uint32 v_int0, v_int1;
                  v_expand(v_src, v_int0, v_int1);
  
-                v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
-                v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
-                v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
-                v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
+                v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
+                v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
+                v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
+                v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
  
-                v_float64x2 v_dst0 = v_load(dst + x);
-                v_float64x2 v_dst1 = v_load(dst + x + 2);
-                v_float64x2 v_dst2 = v_load(dst + x + 4);
-                v_float64x2 v_dst3 = v_load(dst + x + 6);
+                v_float64 v_dst0 = vx_load(dst + x);
+                v_float64 v_dst1 = vx_load(dst + x + step);
+                v_float64 v_dst2 = vx_load(dst + x + step * 2);
+                v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
                  v_dst0 = v_dst0 + v_src0;
                  v_dst1 = v_dst1 + v_src1;
@@ -787,178 +808,207 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
                  v_dst3 = v_dst3 + v_src3;
  
                  v_store(dst + x, v_dst0);
-                v_store(dst + x + 2, v_dst1);
-                v_store(dst + x + 4, v_dst2);
-                v_store(dst + x + 6, v_dst3);
+                v_store(dst + x + step, v_dst1);
+                v_store(dst + x + step * 2, v_dst2);
+                v_store(dst + x + step * 3, v_dst3);
              }
          }
          if (cn == 3)
          {
              for ( ; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_src0, v_src1, v_src2;
+                v_uint16 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
                  v_src0 = v_src0 & v_mask;
                  v_src1 = v_src1 & v_mask;
                  v_src2 = v_src2 & v_mask;
-                v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
+                v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                  v_expand(v_src0, v_int00, v_int01);
                  v_expand(v_src1, v_int10, v_int11);
                  v_expand(v_src2, v_int20, v_int21);
  
-                v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
-                v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
-                v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
-                v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
-                v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
-                v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
-                v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
-                v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
-                v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
-                v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
-                v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
-                v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
-
-                v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
+                v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
+                v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
+                v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
+                v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
+                v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
+                v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
+                v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
+                v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
+                v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
+                v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
+                v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
+                v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
+
+                v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22);
-                v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
+                v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
  
                  v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
-                v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
-                v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
+                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
+                v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      acc_general_(src, dst, mask, len, cn, x);
  }
  
  void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 4;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_float32::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for (; x <= size - 8 ; x += 8)
+        {
+            __m256 v_src = _mm256_loadu_ps(src + x);
+            __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));
+            __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));
+            __m256d v_dst0 = _mm256_loadu_pd(dst + x);
+            __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
+            v_dst0 = _mm256_add_pd(v_src0, v_dst0);
+            v_dst1 = _mm256_add_pd(v_src1, v_dst1);
+            _mm256_storeu_pd(dst + x, v_dst0);
+            _mm256_storeu_pd(dst + x + 4, v_dst1);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_float32x4 v_src = v_load(src + x);
-            v_float64x2 v_src0 = v_cvt_f64(v_src);
-            v_float64x2 v_src1 = v_cvt_f64_high(v_src);
+            v_float32 v_src = vx_load(src + x);
+            v_float64 v_src0 = v_cvt_f64(v_src);
+            v_float64 v_src1 = v_cvt_f64_high(v_src);
  
-            v_store(dst + x, v_load(dst + x) + v_src0);
-            v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
+            v_store(dst + x, vx_load(dst + x) + v_src0);
+            v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
          }
+        #endif // CV_AVX && !CV_AVX2
      }
      else
      {
-        v_uint64x2 v_0 = v_setzero_u64();
+        v_uint64 v_0 = vx_setzero_u64();
          if (cn == 1)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
-                v_uint64x2 v_masku640, v_masku641;
+                v_uint32 v_masku32 = vx_load_expand_q(mask + x);
+                v_uint64 v_masku640, v_masku641;
                  v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
+                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
  
-                v_float32x4 v_src = v_load(src + x);
-                v_float64x2 v_src0 = v_cvt_f64(v_src) & v_mask0;
-                v_float64x2 v_src1 = v_cvt_f64_high(v_src) & v_mask1;
+                v_float32 v_src = vx_load(src + x);
+                v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0;
+                v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1;
  
-                v_store(dst + x, v_load(dst + x) + v_src0);
-                v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
+                v_store(dst + x, vx_load(dst + x) + v_src0);
+                v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
              }
          }
          else if (cn == 3)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
-                v_uint64x2 v_masku640, v_masku641;
+                v_uint32 v_masku32 = vx_load_expand_q(mask + x);
+                v_uint64 v_masku640, v_masku641;
                  v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
+                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
  
-                v_float32x4 v_src0, v_src1, v_src2;
+                v_float32 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_float64x2 v_src00 = v_cvt_f64(v_src0) & v_mask0;
-                v_float64x2 v_src01 = v_cvt_f64_high(v_src0) & v_mask1;
-                v_float64x2 v_src10 = v_cvt_f64(v_src1) & v_mask0;
-                v_float64x2 v_src11 = v_cvt_f64_high(v_src1) & v_mask1;
-                v_float64x2 v_src20 = v_cvt_f64(v_src2) & v_mask0;
-                v_float64x2 v_src21 = v_cvt_f64_high(v_src2) & v_mask1;
-
-                v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0;
+                v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1;
+                v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0;
+                v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1;
+                v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0;
+                v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1;
+
+                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
                  v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      acc_general_(src, dst, mask, len, cn, x);
  }
  
  void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 4;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_float64::nlanes * 2;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for ( ; x <= size - 4 ; x += 4)
+        {
+            __m256d v_src = _mm256_loadu_pd(src + x);
+            __m256d v_dst = _mm256_loadu_pd(dst + x);
+            v_dst = _mm256_add_pd(v_dst, v_src);
+            _mm256_storeu_pd(dst + x, v_dst);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_float64x2 v_src0 = v_load(src + x);
-            v_float64x2 v_src1 = v_load(src + x + 2);
+            v_float64 v_src0 = vx_load(src + x);
+            v_float64 v_src1 = vx_load(src + x + step);
  
-            v_store(dst + x, v_load(dst + x) + v_src0);
-            v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
+            v_store(dst + x, vx_load(dst + x) + v_src0);
+            v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
          }
+        #endif // CV_AVX && !CV_AVX2
      }
      else
      {
-        v_uint64x2 v_0 = v_setzero_u64();
+        v_uint64 v_0 = vx_setzero_u64();
          if (cn == 1)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
-                v_uint64x2 v_masku640, v_masku641;
+                v_uint32 v_masku32 = vx_load_expand_q(mask + x);
+                v_uint64 v_masku640, v_masku641;
                  v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
+                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
  
-                v_float64x2 v_src0 = v_load(src + x);
-                v_float64x2 v_src1 = v_load(src + x + 2);
+                v_float64 v_src0 = vx_load(src + x);
+                v_float64 v_src1 = vx_load(src + x + step);
  
-                v_store(dst + x, v_load(dst + x) + (v_src0 & v_mask0));
-                v_store(dst + x + 2, v_load(dst + x + 2) + (v_src1 & v_mask1));
+                v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0));
+                v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1));
              }
          }
          else if (cn == 3)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint32x4 v_masku32 = v_load_expand_q(mask + x);
-                v_uint64x2 v_masku640, v_masku641;
+                v_uint32 v_masku32 = vx_load_expand_q(mask + x);
+                v_uint64 v_masku640, v_masku641;
                  v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
+                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
  
-                v_float64x2 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
+                v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
                  v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
-                v_load_deinterleave(src + (x + 2) * cn, v_src01, v_src11, v_src21);
+                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
                  v_src00 = v_src00 & v_mask0;
                  v_src01 = v_src01 & v_mask1;
                  v_src10 = v_src10 & v_mask0;
@@ -966,107 +1016,88 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
                  v_src20 = v_src20 & v_mask0;
                  v_src21 = v_src21 & v_mask1;
  
-                v_float64x2 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
+                v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
                  v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      acc_general_(src, dst, mask, len, cn, x);
  }
-#else
-void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
-{
-    acc_general_(src, dst, mask, len, cn, 0);
-}
-
-void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
-{
-    acc_general_(src, dst, mask, len, cn, 0);
-}
-
-void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
-{
-    acc_general_(src, dst, mask, len, cn, 0);
-}
-
-void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
-{
-    acc_general_(src, dst, mask, len, cn, 0);
-}
-#endif
  
  // square accumulate optimized by universal intrinsic
  void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 16;
+#if CV_SIMD
+    const int cVectorWidth = v_uint8::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint8x16 v_src  = v_load(src + x);
-            v_uint16x8 v_src0, v_src1;
+            v_uint8 v_src  = vx_load(src + x);
+            v_uint16 v_src0, v_src1;
              v_expand(v_src, v_src0, v_src1);
              v_src0 = v_mul_wrap(v_src0, v_src0);
              v_src1 = v_mul_wrap(v_src1, v_src1);
  
-            v_uint32x4 v_src00, v_src01, v_src10, v_src11;
+            v_uint32 v_src00, v_src01, v_src10, v_src11;
              v_expand(v_src0, v_src00, v_src01);
              v_expand(v_src1, v_src10, v_src11);
  
-            v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-            v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-            v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
+            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
+            v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
+            v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
          }
      }
      else
      {
-        v_uint8x16 v_0 = v_setall_u8(0);
+        v_uint8 v_0 = vx_setall_u8(0);
          if (cn == 1)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint8x16 v_mask = v_load(mask + x);
+                v_uint8 v_mask = vx_load(mask + x);
                  v_mask = ~(v_0 == v_mask);
-                v_uint8x16 v_src = v_load(src + x);
+                v_uint8 v_src = vx_load(src + x);
                  v_src = v_src & v_mask;
-                v_uint16x8 v_src0, v_src1;
+                v_uint16 v_src0, v_src1;
                  v_expand(v_src, v_src0, v_src1);
                  v_src0 = v_mul_wrap(v_src0, v_src0);
                  v_src1 = v_mul_wrap(v_src1, v_src1);
  
-                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
+                v_uint32 v_src00, v_src01, v_src10, v_src11;
                  v_expand(v_src0, v_src00, v_src01);
                  v_expand(v_src1, v_src10, v_src11);
  
-                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
+                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
+                v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
+                v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
              }
          }
          else if (cn == 3)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint8x16 v_mask = v_load(mask + x);
+                v_uint8 v_mask = vx_load(mask + x);
                  v_mask = ~(v_0 == v_mask);
  
-                v_uint8x16 v_src0, v_src1, v_src2;
+                v_uint8 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
                  v_src0 = v_src0 & v_mask;
                  v_src1 = v_src1 & v_mask;
                  v_src2 = v_src2 & v_mask;
  
-                v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                  v_expand(v_src0, v_src00, v_src01);
                  v_expand(v_src1, v_src10, v_src11);
                  v_expand(v_src2, v_src20, v_src21);
@@ -1077,9 +1108,9 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int
                  v_src20 = v_mul_wrap(v_src20, v_src20);
                  v_src21 = v_mul_wrap(v_src21, v_src21);
  
-                v_uint32x4 v_src000, v_src001, v_src010, v_src011;
-                v_uint32x4 v_src100, v_src101, v_src110, v_src111;
-                v_uint32x4 v_src200, v_src201, v_src210, v_src211;
+                v_uint32 v_src000, v_src001, v_src010, v_src011;
+                v_uint32 v_src100, v_src101, v_src110, v_src111;
+                v_uint32 v_src200, v_src201, v_src210, v_src211;
                  v_expand(v_src00, v_src000, v_src001);
                  v_expand(v_src01, v_src010, v_src011);
                  v_expand(v_src10, v_src100, v_src101);
@@ -1087,90 +1118,103 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int
                  v_expand(v_src20, v_src200, v_src201);
                  v_expand(v_src21, v_src210, v_src211);
  
-                v_float32x4 v_dst000, v_dst001, v_dst010, v_dst011;
-                v_float32x4 v_dst100, v_dst101, v_dst110, v_dst111;
-                v_float32x4 v_dst200, v_dst201, v_dst210, v_dst211;
+                v_float32 v_dst000, v_dst001, v_dst010, v_dst011;
+                v_float32 v_dst100, v_dst101, v_dst110, v_dst111;
+                v_float32 v_dst200, v_dst201, v_dst210, v_dst211;
                  v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201);
-                v_load_deinterleave(dst + (x + 8) * cn, v_dst010, v_dst110, v_dst210);
-                v_load_deinterleave(dst + (x + 12) * cn, v_dst011, v_dst111, v_dst211);
-
-                v_store_interleave(dst + x * cn, v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200)));
-                v_store_interleave(dst + (x + 4) * cn, v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201)));
-                v_store_interleave(dst + (x + 8) * cn, v_dst010 + v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_dst110 + v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_dst210 + v_cvt_f32(v_reinterpret_as_s32(v_src210)));
-                v_store_interleave(dst + (x + 12) * cn, v_dst011 + v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_dst111 + v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_dst211 + v_cvt_f32(v_reinterpret_as_s32(v_src211)));
+                v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
+                v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);
+                v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211);
+
+                v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));
+                v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));
+                v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));
+                v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));
+
+                v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));
+                v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));
+                v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));
+                v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));
+
+                v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));
+                v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));
+                v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));
+                v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));
+
+                v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
+                v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
+                v_store_interleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);
+                v_store_interleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211);
              }
          }
      }
-
+#endif // CV_SIMD
      accSqr_general_(src, dst, mask, len, cn, x);
  }
  
  void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_src = v_load(src + x);
-            v_uint32x4 v_src0, v_src1;
+            v_uint16 v_src = vx_load(src + x);
+            v_uint32 v_src0, v_src1;
              v_expand(v_src, v_src0, v_src1);
  
-            v_float32x4 v_float0, v_float1;
+            v_float32 v_float0, v_float1;
              v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
              v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
-            v_float0 = v_float0 * v_float0;
-            v_float1 = v_float1 * v_float1;
  
-            v_store(dst + x, v_load(dst + x) + v_float0);
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_float1);
+            v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x)));
+            v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step)));
          }
      }
      else
      {
-        v_uint32x4 v_0 = v_setzero_u32();
+        v_uint32 v_0 = vx_setzero_u32();
          if (cn == 1)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint16x8 v_mask16 = v_load_expand(mask + x);
-                v_uint32x4 v_mask0, v_mask1;
+                v_uint16 v_mask16 = vx_load_expand(mask + x);
+                v_uint32 v_mask0, v_mask1;
                  v_expand(v_mask16, v_mask0, v_mask1);
                  v_mask0 = ~(v_mask0 == v_0);
                  v_mask1 = ~(v_mask1 == v_0);
-                v_uint16x8 v_src = v_load(src + x);
-                v_uint32x4 v_src0, v_src1;
+                v_uint16 v_src = vx_load(src + x);
+                v_uint32 v_src0, v_src1;
                  v_expand(v_src, v_src0, v_src1);
                  v_src0 = v_src0 & v_mask0;
                  v_src1 = v_src1 & v_mask1;
  
-                v_float32x4 v_float0, v_float1;
+                v_float32 v_float0, v_float1;
                  v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
                  v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
-                v_float0 = v_float0 * v_float0;
-                v_float1 = v_float1 * v_float1;
  
-                v_store(dst + x, v_load(dst + x) + v_float0);
-                v_store(dst + x + 4, v_load(dst + x + 4) + v_float1);
+                v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x)));
+                v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step)));
              }
          }
          else if (cn == 3)
          {
              for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
              {
-                v_uint16x8 v_mask16 = v_load_expand(mask + x);
-                v_uint32x4 v_mask0, v_mask1;
+                v_uint16 v_mask16 = vx_load_expand(mask + x);
+                v_uint32 v_mask0, v_mask1;
                  v_expand(v_mask16, v_mask0, v_mask1);
                  v_mask0 = ~(v_mask0 == v_0);
                  v_mask1 = ~(v_mask1 == v_0);
  
-                v_uint16x8 v_src0, v_src1, v_src2;
+                v_uint16 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
+                v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                  v_expand(v_src0, v_int00, v_int01);
                  v_expand(v_src1, v_int10, v_int11);
                  v_expand(v_src2, v_int20, v_int21);
@@ -1181,653 +1225,650 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
                  v_int20 = v_int20 & v_mask0;
                  v_int21 = v_int21 & v_mask1;
  
-                v_float32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                  v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00));
                  v_src01 = v_cvt_f32(v_reinterpret_as_s32(v_int01));
                  v_src10 = v_cvt_f32(v_reinterpret_as_s32(v_int10));
                  v_src11 = v_cvt_f32(v_reinterpret_as_s32(v_int11));
                  v_src20 = v_cvt_f32(v_reinterpret_as_s32(v_int20));
                  v_src21 = v_cvt_f32(v_reinterpret_as_s32(v_int21));
-                v_src00 = v_src00 * v_src00;
-                v_src01 = v_src01 * v_src01;
-                v_src10 = v_src10 * v_src10;
-                v_src11 = v_src11 * v_src11;
-                v_src20 = v_src20 * v_src20;
-                v_src21 = v_src21 * v_src21;
-
-                v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+
+                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_dst00 = v_fma(v_src00, v_src00, v_dst00);
+                v_dst01 = v_fma(v_src01, v_src01, v_dst01);
+                v_dst10 = v_fma(v_src10, v_src10, v_dst10);
+                v_dst11 = v_fma(v_src11, v_src11, v_dst11);
+                v_dst20 = v_fma(v_src20, v_src20, v_dst20);
+                v_dst21 = v_fma(v_src21, v_src21, v_dst21);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
              }
          }
      }
-
+#endif // CV_SIMD
      accSqr_general_(src, dst, mask, len, cn, x);
  }
  
  void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for ( ; x <= size - 8 ; x += 8)
+        {
+            __m256 v_src = _mm256_loadu_ps(src + x);
+            __m256 v_dst = _mm256_loadu_ps(dst + x);
+            v_src = _mm256_mul_ps(v_src, v_src);
+            v_dst = _mm256_add_ps(v_src, v_dst);
+            _mm256_storeu_ps(dst + x, v_dst);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_float32x4 v_src0 = v_load(src + x);
-            v_float32x4 v_src1 = v_load(src + x + 4);
-            v_src0 = v_src0 * v_src0;
-            v_src1 = v_src1 * v_src1;
+            v_float32 v_src0 = vx_load(src + x);
+            v_float32 v_src1 = vx_load(src + x + step);
  
-            v_store(dst + x, v_load(dst + x) + v_src0);
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
+            v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
+            v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
          }
+        #endif // CV_AVX && !CV_AVX2
      }
      else
      {
-        v_uint32x4 v_0 = v_setzero_u32();
+        v_uint32 v_0 = vx_setzero_u32();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask16 = v_load_expand(mask + x);
-                v_uint32x4 v_mask_0, v_mask_1;
+                v_uint16 v_mask16 = vx_load_expand(mask + x);
+                v_uint32 v_mask_0, v_mask_1;
                  v_expand(v_mask16, v_mask_0, v_mask_1);
-                v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
-                v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
-                v_float32x4 v_src0 = v_load(src + x);
-                v_float32x4 v_src1 = v_load(src + x + 4);
+                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
+                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
+                v_float32 v_src0 = vx_load(src + x);
+                v_float32 v_src1 = vx_load(src + x + step);
                  v_src0 = v_src0 & v_mask0;
                  v_src1 = v_src1 & v_mask1;
-                v_src0 = v_src0 * v_src0;
-                v_src1 = v_src1 * v_src1;
  
-                v_store(dst + x, v_load(dst + x) + v_src0);
-                v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
+                v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
+                v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask16 = v_load_expand(mask + x);
-                v_uint32x4 v_mask_0, v_mask_1;
+                v_uint16 v_mask16 = vx_load_expand(mask + x);
+                v_uint32 v_mask_0, v_mask_1;
                  v_expand(v_mask16, v_mask_0, v_mask_1);
-                v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
-                v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
+                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
  
-                v_float32x4 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
+                v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
                  v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
-                v_load_deinterleave(src + (x + 4) * cn, v_src01, v_src11, v_src21);
+                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
                  v_src00 = v_src00 & v_mask0;
                  v_src01 = v_src01 & v_mask1;
                  v_src10 = v_src10 & v_mask0;
                  v_src11 = v_src11 & v_mask1;
                  v_src20 = v_src20 & v_mask0;
                  v_src21 = v_src21 & v_mask1;
-                v_src00 = v_src00 * v_src00;
-                v_src01 = v_src01 * v_src01;
-                v_src10 = v_src10 * v_src10;
-                v_src11 = v_src11 * v_src11;
-                v_src20 = v_src20 * v_src20;
-                v_src21 = v_src21 * v_src21;
-
-                v_float32x4 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
+
+                v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_dst00 = v_fma(v_src00, v_src00, v_dst00);
+                v_dst01 = v_fma(v_src01, v_src01, v_dst01);
+                v_dst10 = v_fma(v_src10, v_src10, v_dst10);
+                v_dst11 = v_fma(v_src11, v_src11, v_dst11);
+                v_dst20 = v_fma(v_src20, v_src20, v_dst20);
+                v_dst21 = v_fma(v_src21, v_src21, v_dst21);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
              }
          }
      }
-
+#endif // CV_SIMD
      accSqr_general_(src, dst, mask, len, cn, x);
  }
-#if CV_SIMD128_64F
+
  void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_int = v_load_expand(src + x);
+            v_uint16 v_int = vx_load_expand(src + x);
  
-            v_uint32x4 v_int0, v_int1;
+            v_uint32 v_int0, v_int1;
              v_expand(v_int, v_int0, v_int1);
  
-            v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
-            v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
-            v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
-            v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
-            v_src0 = v_src0 * v_src0;
-            v_src1 = v_src1 * v_src1;
-            v_src2 = v_src2 * v_src2;
-            v_src3 = v_src3 * v_src3;
-
-            v_float64x2 v_dst0 = v_load(dst + x);
-            v_float64x2 v_dst1 = v_load(dst + x + 2);
-            v_float64x2 v_dst2 = v_load(dst + x + 4);
-            v_float64x2 v_dst3 = v_load(dst + x + 6);
-
-            v_dst0 += v_src0;
-            v_dst1 += v_src1;
-            v_dst2 += v_src2;
-            v_dst3 += v_src3;
+            v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
+            v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
+            v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
+            v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
+
+            v_float64 v_dst0 = vx_load(dst + x);
+            v_float64 v_dst1 = vx_load(dst + x + step);
+            v_float64 v_dst2 = vx_load(dst + x + step * 2);
+            v_float64 v_dst3 = vx_load(dst + x + step * 3);
+
+            v_dst0 = v_fma(v_src0, v_src0, v_dst0);
+            v_dst1 = v_fma(v_src1, v_src1, v_dst1);
+            v_dst2 = v_fma(v_src2, v_src2, v_dst2);
+            v_dst3 = v_fma(v_src3, v_src3, v_dst3);
  
              v_store(dst + x, v_dst0);
-            v_store(dst + x + 2, v_dst1);
-            v_store(dst + x + 4, v_dst2);
-            v_store(dst + x + 6, v_dst3);
+            v_store(dst + x + step, v_dst1);
+            v_store(dst + x + step * 2, v_dst2);
+            v_store(dst + x + step * 3, v_dst3);
          }
      }
      else
      {
-        v_uint16x8 v_0 = v_setzero_u16();
+        v_uint16 v_0 = vx_setzero_u16();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_src = v_load_expand(src + x);
-                v_uint16x8 v_int = v_src & v_mask;
+                v_uint16 v_src = vx_load_expand(src + x);
+                v_uint16 v_int = v_src & v_mask;
  
-                v_uint32x4 v_int0, v_int1;
+                v_uint32 v_int0, v_int1;
                  v_expand(v_int, v_int0, v_int1);
  
-                v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
-                v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
-                v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
-                v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
-                v_src0 = v_src0 * v_src0;
-                v_src1 = v_src1 * v_src1;
-                v_src2 = v_src2 * v_src2;
-                v_src3 = v_src3 * v_src3;
-
-                v_float64x2 v_dst0 = v_load(dst + x);
-                v_float64x2 v_dst1 = v_load(dst + x + 2);
-                v_float64x2 v_dst2 = v_load(dst + x + 4);
-                v_float64x2 v_dst3 = v_load(dst + x + 6);
-
-                v_dst0 += v_src0;
-                v_dst1 += v_src1;
-                v_dst2 += v_src2;
-                v_dst3 += v_src3;
+                v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
+                v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
+                v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
+                v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
+
+                v_float64 v_dst0 = vx_load(dst + x);
+                v_float64 v_dst1 = vx_load(dst + x + step);
+                v_float64 v_dst2 = vx_load(dst + x + step * 2);
+                v_float64 v_dst3 = vx_load(dst + x + step * 3);
+
+                v_dst0 = v_fma(v_src0, v_src0, v_dst0);
+                v_dst1 = v_fma(v_src1, v_src1, v_dst1);
+                v_dst2 = v_fma(v_src2, v_src2, v_dst2);
+                v_dst3 = v_fma(v_src3, v_src3, v_dst3);
  
                  v_store(dst + x, v_dst0);
-                v_store(dst + x + 2, v_dst1);
-                v_store(dst + x + 4, v_dst2);
-                v_store(dst + x + 6, v_dst3);
+                v_store(dst + x + step, v_dst1);
+                v_store(dst + x + step * 2, v_dst2);
+                v_store(dst + x + step * 3, v_dst3);
              }
          }
          else if (cn == 3)
          {
-            for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth)
+            for (; x <= len - cVectorWidth * 2; x += cVectorWidth)
              {
-                v_uint8x16 v_src0, v_src1, v_src2;
+                v_uint8 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_uint16x8 v_int0, v_int1, v_int2, dummy;
-                v_expand(v_src0, v_int0, dummy);
-                v_expand(v_src1, v_int1, dummy);
-                v_expand(v_src2, v_int2, dummy);
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+
+                v_uint16 v_int0 = v_expand_low(v_src0);
+                v_uint16 v_int1 = v_expand_low(v_src1);
+                v_uint16 v_int2 = v_expand_low(v_src2);
+
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
                  v_int0 = v_int0 & v_mask;
                  v_int1 = v_int1 & v_mask;
                  v_int2 = v_int2 & v_mask;
  
-                v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
+                v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                  v_expand(v_int0, v_int00, v_int01);
                  v_expand(v_int1, v_int10, v_int11);
                  v_expand(v_int2, v_int20, v_int21);
  
-                v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
-                v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
-                v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
-                v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
-                v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
-                v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
-                v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
-                v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
-                v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
-                v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
-                v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
-                v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
-                v_src00 = v_src00 * v_src00;
-                v_src01 = v_src01 * v_src01;
-                v_src02 = v_src02 * v_src02;
-                v_src03 = v_src03 * v_src03;
-                v_src10 = v_src10 * v_src10;
-                v_src11 = v_src11 * v_src11;
-                v_src12 = v_src12 * v_src12;
-                v_src13 = v_src13 * v_src13;
-                v_src20 = v_src20 * v_src20;
-                v_src21 = v_src21 * v_src21;
-                v_src22 = v_src22 * v_src22;
-                v_src23 = v_src23 * v_src23;
-
-                v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
+                v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
+                v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
+                v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
+                v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
+                v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
+                v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
+                v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
+                v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
+                v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
+                v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
+                v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
+                v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
+
+                v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22);
-                v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23);
-
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
-                v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
-                v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
+                v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
+
+                v_dst00 = v_fma(v_src00, v_src00, v_dst00);
+                v_dst01 = v_fma(v_src01, v_src01, v_dst01);
+                v_dst02 = v_fma(v_src02, v_src02, v_dst02);
+                v_dst03 = v_fma(v_src03, v_src03, v_dst03);
+                v_dst10 = v_fma(v_src10, v_src10, v_dst10);
+                v_dst11 = v_fma(v_src11, v_src11, v_dst11);
+                v_dst12 = v_fma(v_src12, v_src12, v_dst12);
+                v_dst13 = v_fma(v_src13, v_src13, v_dst13);
+                v_dst20 = v_fma(v_src20, v_src20, v_dst20);
+                v_dst21 = v_fma(v_src21, v_src21, v_dst21);
+                v_dst22 = v_fma(v_src22, v_src22, v_dst22);
+                v_dst23 = v_fma(v_src23, v_src23, v_dst23);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
+                v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      accSqr_general_(src, dst, mask, len, cn, x);
  }
  
  void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_src  = v_load(src + x);
-            v_uint32x4 v_int_0, v_int_1;
+            v_uint16 v_src  = vx_load(src + x);
+            v_uint32 v_int_0, v_int_1;
              v_expand(v_src, v_int_0, v_int_1);
  
-            v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
-            v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
+            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
+            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);
  
-            v_float64x2 v_src0 = v_cvt_f64(v_int0);
-            v_float64x2 v_src1 = v_cvt_f64_high(v_int0);
-            v_float64x2 v_src2 = v_cvt_f64(v_int1);
-            v_float64x2 v_src3 = v_cvt_f64_high(v_int1);
-            v_src0 = v_src0 * v_src0;
-            v_src1 = v_src1 * v_src1;
-            v_src2 = v_src2 * v_src2;
-            v_src3 = v_src3 * v_src3;
+            v_float64 v_src0 = v_cvt_f64(v_int0);
+            v_float64 v_src1 = v_cvt_f64_high(v_int0);
+            v_float64 v_src2 = v_cvt_f64(v_int1);
+            v_float64 v_src3 = v_cvt_f64_high(v_int1);
  
-            v_float64x2 v_dst0 = v_load(dst + x);
-            v_float64x2 v_dst1 = v_load(dst + x + 2);
-            v_float64x2 v_dst2 = v_load(dst + x + 4);
-            v_float64x2 v_dst3 = v_load(dst + x + 6);
+            v_float64 v_dst0 = vx_load(dst + x);
+            v_float64 v_dst1 = vx_load(dst + x + step);
+            v_float64 v_dst2 = vx_load(dst + x + step * 2);
+            v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
-            v_dst0 += v_src0;
-            v_dst1 += v_src1;
-            v_dst2 += v_src2;
-            v_dst3 += v_src3;
+            v_dst0 = v_fma(v_src0, v_src0, v_dst0);
+            v_dst1 = v_fma(v_src1, v_src1, v_dst1);
+            v_dst2 = v_fma(v_src2, v_src2, v_dst2);
+            v_dst3 = v_fma(v_src3, v_src3, v_dst3);
  
              v_store(dst + x, v_dst0);
-            v_store(dst + x + 2, v_dst1);
-            v_store(dst + x + 4, v_dst2);
-            v_store(dst + x + 6, v_dst3);
+            v_store(dst + x + step, v_dst1);
+            v_store(dst + x + step * 2, v_dst2);
+            v_store(dst + x + step * 3, v_dst3);
          }
      }
      else
      {
-        v_uint16x8 v_0 = v_setzero_u16();
+        v_uint16 v_0 = vx_setzero_u16();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_src = v_load(src + x);
+                v_uint16 v_src = vx_load(src + x);
                  v_src = v_src & v_mask;
-                v_uint32x4 v_int_0, v_int_1;
+                v_uint32 v_int_0, v_int_1;
                  v_expand(v_src, v_int_0, v_int_1);
  
-                v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
-                v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
+                v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
+                v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);
  
-                v_float64x2 v_src0 = v_cvt_f64(v_int0);
-                v_float64x2 v_src1 = v_cvt_f64_high(v_int0);
-                v_float64x2 v_src2 = v_cvt_f64(v_int1);
-                v_float64x2 v_src3 = v_cvt_f64_high(v_int1);
-                v_src0 = v_src0 * v_src0;
-                v_src1 = v_src1 * v_src1;
-                v_src2 = v_src2 * v_src2;
-                v_src3 = v_src3 * v_src3;
+                v_float64 v_src0 = v_cvt_f64(v_int0);
+                v_float64 v_src1 = v_cvt_f64_high(v_int0);
+                v_float64 v_src2 = v_cvt_f64(v_int1);
+                v_float64 v_src3 = v_cvt_f64_high(v_int1);
  
-                v_float64x2 v_dst0 = v_load(dst + x);
-                v_float64x2 v_dst1 = v_load(dst + x + 2);
-                v_float64x2 v_dst2 = v_load(dst + x + 4);
-                v_float64x2 v_dst3 = v_load(dst + x + 6);
+                v_float64 v_dst0 = vx_load(dst + x);
+                v_float64 v_dst1 = vx_load(dst + x + step);
+                v_float64 v_dst2 = vx_load(dst + x + step * 2);
+                v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
-                v_dst0 += v_src0;
-                v_dst1 += v_src1;
-                v_dst2 += v_src2;
-                v_dst3 += v_src3;
+                v_dst0 = v_fma(v_src0, v_src0, v_dst0);
+                v_dst1 = v_fma(v_src1, v_src1, v_dst1);
+                v_dst2 = v_fma(v_src2, v_src2, v_dst2);
+                v_dst3 = v_fma(v_src3, v_src3, v_dst3);
  
                  v_store(dst + x, v_dst0);
-                v_store(dst + x + 2, v_dst1);
-                v_store(dst + x + 4, v_dst2);
-                v_store(dst + x + 6, v_dst3);
+                v_store(dst + x + step, v_dst1);
+                v_store(dst + x + step * 2, v_dst2);
+                v_store(dst + x + step * 3, v_dst3);
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_src0, v_src1, v_src2;
+                v_uint16 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
                  v_src0 = v_src0 & v_mask;
                  v_src1 = v_src1 & v_mask;
                  v_src2 = v_src2 & v_mask;
-                v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
+                v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                  v_expand(v_src0, v_int00, v_int01);
                  v_expand(v_src1, v_int10, v_int11);
                  v_expand(v_src2, v_int20, v_int21);
  
-                v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
-                v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
-                v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
-                v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
-                v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
-                v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
-                v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
-                v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
-                v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
-                v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
-                v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
-                v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
-                v_src00 = v_src00 * v_src00;
-                v_src01 = v_src01 * v_src01;
-                v_src02 = v_src02 * v_src02;
-                v_src03 = v_src03 * v_src03;
-                v_src10 = v_src10 * v_src10;
-                v_src11 = v_src11 * v_src11;
-                v_src12 = v_src12 * v_src12;
-                v_src13 = v_src13 * v_src13;
-                v_src20 = v_src20 * v_src20;
-                v_src21 = v_src21 * v_src21;
-                v_src22 = v_src22 * v_src22;
-                v_src23 = v_src23 * v_src23;
-
-                v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03;
-                v_float64x2 v_dst10, v_dst11, v_dst12, v_dst13;
-                v_float64x2 v_dst20, v_dst21, v_dst22, v_dst23;
+                v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
+                v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
+                v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
+                v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
+                v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
+                v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
+                v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
+                v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
+                v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
+                v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
+                v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
+                v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
+
+                v_float64 v_dst00, v_dst01, v_dst02, v_dst03;
+                v_float64 v_dst10, v_dst11, v_dst12, v_dst13;
+                v_float64 v_dst20, v_dst21, v_dst22, v_dst23;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2)* cn, v_dst01, v_dst11, v_dst21);
-                v_load_deinterleave(dst + (x + 4)* cn, v_dst02, v_dst12, v_dst22);
-                v_load_deinterleave(dst + (x + 6)* cn, v_dst03, v_dst13, v_dst23);
-
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
-                v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
-                v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
+                v_load_deinterleave(dst + (x + step)* cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step * 2)* cn, v_dst02, v_dst12, v_dst22);
+                v_load_deinterleave(dst + (x + step * 3)* cn, v_dst03, v_dst13, v_dst23);
+
+                v_dst00 = v_fma(v_src00, v_src00, v_dst00);
+                v_dst01 = v_fma(v_src01, v_src01, v_dst01);
+                v_dst02 = v_fma(v_src02, v_src02, v_dst02);
+                v_dst03 = v_fma(v_src03, v_src03, v_dst03);
+                v_dst10 = v_fma(v_src10, v_src10, v_dst10);
+                v_dst11 = v_fma(v_src11, v_src11, v_dst11);
+                v_dst12 = v_fma(v_src12, v_src12, v_dst12);
+                v_dst13 = v_fma(v_src13, v_src13, v_dst13);
+                v_dst20 = v_fma(v_src20, v_src20, v_dst20);
+                v_dst21 = v_fma(v_src21, v_src21, v_dst21);
+                v_dst22 = v_fma(v_src22, v_src22, v_dst22);
+                v_dst23 = v_fma(v_src23, v_src23, v_dst23);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
+                v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      accSqr_general_(src, dst, mask, len, cn, x);
  }
  
  void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 4;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_float32::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for (; x <= size - 8 ; x += 8)
+        {
+            __m256 v_src = _mm256_loadu_ps(src + x);
+            __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
+            __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
+            __m256d v_dst0 = _mm256_loadu_pd(dst + x);
+            __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
+            v_src0 = _mm256_mul_pd(v_src0, v_src0);
+            v_src1 = _mm256_mul_pd(v_src1, v_src1);
+            v_dst0 = _mm256_add_pd(v_src0, v_dst0);
+            v_dst1 = _mm256_add_pd(v_src1, v_dst1);
+            _mm256_storeu_pd(dst + x, v_dst0);
+            _mm256_storeu_pd(dst + x + 4, v_dst1);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_float32x4 v_src = v_load(src + x);
-            v_float64x2 v_src0 = v_cvt_f64(v_src);
-            v_float64x2 v_src1 = v_cvt_f64_high(v_src);
-            v_src0 = v_src0 * v_src0;
-            v_src1 = v_src1 * v_src1;
+            v_float32 v_src = vx_load(src + x);
+            v_float64 v_src0 = v_cvt_f64(v_src);
+            v_float64 v_src1 = v_cvt_f64_high(v_src);
  
-            v_store(dst + x, v_load(dst + x) + v_src0);
-            v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
+            v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
+            v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
          }
+        #endif // CV_AVX && !CV_AVX2
      }
      else
      {
-        v_uint32x4 v_0 = v_setzero_u32();
+        v_uint32 v_0 = vx_setzero_u32();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask = v_load_expand_q(mask + x);;
+                v_uint32 v_mask = vx_load_expand_q(mask + x);;
                  v_mask = ~(v_mask == v_0);
-                v_float32x4 v_src = v_load(src + x);
+                v_float32 v_src = vx_load(src + x);
                  v_src = v_src & v_reinterpret_as_f32(v_mask);
-                v_float64x2 v_src0 = v_cvt_f64(v_src);
-                v_float64x2 v_src1 = v_cvt_f64_high(v_src);
-                v_src0 = v_src0 * v_src0;
-                v_src1 = v_src1 * v_src1;
+                v_float64 v_src0 = v_cvt_f64(v_src);
+                v_float64 v_src1 = v_cvt_f64_high(v_src);
  
-                v_store(dst + x, v_load(dst + x) + v_src0);
-                v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
+                v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
+                v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask = v_load_expand_q(mask + x);
+                v_uint32 v_mask = vx_load_expand_q(mask + x);
                  v_mask = ~(v_mask == v_0);
  
-                v_float32x4 v_src0, v_src1, v_src2;
+                v_float32 v_src0, v_src1, v_src2;
                  v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
                  v_src0 = v_src0 & v_reinterpret_as_f32(v_mask);
                  v_src1 = v_src1 & v_reinterpret_as_f32(v_mask);
                  v_src2 = v_src2 & v_reinterpret_as_f32(v_mask);
  
-                v_float64x2 v_src00 = v_cvt_f64(v_src0);
-                v_float64x2 v_src01 = v_cvt_f64_high(v_src0);
-                v_float64x2 v_src10 = v_cvt_f64(v_src1);
-                v_float64x2 v_src11 = v_cvt_f64_high(v_src1);
-                v_float64x2 v_src20 = v_cvt_f64(v_src2);
-                v_float64x2 v_src21 = v_cvt_f64_high(v_src2);
-                v_src00 = v_src00 * v_src00;
-                v_src01 = v_src01 * v_src01;
-                v_src10 = v_src10 * v_src10;
-                v_src11 = v_src11 * v_src11;
-                v_src20 = v_src20 * v_src20;
-                v_src21 = v_src21 * v_src21;
-
-                v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_float64 v_src00 = v_cvt_f64(v_src0);
+                v_float64 v_src01 = v_cvt_f64_high(v_src0);
+                v_float64 v_src10 = v_cvt_f64(v_src1);
+                v_float64 v_src11 = v_cvt_f64_high(v_src1);
+                v_float64 v_src20 = v_cvt_f64(v_src2);
+                v_float64 v_src21 = v_cvt_f64_high(v_src2);
+
+                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_dst00 = v_fma(v_src00, v_src00, v_dst00);
+                v_dst01 = v_fma(v_src01, v_src01, v_dst01);
+                v_dst10 = v_fma(v_src10, v_src10, v_dst10);
+                v_dst11 = v_fma(v_src11, v_src11, v_dst11);
+                v_dst20 = v_fma(v_src20, v_src20, v_dst20);
+                v_dst21 = v_fma(v_src21, v_src21, v_dst21);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      accSqr_general_(src, dst, mask, len, cn, x);
  }
  
  void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 4;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_float64::nlanes * 2;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for (; x <= size - 4 ; x += 4)
+        {
+            __m256d v_src = _mm256_loadu_pd(src + x);
+            __m256d v_dst = _mm256_loadu_pd(dst + x);
+            v_src = _mm256_mul_pd(v_src, v_src);
+            v_dst = _mm256_add_pd(v_dst, v_src);
+            _mm256_storeu_pd(dst + x, v_dst);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_float64x2 v_src0 = v_load(src + x);
-            v_float64x2 v_src1 = v_load(src + x + 2);
-            v_src0 = v_src0 * v_src0;
-            v_src1 = v_src1 * v_src1;
-
-            v_store(dst + x, v_load(dst + x) + v_src0);
-            v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
+            v_float64 v_src0 = vx_load(src + x);
+            v_float64 v_src1 = vx_load(src + x + step);
+            v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
+            v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
          }
+        #endif // CV_AVX && !CV_AVX2
      }
      else
      {
-        v_uint64x2 v_0 = v_setzero_u64();
+        v_uint64 v_0 = vx_setzero_u64();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
-                v_uint64x2 v_masku640, v_masku641;
+                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
+                v_uint64 v_masku640, v_masku641;
                  v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
-                v_float64x2 v_src0 = v_load(src + x);
-                v_float64x2 v_src1 = v_load(src + x + 2);
+                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
+                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_src0 = vx_load(src + x);
+                v_float64 v_src1 = vx_load(src + x + step);
                  v_src0 = v_src0 & v_mask0;
                  v_src1 = v_src1 & v_mask1;
-                v_src0 = v_src0 * v_src0;
-                v_src1 = v_src1 * v_src1;
-
-                v_store(dst + x, v_load(dst + x) + v_src0);
-                v_store(dst + x + 2, v_load(dst + x + 2) + v_src1);
+                v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
+                v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
-                v_uint64x2 v_masku640, v_masku641;
+                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
+                v_uint64 v_masku640, v_masku641;
                  v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
+                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
  
-                v_float64x2 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                  v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
-                v_load_deinterleave(src + (x + 2) * cn, v_src01, v_src11, v_src21);
+                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
                  v_src00 = v_src00 & v_mask0;
                  v_src01 = v_src01 & v_mask1;
                  v_src10 = v_src10 & v_mask0;
                  v_src11 = v_src11 & v_mask1;
                  v_src20 = v_src20 & v_mask0;
                  v_src21 = v_src21 & v_mask1;
-                v_src00 = v_src00 * v_src00;
-                v_src01 = v_src01 * v_src01;
-                v_src10 = v_src10 * v_src10;
-                v_src11 = v_src11 * v_src11;
-                v_src20 = v_src20 * v_src20;
-                v_src21 = v_src21 * v_src21;
-
-                v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+
+                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_dst00 = v_fma(v_src00, v_src00, v_dst00);
+                v_dst01 = v_fma(v_src01, v_src01, v_dst01);
+                v_dst10 = v_fma(v_src10, v_src10, v_dst10);
+                v_dst11 = v_fma(v_src11, v_src11, v_dst11);
+                v_dst20 = v_fma(v_src20, v_src20, v_dst20);
+                v_dst21 = v_fma(v_src21, v_src21, v_dst21);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      accSqr_general_(src, dst, mask, len, cn, x);
  }
-#else
-void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
-{
-    accSqr_general_(src, dst, mask, len, cn, 0);
-}
-
-void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
-{
-    accSqr_general_(src, dst, mask, len, cn, 0);
-}
-
-void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
-{
-    accSqr_general_(src, dst, mask, len, cn, 0);
-}
-
-void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
-{
-    accSqr_general_(src, dst, mask, len, cn, 0);
-}
-#endif
  
  // product accumulate optimized by universal intrinsic
  void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 16;
+#if CV_SIMD
+    const int cVectorWidth = v_uint8::nlanes;
+    const int step = v_uint32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint8x16 v_1src = v_load(src1 + x);
-            v_uint8x16 v_2src = v_load(src2 + x);
+            v_uint8 v_1src = vx_load(src1 + x);
+            v_uint8 v_2src = vx_load(src2 + x);
  
-            v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1;
-            v_expand(v_1src, v_1src0, v_1src1);
-            v_expand(v_2src, v_2src0, v_2src1);
+            v_uint16 v_src0, v_src1;
+            v_mul_expand(v_1src, v_2src, v_src0, v_src1);
  
-            v_uint16x8 v_src0, v_src1;
-            v_src0 = v_mul_wrap(v_1src0, v_2src0);
-            v_src1 = v_mul_wrap(v_1src1, v_2src1);
-
-            v_uint32x4 v_src00, v_src01, v_src10, v_src11;
+            v_uint32 v_src00, v_src01, v_src10, v_src11;
              v_expand(v_src0, v_src00, v_src01);
              v_expand(v_src1, v_src10, v_src11);
  
-            v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-            v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-            v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
+            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
+            v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
+            v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
          }
      }
      else
      {
-        v_uint8x16 v_0 = v_setzero_u8();
+        v_uint8 v_0 = vx_setzero_u8();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint8x16 v_mask = v_load(mask + x);
+                v_uint8 v_mask = vx_load(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint8x16 v_1src = v_load(src1 + x);
-                v_uint8x16 v_2src = v_load(src2 + x);
+                v_uint8 v_1src = vx_load(src1 + x);
+                v_uint8 v_2src = vx_load(src2 + x);
                  v_1src = v_1src & v_mask;
                  v_2src = v_2src & v_mask;
  
-                v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1;
-                v_expand(v_1src, v_1src0, v_1src1);
-                v_expand(v_2src, v_2src0, v_2src1);
-
-                v_uint16x8 v_src0, v_src1;
-                v_src0 = v_mul_wrap(v_1src0, v_2src0);
-                v_src1 = v_mul_wrap(v_1src1, v_2src1);
+                v_uint16 v_src0, v_src1;
+                v_mul_expand(v_1src, v_2src, v_src0, v_src1);
  
-                v_uint32x4 v_src00, v_src01, v_src10, v_src11;
+                v_uint32 v_src00, v_src01, v_src10, v_src11;
                  v_expand(v_src0, v_src00, v_src01);
                  v_expand(v_src1, v_src10, v_src11);
  
-                v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-                v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-                v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-                v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
+                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
+                v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
+                v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint8x16 v_mask = v_load(mask + x);
+                v_uint8 v_mask = vx_load(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
+                v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                  v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                  v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
                  v_1src0 = v_1src0 & v_mask;
@@ -1837,23 +1878,12 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
                  v_2src1 = v_2src1 & v_mask;
                  v_2src2 = v_2src2 & v_mask;
  
-                v_uint16x8 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
-                v_expand(v_1src0, v_1src00, v_1src01);
-                v_expand(v_1src1, v_1src10, v_1src11);
-                v_expand(v_1src2, v_1src20, v_1src21);
-                v_expand(v_2src0, v_2src00, v_2src01);
-                v_expand(v_2src1, v_2src10, v_2src11);
-                v_expand(v_2src2, v_2src20, v_2src21);
-
-                v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
-                v_src00 = v_mul_wrap(v_1src00, v_2src00);
-                v_src01 = v_mul_wrap(v_1src01, v_2src01);
-                v_src10 = v_mul_wrap(v_1src10, v_2src10);
-                v_src11 = v_mul_wrap(v_1src11, v_2src11);
-                v_src20 = v_mul_wrap(v_1src20, v_2src20);
-                v_src21 = v_mul_wrap(v_1src21, v_2src21);
+                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_mul_expand(v_1src0, v_2src0, v_src00, v_src01);
+                v_mul_expand(v_1src1, v_2src1, v_src10, v_src11);
+                v_mul_expand(v_1src2, v_2src2, v_src20, v_src21);
  
-                v_uint32x4 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203;
+                v_uint32 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203;
                  v_expand(v_src00, v_src000, v_src001);
                  v_expand(v_src01, v_src002, v_src003);
                  v_expand(v_src10, v_src100, v_src101);
@@ -1861,11 +1891,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
                  v_expand(v_src20, v_src200, v_src201);
                  v_expand(v_src21, v_src202, v_src203);
  
-                v_float32x4 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, v_dst200, v_dst201, v_dst202, v_dst203;
+                v_float32 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, v_dst200, v_dst201, v_dst202, v_dst203;
                  v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201);
-                v_load_deinterleave(dst + (x + 8) * cn, v_dst002, v_dst102, v_dst202);
-                v_load_deinterleave(dst + (x + 12) * cn, v_dst003, v_dst103, v_dst203);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
+                v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);
+                v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203);
                  v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000));
                  v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001));
                  v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002));
@@ -1880,82 +1910,78 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
                  v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203));
  
                  v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
-                v_store_interleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201);
-                v_store_interleave(dst + (x + 8) * cn, v_dst002, v_dst102, v_dst202);
-                v_store_interleave(dst + (x + 12) * cn, v_dst003, v_dst103, v_dst203);
+                v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
+                v_store_interleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);
+                v_store_interleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203);
              }
          }
      }
-
+#endif // CV_SIMD
      accProd_general_(src1, src2, dst, mask, len, cn, x);
  }
  
  void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_1src = v_load(src1 + x);
-            v_uint16x8 v_2src = v_load(src2 + x);
+            v_uint16 v_1src = vx_load(src1 + x);
+            v_uint16 v_2src = vx_load(src2 + x);
  
-            v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1;
+            v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
              v_expand(v_1src, v_1src0, v_1src1);
              v_expand(v_2src, v_2src0, v_2src1);
  
-            v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
-            v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
-            v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
-            v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
-
-            v_float32x4 v_src0 = v_1float0 * v_2float0;
-            v_float32x4 v_src1 = v_1float1 * v_2float1;
+            v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
+            v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
+            v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
+            v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
  
-            v_store(dst + x, v_load(dst + x) + v_src0);
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
+            v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x)));
+            v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step)));
          }
      }
      else
      {
-        v_uint16x8 v_0 = v_setzero_u16();
+        v_uint16 v_0 = vx_setzero_u16();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_0 == v_mask);
  
-                v_uint16x8 v_1src = v_load(src1 + x) & v_mask;
-                v_uint16x8 v_2src = v_load(src2 + x) & v_mask;
+                v_uint16 v_1src = vx_load(src1 + x) & v_mask;
+                v_uint16 v_2src = vx_load(src2 + x) & v_mask;
  
-                v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1;
+                v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
                  v_expand(v_1src, v_1src0, v_1src1);
                  v_expand(v_2src, v_2src0, v_2src1);
  
-                v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
-                v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
-                v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
-                v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
+                v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
+                v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
+                v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
+                v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
  
-                v_float32x4 v_src0 = v_1float0 * v_2float0;
-                v_float32x4 v_src1 = v_1float1 * v_2float1;
-
-                v_store(dst + x, v_load(dst + x) + v_src0);
-                v_store(dst + x + 4, v_load(dst + x + 4) + v_src1);
+                v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x)));
+                v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step)));
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_0 == v_mask);
  
-                v_uint16x8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
+                v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                  v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                  v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
                  v_1src0 = v_1src0 & v_mask;
@@ -1965,7 +1991,7 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch
                  v_2src1 = v_2src1 & v_mask;
                  v_2src2 = v_2src2 & v_mask;
  
-                v_uint32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
+                v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                  v_expand(v_1src0, v_1src00, v_1src01);
                  v_expand(v_1src1, v_1src10, v_1src11);
                  v_expand(v_1src2, v_1src20, v_1src21);
@@ -1973,200 +1999,205 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch
                  v_expand(v_2src1, v_2src10, v_2src11);
                  v_expand(v_2src2, v_2src20, v_2src21);
  
-                v_float32x4 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00));
-                v_float32x4 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01));
-                v_float32x4 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10));
-                v_float32x4 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11));
-                v_float32x4 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20));
-                v_float32x4 v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21));
-                v_float32x4 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00));
-                v_float32x4 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01));
-                v_float32x4 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10));
-                v_float32x4 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11));
-                v_float32x4 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20));
-                v_float32x4 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21));
-
-                v_float32x4 v_src00 = v_1float00 * v_2float00;
-                v_float32x4 v_src01 = v_1float01 * v_2float01;
-                v_float32x4 v_src10 = v_1float10 * v_2float10;
-                v_float32x4 v_src11 = v_1float11 * v_2float11;
-                v_float32x4 v_src20 = v_1float20 * v_2float20;
-                v_float32x4 v_src21 = v_1float21 * v_2float21;
-
-                v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_float32 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00));
+                v_float32 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01));
+                v_float32 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10));
+                v_float32 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11));
+                v_float32 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20));
+                v_float32 v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21));
+                v_float32 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00));
+                v_float32 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01));
+                v_float32 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10));
+                v_float32 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11));
+                v_float32 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20));
+                v_float32 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21));
+
+                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_dst00 = v_fma(v_1float00, v_2float00, v_dst00);
+                v_dst01 = v_fma(v_1float01, v_2float01, v_dst01);
+                v_dst10 = v_fma(v_1float10, v_2float10, v_dst10);
+                v_dst11 = v_fma(v_1float11, v_2float11, v_dst11);
+                v_dst20 = v_fma(v_1float20, v_2float20, v_dst20);
+                v_dst21 = v_fma(v_1float21, v_2float21, v_dst21);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
              }
          }
      }
-
+#endif // CV_SIMD
      accProd_general_(src1, src2, dst, mask, len, cn, x);
  }
  
  void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for (; x <= size - 8 ; x += 8)
+        {
+            __m256 v_src0 = _mm256_loadu_ps(src1 + x);
+            __m256 v_src1 = _mm256_loadu_ps(src2 + x);
+            __m256 v_dst = _mm256_loadu_ps(dst + x);
+            __m256 v_src = _mm256_mul_ps(v_src0, v_src1);
+            v_dst = _mm256_add_ps(v_src, v_dst);
+            _mm256_storeu_ps(dst + x, v_dst);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_store(dst + x, v_load(dst + x) + v_load(src1 + x) * v_load(src2 + x));
-            v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src1 + x + 4) * v_load(src2 + x + 4));
+            v_store(dst + x, v_fma(vx_load(src1 + x), vx_load(src2 + x), vx_load(dst + x)));
+            v_store(dst + x + step, v_fma(vx_load(src1 + x + step), vx_load(src2 + x + step), vx_load(dst + x + step)));
          }
+        #endif // CV_AVX && !CV_AVX2
      }
      else
      {
-        v_uint32x4 v_0 = v_setzero_u32();
+        v_uint32 v_0 = vx_setzero_u32();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x);
-                v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4);
-                v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
-                v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
+                v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
+                v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
+                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
+                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
  
-                v_store(dst + x, v_load(dst + x) + ((v_load(src1 + x) * v_load(src2 + x)) & v_mask0));
-                v_store(dst + x + 4, v_load(dst + x + 4) + ((v_load(src1 + x + 4) * v_load(src2 + x + 4)) & v_mask1));
+                v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0));
+                v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1));
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x);
-                v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4);
-                v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
-                v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
+                v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
+                v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
+                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
+                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
  
-                v_float32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
-                v_float32x4 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
+                v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
+                v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                  v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);
                  v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
-                v_load_deinterleave(src1 + (x + 4) * cn, v_1src01, v_1src11, v_1src21);
-                v_load_deinterleave(src2 + (x + 4) * cn, v_2src01, v_2src11, v_2src21);
+                v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
+                v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
  
-                v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
                  v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0));
-                v_store_interleave(dst + (x + 4) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1));
+                v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1));
              }
          }
      }
-
+#endif // CV_SIMD
      accProd_general_(src1, src2, dst, mask, len, cn, x);
  }
-#if CV_SIMD128_64F
+
  void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_1int  = v_load_expand(src1 + x);
-            v_uint16x8 v_2int  = v_load_expand(src2 + x);
+            v_uint16 v_1int  = vx_load_expand(src1 + x);
+            v_uint16 v_2int  = vx_load_expand(src2 + x);
  
-            v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
+            v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
              v_expand(v_1int, v_1int_0, v_1int_1);
              v_expand(v_2int, v_2int_0, v_2int_1);
  
-            v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
-            v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
-            v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
-            v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
+            v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
+            v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
+            v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
+            v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
  
-            v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
-            v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
-            v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
-            v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
+            v_float64 v_dst0 = vx_load(dst + x);
+            v_float64 v_dst1 = vx_load(dst + x + step);
+            v_float64 v_dst2 = vx_load(dst + x + step * 2);
+            v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
-            v_float64x2 v_dst0 = v_load(dst + x);
-            v_float64x2 v_dst1 = v_load(dst + x + 2);
-            v_float64x2 v_dst2 = v_load(dst + x + 4);
-            v_float64x2 v_dst3 = v_load(dst + x + 6);
-
-            v_dst0 += v_src0;
-            v_dst1 += v_src1;
-            v_dst2 += v_src2;
-            v_dst3 += v_src3;
+            v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
+            v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
+            v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
+            v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
  
              v_store(dst + x, v_dst0);
-            v_store(dst + x + 2, v_dst1);
-            v_store(dst + x + 4, v_dst2);
-            v_store(dst + x + 6, v_dst3);
+            v_store(dst + x + step, v_dst1);
+            v_store(dst + x + step * 2, v_dst2);
+            v_store(dst + x + step * 3, v_dst3);
          }
      }
      else
      {
-        v_uint16x8 v_0 = v_setzero_u16();
+        v_uint16 v_0 = vx_setzero_u16();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_1int = v_load_expand(src1 + x) & v_mask;
-                v_uint16x8 v_2int = v_load_expand(src2 + x) & v_mask;
+                v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask;
+                v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask;
  
-                v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
+                v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                  v_expand(v_1int, v_1int_0, v_1int_1);
                  v_expand(v_2int, v_2int_0, v_2int_1);
  
-                v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
-                v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
-                v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
-                v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
-
-                v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
-                v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
-                v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
-                v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
+                v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
+                v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
+                v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
+                v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
  
-                v_float64x2 v_dst0 = v_load(dst + x);
-                v_float64x2 v_dst1 = v_load(dst + x + 2);
-                v_float64x2 v_dst2 = v_load(dst + x + 4);
-                v_float64x2 v_dst3 = v_load(dst + x + 6);
+                v_float64 v_dst0 = vx_load(dst + x);
+                v_float64 v_dst1 = vx_load(dst + x + step);
+                v_float64 v_dst2 = vx_load(dst + x + step * 2);
+                v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
-                v_dst0 += v_src0;
-                v_dst1 += v_src1;
-                v_dst2 += v_src2;
-                v_dst3 += v_src3;
+                v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
+                v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
+                v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
+                v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
  
                  v_store(dst + x, v_dst0);
-                v_store(dst + x + 2, v_dst1);
-                v_store(dst + x + 4, v_dst2);
-                v_store(dst + x + 6, v_dst3);
+                v_store(dst + x + step, v_dst1);
+                v_store(dst + x + step * 2, v_dst2);
+                v_store(dst + x + step * 3, v_dst3);
              }
          }
          else if (cn == 3)
          {
-            for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth)
+            for (; x <= len - cVectorWidth * 2; x += cVectorWidth)
              {
-                v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
+                v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                  v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                  v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
  
-                v_uint16x8 v_1int0, v_1int1, v_1int2, v_2int0, v_2int1, v_2int2, dummy;
-                v_expand(v_1src0, v_1int0, dummy);
-                v_expand(v_1src1, v_1int1, dummy);
-                v_expand(v_1src2, v_1int2, dummy);
-                v_expand(v_2src0, v_2int0, dummy);
-                v_expand(v_2src1, v_2int1, dummy);
-                v_expand(v_2src2, v_2int2, dummy);
+                v_uint16 v_1int0 = v_expand_low(v_1src0);
+                v_uint16 v_1int1 = v_expand_low(v_1src1);
+                v_uint16 v_1int2 = v_expand_low(v_1src2);
+                v_uint16 v_2int0 = v_expand_low(v_2src0);
+                v_uint16 v_2int1 = v_expand_low(v_2src1);
+                v_uint16 v_2int2 = v_expand_low(v_2src2);
  
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
                  v_1int0 = v_1int0 & v_mask;
                  v_1int1 = v_1int1 & v_mask;
@@ -2175,8 +2206,8 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
                  v_2int1 = v_2int1 & v_mask;
                  v_2int2 = v_2int2 & v_mask;
  
-                v_uint32x4 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
-                v_uint32x4 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
+                v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
+                v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
                  v_expand(v_1int0, v_1int00, v_1int01);
                  v_expand(v_1int1, v_1int10, v_1int11);
                  v_expand(v_1int2, v_1int20, v_1int21);
@@ -2184,128 +2215,122 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
                  v_expand(v_2int1, v_2int10, v_2int11);
                  v_expand(v_2int2, v_2int20, v_2int21);
  
-                v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_1int00)) * v_cvt_f64(v_reinterpret_as_s32(v_2int00));
-                v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int00));
-                v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_1int01)) * v_cvt_f64(v_reinterpret_as_s32(v_2int01));
-                v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int01));
-                v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_1int10)) * v_cvt_f64(v_reinterpret_as_s32(v_2int10));
-                v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int10));
-                v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_1int11)) * v_cvt_f64(v_reinterpret_as_s32(v_2int11));
-                v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int11));
-                v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_1int20)) * v_cvt_f64(v_reinterpret_as_s32(v_2int20));
-                v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int20));
-                v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_1int21)) * v_cvt_f64(v_reinterpret_as_s32(v_2int21));
-                v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int21));
-
-                v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
+                v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22);
-                v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23);
-
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
-                v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
-                v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
+                v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
+
+                v_dst00 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int00)), v_cvt_f64(v_reinterpret_as_s32(v_2int00)), v_dst00);
+                v_dst01 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)), v_dst01);
+                v_dst02 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int01)), v_cvt_f64(v_reinterpret_as_s32(v_2int01)), v_dst02);
+                v_dst03 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)), v_dst03);
+                v_dst10 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int10)), v_cvt_f64(v_reinterpret_as_s32(v_2int10)), v_dst10);
+                v_dst11 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)), v_dst11);
+                v_dst12 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int11)), v_cvt_f64(v_reinterpret_as_s32(v_2int11)), v_dst12);
+                v_dst13 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)), v_dst13);
+                v_dst20 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int20)), v_cvt_f64(v_reinterpret_as_s32(v_2int20)), v_dst20);
+                v_dst21 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)), v_dst21);
+                v_dst22 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int21)), v_cvt_f64(v_reinterpret_as_s32(v_2int21)), v_dst22);
+                v_dst23 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)), v_dst23);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
+                v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      accProd_general_(src1, src2, dst, mask, len, cn, x);
  }
  
  void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 8;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_1src  = v_load(src1 + x);
-            v_uint16x8 v_2src  = v_load(src2 + x);
+            v_uint16 v_1src  = vx_load(src1 + x);
+            v_uint16 v_2src  = vx_load(src2 + x);
  
-            v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
+            v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
              v_expand(v_1src, v_1int_0, v_1int_1);
              v_expand(v_2src, v_2int_0, v_2int_1);
  
-            v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
-            v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
-            v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
-            v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
+            v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
+            v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
+            v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
+            v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
  
-            v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
-            v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
-            v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
-            v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
+            v_float64 v_dst0 = vx_load(dst + x);
+            v_float64 v_dst1 = vx_load(dst + x + step);
+            v_float64 v_dst2 = vx_load(dst + x + step * 2);
+            v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
-            v_float64x2 v_dst0 = v_load(dst + x);
-            v_float64x2 v_dst1 = v_load(dst + x + 2);
-            v_float64x2 v_dst2 = v_load(dst + x + 4);
-            v_float64x2 v_dst3 = v_load(dst + x + 6);
+            v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
+            v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
+            v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
+            v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
  
-            v_dst0 = v_dst0 + v_src0;
-            v_dst1 = v_dst1 + v_src1;
-            v_dst2 = v_dst2 + v_src2;
-            v_dst3 = v_dst3 + v_src3;
              v_store(dst + x, v_dst0);
-            v_store(dst + x + 2, v_dst1);
-            v_store(dst + x + 4, v_dst2);
-            v_store(dst + x + 6, v_dst3);
+            v_store(dst + x + step, v_dst1);
+            v_store(dst + x + step * 2, v_dst2);
+            v_store(dst + x + step * 3, v_dst3);
          }
      }
      else
      {
-        v_uint16x8 v_0 = v_setzero_u16();
+        v_uint16 v_0 = vx_setzero_u16();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_1src = v_load(src1 + x);
-                v_uint16x8 v_2src = v_load(src2 + x);
+                v_uint16 v_1src = vx_load(src1 + x);
+                v_uint16 v_2src = vx_load(src2 + x);
                  v_1src = v_1src & v_mask;
                  v_2src = v_2src & v_mask;
  
-                v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
+                v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                  v_expand(v_1src, v_1int_0, v_1int_1);
                  v_expand(v_2src, v_2int_0, v_2int_1);
  
-                v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0);
-                v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1);
-                v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0);
-                v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1);
+                v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
+                v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
+                v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
+                v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
  
-                v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0);
-                v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0);
-                v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1);
-                v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1);
+                v_float64 v_dst0 = vx_load(dst + x);
+                v_float64 v_dst1 = vx_load(dst + x + step);
+                v_float64 v_dst2 = vx_load(dst + x + step * 2);
+                v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
-                v_float64x2 v_dst0 = v_load(dst + x);
-                v_float64x2 v_dst1 = v_load(dst + x + 2);
-                v_float64x2 v_dst2 = v_load(dst + x + 4);
-                v_float64x2 v_dst3 = v_load(dst + x + 6);
+                v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
+                v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
+                v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
+                v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
  
-                v_dst0 = v_dst0 + v_src0;
-                v_dst1 = v_dst1 + v_src1;
-                v_dst2 = v_dst2 + v_src2;
-                v_dst3 = v_dst3 + v_src3;
                  v_store(dst + x, v_dst0);
-                v_store(dst + x + 2, v_dst1);
-                v_store(dst + x + 4, v_dst2);
-                v_store(dst + x + 6, v_dst3);
+                v_store(dst + x + step, v_dst1);
+                v_store(dst + x + step * 2, v_dst2);
+                v_store(dst + x + step * 3, v_dst3);
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint16x8 v_mask = v_load_expand(mask + x);
+                v_uint16 v_mask = vx_load_expand(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_uint16x8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
+                v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                  v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                  v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
                  v_1src0 = v_1src0 & v_mask;
@@ -2315,9 +2340,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc
                  v_2src1 = v_2src1 & v_mask;
                  v_2src2 = v_2src2 & v_mask;
  
-                v_uint32x4 v_1int_00, v_1int_01, v_2int_00, v_2int_01;
-                v_uint32x4 v_1int_10, v_1int_11, v_2int_10, v_2int_11;
-                v_uint32x4 v_1int_20, v_1int_21, v_2int_20, v_2int_21;
+                v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01;
+                v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11;
+                v_uint32 v_1int_20, v_1int_21, v_2int_20, v_2int_21;
                  v_expand(v_1src0, v_1int_00, v_1int_01);
                  v_expand(v_1src1, v_1int_10, v_1int_11);
                  v_expand(v_1src2, v_1int_20, v_1int_21);
@@ -2325,103 +2350,125 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc
                  v_expand(v_2src1, v_2int_10, v_2int_11);
                  v_expand(v_2src2, v_2int_20, v_2int_21);
  
-                v_int32x4 v_1int00 = v_reinterpret_as_s32(v_1int_00);
-                v_int32x4 v_1int01 = v_reinterpret_as_s32(v_1int_01);
-                v_int32x4 v_1int10 = v_reinterpret_as_s32(v_1int_10);
-                v_int32x4 v_1int11 = v_reinterpret_as_s32(v_1int_11);
-                v_int32x4 v_1int20 = v_reinterpret_as_s32(v_1int_20);
-                v_int32x4 v_1int21 = v_reinterpret_as_s32(v_1int_21);
-                v_int32x4 v_2int00 = v_reinterpret_as_s32(v_2int_00);
-                v_int32x4 v_2int01 = v_reinterpret_as_s32(v_2int_01);
-                v_int32x4 v_2int10 = v_reinterpret_as_s32(v_2int_10);
-                v_int32x4 v_2int11 = v_reinterpret_as_s32(v_2int_11);
-                v_int32x4 v_2int20 = v_reinterpret_as_s32(v_2int_20);
-                v_int32x4 v_2int21 = v_reinterpret_as_s32(v_2int_21);
-
-                v_float64x2 v_src00 = v_cvt_f64(v_1int00) * v_cvt_f64(v_2int00);
-                v_float64x2 v_src01 = v_cvt_f64_high(v_1int00) * v_cvt_f64_high(v_2int00);
-                v_float64x2 v_src02 = v_cvt_f64(v_1int01) * v_cvt_f64(v_2int01);
-                v_float64x2 v_src03 = v_cvt_f64_high(v_1int01) * v_cvt_f64_high(v_2int01);
-                v_float64x2 v_src10 = v_cvt_f64(v_1int10) * v_cvt_f64(v_2int10);
-                v_float64x2 v_src11 = v_cvt_f64_high(v_1int10) * v_cvt_f64_high(v_2int10);
-                v_float64x2 v_src12 = v_cvt_f64(v_1int11) * v_cvt_f64(v_2int11);
-                v_float64x2 v_src13 = v_cvt_f64_high(v_1int11) * v_cvt_f64_high(v_2int11);
-                v_float64x2 v_src20 = v_cvt_f64(v_1int20) * v_cvt_f64(v_2int20);
-                v_float64x2 v_src21 = v_cvt_f64_high(v_1int20) * v_cvt_f64_high(v_2int20);
-                v_float64x2 v_src22 = v_cvt_f64(v_1int21) * v_cvt_f64(v_2int21);
-                v_float64x2 v_src23 = v_cvt_f64_high(v_1int21) * v_cvt_f64_high(v_2int21);
-
-                v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03;
-                v_float64x2 v_dst10, v_dst11, v_dst12, v_dst13;
-                v_float64x2 v_dst20, v_dst21, v_dst22, v_dst23;
+                v_int32 v_1int00 = v_reinterpret_as_s32(v_1int_00);
+                v_int32 v_1int01 = v_reinterpret_as_s32(v_1int_01);
+                v_int32 v_1int10 = v_reinterpret_as_s32(v_1int_10);
+                v_int32 v_1int11 = v_reinterpret_as_s32(v_1int_11);
+                v_int32 v_1int20 = v_reinterpret_as_s32(v_1int_20);
+                v_int32 v_1int21 = v_reinterpret_as_s32(v_1int_21);
+                v_int32 v_2int00 = v_reinterpret_as_s32(v_2int_00);
+                v_int32 v_2int01 = v_reinterpret_as_s32(v_2int_01);
+                v_int32 v_2int10 = v_reinterpret_as_s32(v_2int_10);
+                v_int32 v_2int11 = v_reinterpret_as_s32(v_2int_11);
+                v_int32 v_2int20 = v_reinterpret_as_s32(v_2int_20);
+                v_int32 v_2int21 = v_reinterpret_as_s32(v_2int_21);
+
+                v_float64 v_dst00, v_dst01, v_dst02, v_dst03;
+                v_float64 v_dst10, v_dst11, v_dst12, v_dst13;
+                v_float64 v_dst20, v_dst21, v_dst22, v_dst23;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
-                v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22);
-                v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23);
-
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
-                v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
-                v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
+                v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
+
+                v_dst00 = v_fma(v_cvt_f64(v_1int00), v_cvt_f64(v_2int00), v_dst00);
+                v_dst01 = v_fma(v_cvt_f64_high(v_1int00), v_cvt_f64_high(v_2int00), v_dst01);
+                v_dst02 = v_fma(v_cvt_f64(v_1int01), v_cvt_f64(v_2int01), v_dst02);
+                v_dst03 = v_fma(v_cvt_f64_high(v_1int01), v_cvt_f64_high(v_2int01), v_dst03);
+                v_dst10 = v_fma(v_cvt_f64(v_1int10), v_cvt_f64(v_2int10), v_dst10);
+                v_dst11 = v_fma(v_cvt_f64_high(v_1int10), v_cvt_f64_high(v_2int10), v_dst11);
+                v_dst12 = v_fma(v_cvt_f64(v_1int11), v_cvt_f64(v_2int11), v_dst12);
+                v_dst13 = v_fma(v_cvt_f64_high(v_1int11), v_cvt_f64_high(v_2int11), v_dst13);
+                v_dst20 = v_fma(v_cvt_f64(v_1int20), v_cvt_f64(v_2int20), v_dst20);
+                v_dst21 = v_fma(v_cvt_f64_high(v_1int20), v_cvt_f64_high(v_2int20), v_dst21);
+                v_dst22 = v_fma(v_cvt_f64(v_1int21), v_cvt_f64(v_2int21), v_dst22);
+                v_dst23 = v_fma(v_cvt_f64_high(v_1int21), v_cvt_f64_high(v_2int21), v_dst23);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
+                v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      accProd_general_(src1, src2, dst, mask, len, cn, x);
  }
  
  void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 4;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_float32::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for ( ; x <= size - 8 ; x += 8)
+        {
+            __m256 v_1src = _mm256_loadu_ps(src1 + x);
+            __m256 v_2src = _mm256_loadu_ps(src2 + x);
+            __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0));
+            __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1));
+            __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0));
+            __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1));
+            __m256d v_dst0 = _mm256_loadu_pd(dst + x);
+            __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
+            __m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
+            __m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
+            v_dst0 = _mm256_add_pd(v_src0, v_dst0);
+            v_dst1 = _mm256_add_pd(v_src1, v_dst1);
+            _mm256_storeu_pd(dst + x, v_dst0);
+            _mm256_storeu_pd(dst + x + 4, v_dst1);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_float32x4 v_1src  = v_load(src1 + x);
-            v_float32x4 v_2src  = v_load(src2 + x);
+            v_float32 v_1src  = vx_load(src1 + x);
+            v_float32 v_2src  = vx_load(src2 + x);
  
-            v_float64x2 v_1src0 = v_cvt_f64(v_1src);
-            v_float64x2 v_1src1 = v_cvt_f64_high(v_1src);
-            v_float64x2 v_2src0 = v_cvt_f64(v_2src);
-            v_float64x2 v_2src1 = v_cvt_f64_high(v_2src);
+            v_float64 v_1src0 = v_cvt_f64(v_1src);
+            v_float64 v_1src1 = v_cvt_f64_high(v_1src);
+            v_float64 v_2src0 = v_cvt_f64(v_2src);
+            v_float64 v_2src1 = v_cvt_f64_high(v_2src);
  
-            v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0));
-            v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1));
+            v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x)));
+            v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step)));
          }
+        #endif // CV_AVX && !CV_AVX2
      }
      else
      {
-        v_uint32x4 v_0 = v_setzero_u32();
+        v_uint32 v_0 = vx_setzero_u32();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask = v_load_expand_q(mask + x);
+                v_uint32 v_mask = vx_load_expand_q(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_float32x4 v_1src = v_load(src1 + x);
-                v_float32x4 v_2src = v_load(src2 + x);
+                v_float32 v_1src = vx_load(src1 + x);
+                v_float32 v_2src = vx_load(src2 + x);
                  v_1src = v_1src & v_reinterpret_as_f32(v_mask);
                  v_2src = v_2src & v_reinterpret_as_f32(v_mask);
  
-                v_float64x2 v_1src0 = v_cvt_f64(v_1src);
-                v_float64x2 v_1src1 = v_cvt_f64_high(v_1src);
-                v_float64x2 v_2src0 = v_cvt_f64(v_2src);
-                v_float64x2 v_2src1 = v_cvt_f64_high(v_2src);
+                v_float64 v_1src0 = v_cvt_f64(v_1src);
+                v_float64 v_1src1 = v_cvt_f64_high(v_1src);
+                v_float64 v_2src0 = v_cvt_f64(v_2src);
+                v_float64 v_2src1 = v_cvt_f64_high(v_2src);
  
-                v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0));
-                v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1));
+                v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x)));
+                v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step)));
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask = v_load_expand_q(mask + x);
+                v_uint32 v_mask = vx_load_expand_q(mask + x);
                  v_mask = ~(v_mask == v_0);
-                v_float32x4 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
+                v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                  v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                  v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
                  v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask);
@@ -2431,647 +2478,332 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
                  v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask);
                  v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask);
  
-                v_float64x2 v_src00 = v_cvt_f64(v_1src0) * v_cvt_f64(v_2src0);
-                v_float64x2 v_src01 = v_cvt_f64_high(v_1src0) * v_cvt_f64_high(v_2src0);
-                v_float64x2 v_src10 = v_cvt_f64(v_1src1) * v_cvt_f64(v_2src1);
-                v_float64x2 v_src11 = v_cvt_f64_high(v_1src1) * v_cvt_f64_high(v_2src1);
-                v_float64x2 v_src20 = v_cvt_f64(v_1src2) * v_cvt_f64(v_2src2);
-                v_float64x2 v_src21 = v_cvt_f64_high(v_1src2) * v_cvt_f64_high(v_2src2);
-
-                v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_dst00 = v_fma(v_cvt_f64(v_1src0), v_cvt_f64(v_2src0), v_dst00);
+                v_dst01 = v_fma(v_cvt_f64_high(v_1src0), v_cvt_f64_high(v_2src0), v_dst01);
+                v_dst10 = v_fma(v_cvt_f64(v_1src1), v_cvt_f64(v_2src1), v_dst10);
+                v_dst11 = v_fma(v_cvt_f64_high(v_1src1), v_cvt_f64_high(v_2src1), v_dst11);
+                v_dst20 = v_fma(v_cvt_f64(v_1src2), v_cvt_f64(v_2src2), v_dst20);
+                v_dst21 = v_fma(v_cvt_f64_high(v_1src2), v_cvt_f64_high(v_2src2), v_dst21);
+
+                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      accProd_general_(src1, src2, dst, mask, len, cn, x);
  }
  
  void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
  {
      int x = 0;
-    const int cVectorWidth = 4;
+#if CV_SIMD_64F
+    const int cVectorWidth = v_float64::nlanes * 2;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
+        #if CV_AVX && !CV_AVX2
+        for ( ; x <= size - 4 ; x += 4)
+        {
+            __m256d v_src0 = _mm256_loadu_pd(src1 + x);
+            __m256d v_src1 = _mm256_loadu_pd(src2 + x);
+            __m256d v_dst = _mm256_loadu_pd(dst + x);
+            v_src0 = _mm256_mul_pd(v_src0, v_src1);
+            v_dst = _mm256_add_pd(v_dst, v_src0);
+            _mm256_storeu_pd(dst + x, v_dst);
+        }
+        #else
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_float64x2 v_src00 = v_load(src1 + x);
-            v_float64x2 v_src01 = v_load(src1 + x + 2);
-            v_float64x2 v_src10 = v_load(src2 + x);
-            v_float64x2 v_src11 = v_load(src2 + x + 2);
+            v_float64 v_src00 = vx_load(src1 + x);
+            v_float64 v_src01 = vx_load(src1 + x + step);
+            v_float64 v_src10 = vx_load(src2 + x);
+            v_float64 v_src11 = vx_load(src2 + x + step);
  
-            v_store(dst + x, v_load(dst + x) + (v_src00 * v_src10));
-            v_store(dst + x + 2, v_load(dst + x + 2) + (v_src01 * v_src11));
+            v_store(dst + x, v_fma(v_src00, v_src10, vx_load(dst + x)));
+            v_store(dst + x + step, v_fma(v_src01, v_src11, vx_load(dst + x + step)));
          }
+        #endif
      }
      else
      {
-        v_uint64x2 v_0 = v_setzero_u64();
+        // todo: try fma
+        v_uint64 v_0 = vx_setzero_u64();
          if (cn == 1)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
-                v_uint64x2 v_masku640, v_masku641;
+                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
+                v_uint64 v_masku640, v_masku641;
                  v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
+                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
  
-                v_float64x2 v_src00 = v_load(src1 + x);
-                v_float64x2 v_src01 = v_load(src1 + x + 2);
-                v_float64x2 v_src10 = v_load(src2 + x);
-                v_float64x2 v_src11 = v_load(src2 + x + 2);
+                v_float64 v_src00 = vx_load(src1 + x);
+                v_float64 v_src01 = vx_load(src1 + x + step);
+                v_float64 v_src10 = vx_load(src2 + x);
+                v_float64 v_src11 = vx_load(src2 + x + step);
  
-                v_store(dst + x, v_load(dst + x) + ((v_src00 * v_src10) & v_mask0));
-                v_store(dst + x + 2, v_load(dst + x + 2) + ((v_src01 * v_src11) & v_mask1));
+                v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0));
+                v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1));
              }
          }
          else if (cn == 3)
          {
              for (; x <= len - cVectorWidth; x += cVectorWidth)
              {
-                v_uint32x4 v_mask32 = v_load_expand_q(mask + x);
-                v_uint64x2 v_masku640, v_masku641;
+                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
+                v_uint64 v_masku640, v_masku641;
                  v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
+                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
  
-                v_float64x2 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
-                v_float64x2 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
+                v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
+                v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                  v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);
-                v_load_deinterleave(src1 + (x + 2) * cn, v_1src01, v_1src11, v_1src21);
+                v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
                  v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
-                v_load_deinterleave(src2 + (x + 2) * cn, v_2src01, v_2src11, v_2src21);
-                v_float64x2 v_src00 = (v_1src00 & v_mask0) * v_2src00;
-                v_float64x2 v_src01 = (v_1src01 & v_mask1) * v_2src01;
-                v_float64x2 v_src10 = (v_1src10 & v_mask0) * v_2src10;
-                v_float64x2 v_src11 = (v_1src11 & v_mask1) * v_2src11;
-                v_float64x2 v_src20 = (v_1src20 & v_mask0) * v_2src20;
-                v_float64x2 v_src21 = (v_1src21 & v_mask1) * v_2src21;
-
-                v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
+                v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00;
+                v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01;
+                v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10;
+                v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11;
+                v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20;
+                v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21;
+
+                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                  v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
-                v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
  
                  v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
              }
          }
      }
-
+#endif // CV_SIMD_64F
      accProd_general_(src1, src2, dst, mask, len, cn, x);
  }
-#else
-void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
-{
-    accProd_general_(src1, src2, dst, mask, len, cn, 0);
-}
-
-void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn)
-{
-    accProd_general_(src1, src2, dst, mask, len, cn, 0);
-}
-
-void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
-{
-    accProd_general_(src1, src2, dst, mask, len, cn, 0);
-}
-
-void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
-{
-    accProd_general_(src1, src2, dst, mask, len, cn, 0);
-}
-#endif
  
  // running weight accumulate optimized by universal intrinsic
  void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha)
  {
      int x = 0;
-    const v_float32x4 v_alpha = v_setall_f32((float)alpha);
-    const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha));
-    const int cVectorWidth = 16;
+#if CV_SIMD
+    const v_float32 v_alpha = vx_setall_f32((float)alpha);
+    const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
+    const int cVectorWidth = v_uint8::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint8x16 v_src = v_load(src + x);
+            v_uint8 v_src = vx_load(src + x);
  
-            v_uint16x8 v_src0, v_src1;
+            v_uint16 v_src0, v_src1;
              v_expand(v_src, v_src0, v_src1);
  
-            v_uint32x4 v_src00, v_src01, v_src10, v_src11;
+            v_uint32 v_src00, v_src01, v_src10, v_src11;
              v_expand(v_src0, v_src00, v_src01);
              v_expand(v_src1, v_src10, v_src11);
  
-            v_float32x4 v_dst00 = v_load(dst + x);
-            v_float32x4 v_dst01 = v_load(dst + x + 4);
-            v_float32x4 v_dst10 = v_load(dst + x + 8);
-            v_float32x4 v_dst11 = v_load(dst + x + 12);
+            v_float32 v_dst00 = vx_load(dst + x);
+            v_float32 v_dst01 = vx_load(dst + x + step);
+            v_float32 v_dst10 = vx_load(dst + x + step * 2);
+            v_float32 v_dst11 = vx_load(dst + x + step * 3);
  
-            v_dst00 = (v_dst00 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha);
-            v_dst01 = (v_dst01 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha);
-            v_dst10 = (v_dst10 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
-            v_dst11 = (v_dst11 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);
+            v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha);
+            v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha);
+            v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
+            v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);
  
              v_store(dst + x, v_dst00);
-            v_store(dst + x + 4, v_dst01);
-            v_store(dst + x + 8, v_dst10);
-            v_store(dst + x + 12, v_dst11);
+            v_store(dst + x + step, v_dst01);
+            v_store(dst + x + step * 2, v_dst10);
+            v_store(dst + x + step * 3, v_dst11);
          }
      }
-
+#endif // CV_SIMD
      accW_general_(src, dst, mask, len, cn, alpha, x);
  }
  
  void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha)
  {
      int x = 0;
-    const v_float32x4 v_alpha = v_setall_f32((float)alpha);
-    const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha));
-    const int cVectorWidth = 8;
+#if CV_SIMD
+    const v_float32 v_alpha = vx_setall_f32((float)alpha);
+    const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_src = v_load(src + x);
-            v_uint32x4 v_int0, v_int1;
+            v_uint16 v_src = vx_load(src + x);
+            v_uint32 v_int0, v_int1;
              v_expand(v_src, v_int0, v_int1);
  
-            v_float32x4 v_src0 = v_cvt_f32(v_reinterpret_as_s32(v_int0));
-            v_float32x4 v_src1 = v_cvt_f32(v_reinterpret_as_s32(v_int1));
-            v_src0 = v_src0 * v_alpha;
-            v_src1 = v_src1 * v_alpha;
+            v_float32 v_dst0 = vx_load(dst + x);
+            v_float32 v_dst1 = vx_load(dst + x + step);
+            v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha);
+            v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha);
  
-            v_float32x4 v_dst0 = v_load(dst + x) * v_beta;
-            v_float32x4 v_dst1 = v_load(dst + x + 4) * v_beta;
-
-            v_store(dst + x, v_dst0 + v_src0);
-            v_store(dst + x + 4, v_dst1 + v_src1);
+            v_store(dst + x, v_dst0);
+            v_store(dst + x + step, v_dst1);
          }
      }
-
+#endif // CV_SIMD
      accW_general_(src, dst, mask, len, cn, alpha, x);
  }
  
  void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha)
  {
      int x = 0;
-    const v_float32x4 v_alpha = v_setall_f32((float)alpha);
-    const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha));
-    const int cVectorWidth = 8;
+#if CV_AVX && !CV_AVX2
+    const __m256 v_alpha = _mm256_set1_ps((float)alpha);
+    const __m256 v_beta = _mm256_set1_ps((float)(1.0f - alpha));
+    const int cVectorWidth = 16;
  
      if (!mask)
      {
          int size = len * cn;
-        for (; x <= size - cVectorWidth; x += cVectorWidth)
+        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
          {
-            v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_load(src + x) * v_alpha)));
-            v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_load(src + x + 4) * v_alpha)));
-        }
+            _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
+            _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
+        }
      }
-
-    accW_general_(src, dst, mask, len, cn, alpha, x);
-}
-#if CV_SIMD128_64F
-void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha)
-{
-    int x = 0;
-    const v_float64x2 v_alpha = v_setall_f64(alpha);
-    const v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
-    const int cVectorWidth = 8;
+#elif CV_SIMD
+    const v_float32 v_alpha = vx_setall_f32((float)alpha);
+    const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float32::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_src16 = v_load_expand(src + x);
+            v_float32 v_dst0 = vx_load(dst + x);
+            v_float32 v_dst1 = vx_load(dst + x + step);
  
-            v_uint32x4 v_int_0, v_int_1;
-            v_expand(v_src16, v_int_0, v_int_1);
-
-            v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
-            v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
-
-            v_float64x2 v_src0 = v_cvt_f64(v_int0);
-            v_float64x2 v_src1 = v_cvt_f64_high(v_int0);
-            v_float64x2 v_src2 = v_cvt_f64(v_int1);
-            v_float64x2 v_src3 = v_cvt_f64_high(v_int1);
-
-            v_float64x2 v_dst0 = v_load(dst + x);
-            v_float64x2 v_dst1 = v_load(dst + x + 2);
-            v_float64x2 v_dst2 = v_load(dst + x + 4);
-            v_float64x2 v_dst3 = v_load(dst + x + 6);
-
-            v_dst0 = (v_dst0 * v_beta) + (v_src0 * v_alpha);
-            v_dst1 = (v_dst1 * v_beta) + (v_src1 * v_alpha);
-            v_dst2 = (v_dst2 * v_beta) + (v_src2 * v_alpha);
-            v_dst3 = (v_dst3 * v_beta) + (v_src3 * v_alpha);
+            v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha);
+            v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha);
  
              v_store(dst + x, v_dst0);
-            v_store(dst + x + 2, v_dst1);
-            v_store(dst + x + 4, v_dst2);
-            v_store(dst + x + 6, v_dst3);
+            v_store(dst + x + step, v_dst1);
          }
      }
-
+#endif // CV_SIMD
      accW_general_(src, dst, mask, len, cn, alpha, x);
  }
  
-void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha)
+void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha)
  {
      int x = 0;
-    const v_float64x2 v_alpha = v_setall_f64(alpha);
-    const v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
-    const int cVectorWidth = 8;
+#if CV_SIMD_64F
+    const v_float64 v_alpha = vx_setall_f64(alpha);
+    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_uint16x8 v_src = v_load(src + x);
-            v_uint32x4 v_int_0, v_int_1;
-            v_expand(v_src, v_int_0, v_int_1);
+            v_uint16 v_src16 = vx_load_expand(src + x);
  
-            v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0);
-            v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1);
+            v_uint32 v_int_0, v_int_1;
+            v_expand(v_src16, v_int_0, v_int_1);
  
-            v_float64x2 v_src00 = v_cvt_f64(v_int0);
-            v_float64x2 v_src01 = v_cvt_f64_high(v_int0);
-            v_float64x2 v_src10 = v_cvt_f64(v_int1);
-            v_float64x2 v_src11 = v_cvt_f64_high(v_int1);
+            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
+            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);
  
-            v_float64x2 v_dst00 = v_load(dst + x);
-            v_float64x2 v_dst01 = v_load(dst + x + 2);
-            v_float64x2 v_dst10 = v_load(dst + x + 4);
-            v_float64x2 v_dst11 = v_load(dst + x + 6);
+            v_float64 v_src0 = v_cvt_f64(v_int0);
+            v_float64 v_src1 = v_cvt_f64_high(v_int0);
+            v_float64 v_src2 = v_cvt_f64(v_int1);
+            v_float64 v_src3 = v_cvt_f64_high(v_int1);
  
-            v_dst00 = (v_dst00 * v_beta) + (v_src00 * v_alpha);
-            v_dst01 = (v_dst01 * v_beta) + (v_src01 * v_alpha);
-            v_dst10 = (v_dst10 * v_beta) + (v_src10 * v_alpha);
-            v_dst11 = (v_dst11 * v_beta) + (v_src11 * v_alpha);
+            v_float64 v_dst0 = vx_load(dst + x);
+            v_float64 v_dst1 = vx_load(dst + x + step);
+            v_float64 v_dst2 = vx_load(dst + x + step * 2);
+            v_float64 v_dst3 = vx_load(dst + x + step * 3);
  
-            v_store(dst + x, v_dst00);
-            v_store(dst + x + 2, v_dst01);
-            v_store(dst + x + 4, v_dst10);
-            v_store(dst + x + 6, v_dst11);
+            v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
+            v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
+            v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha);
+            v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha);
+
+            v_store(dst + x, v_dst0);
+            v_store(dst + x + step, v_dst1);
+            v_store(dst + x + step * 2, v_dst2);
+            v_store(dst + x + step * 3, v_dst3);
          }
      }
-
+#endif // CV_SIMD_64F
      accW_general_(src, dst, mask, len, cn, alpha, x);
  }
  
-void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha)
+void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha)
  {
      int x = 0;
-    const v_float64x2 v_alpha = v_setall_f64(alpha);
-    const v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
-    const int cVectorWidth = 8;
+#if CV_SIMD_64F
+    const v_float64 v_alpha = vx_setall_f64(alpha);
+    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
+    const int cVectorWidth = v_uint16::nlanes;
+    const int step = v_float64::nlanes;
  
      if (!mask)
      {
          int size = len * cn;
          for (; x <= size - cVectorWidth; x += cVectorWidth)
          {
-            v_float32x4 v_src0 = v_load(src + x);
-            v_float32x4 v_src1 = v_load(src + x + 4);
-            v_float64x2 v_src00 = v_cvt_f64(v_src0);
-            v_float64x2 v_src01 = v_cvt_f64_high(v_src0);
-            v_float64x2 v_src10 = v_cvt_f64(v_src1);
-            v_float64x2 v_src11 = v_cvt_f64_high(v_src1);
+            v_uint16 v_src = vx_load(src + x);
+            v_uint32 v_int_0, v_int_1;
+            v_expand(v_src, v_int_0, v_int_1);
  
-            v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src00 * v_alpha)));
-            v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src01 * v_alpha)));
-            v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_src10 * v_alpha)));
-            v_store(dst + x + 6, ((v_load(dst + x + 6) * v_beta) + (v_src11 * v_alpha)));
-        }
-    }
+            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
+            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);
  
-    accW_general_(src, dst, mask, len, cn, alpha, x);
-}
+            v_float64 v_src00 = v_cvt_f64(v_int0);
+            v_float64 v_src01 = v_cvt_f64_high(v_int0);
+            v_float64 v_src10 = v_cvt_f64(v_int1);
+            v_float64 v_src11 = v_cvt_f64_high(v_int1);
  
-void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha)
-{
-    int x = 0;
-    const v_float64x2 v_alpha = v_setall_f64(alpha);
-    const v_float64x2 v_beta = v_setall_f64(1.0f - alpha);
-    const int cVectorWidth = 4;
+            v_float64 v_dst00 = vx_load(dst + x);
+            v_float64 v_dst01 = vx_load(dst + x + step);
+            v_float64 v_dst10 = vx_load(dst + x + step * 2);
+            v_float64 v_dst11 = vx_load(dst + x + step * 3);
  
-    if (!mask)
-    {
-        int size = len * cn;
-        for (; x <= size - cVectorWidth; x += cVectorWidth)
-        {
-            v_float64x2 v_src0 = v_load(src + x);
-            v_float64x2 v_src1 = v_load(src + x + 2);
+            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
+            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
+            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
+            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);
  
-            v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src0 * v_alpha)));
-            v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src1 * v_alpha)));
+            v_store(dst + x, v_dst00);
+            v_store(dst + x + step, v_dst01);
+            v_store(dst + x + step * 2, v_dst10);
+            v_store(dst + x + step * 3, v_dst11);
          }
      }
-
+#endif // CV_SIMD_64F
      accW_general_(src, dst, mask, len, cn, alpha, x);
  }
-#else
-void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha)
-{
-    accW_general_(src, dst, mask, len, cn, alpha, 0);
-}
-
-void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha)
-{
-    accW_general_(src, dst, mask, len, cn, alpha, 0);
-}
  
  void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha)
  {
-    accW_general_(src, dst, mask, len, cn, alpha, 0);
-}
-
-void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha)
-{
-    accW_general_(src, dst, mask, len, cn, alpha, 0);
-}
-#endif // CV_SIMD128_64F
-#endif // CV_SIMD128
-#if CV_AVX
-// accumulate optimized by AVX
-void acc_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 8;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256 v_src = _mm256_loadu_ps(src + x);
-            __m256 v_dst = _mm256_loadu_ps(dst + x);
-            v_dst = _mm256_add_ps(v_src, v_dst);
-            _mm256_storeu_ps(dst + x, v_dst);
-        }
-        acc_general_(src, dst, mask, len, cn, x);
-    }
-    else
-    {
-        acc_simd_(src, dst, mask, len, cn);
-    }
-}
-
-void acc_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 8;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256 v_src = _mm256_loadu_ps(src + x);
-            __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));
-            __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));
-            __m256d v_dst0 = _mm256_loadu_pd(dst + x);
-            __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
-            v_dst0 = _mm256_add_pd(v_src0, v_dst0);
-            v_dst1 = _mm256_add_pd(v_src1, v_dst1);
-            _mm256_storeu_pd(dst + x, v_dst0);
-            _mm256_storeu_pd(dst + x + 4, v_dst1);
-        }
-        acc_general_(src, dst, mask, len, cn, x);
-    }
-    else
-    {
-        acc_simd_(src, dst, mask, len, cn);
-    }
-}
-
-void acc_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 4;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256d v_src = _mm256_loadu_pd(src + x);
-            __m256d v_dst = _mm256_loadu_pd(dst + x);
-            v_dst = _mm256_add_pd(v_dst, v_src);
-            _mm256_storeu_pd(dst + x, v_dst);
-        }
-        acc_general_(src, dst, mask, len, cn, x);
-    }
-    else
-    {
-        acc_simd_(src, dst, mask, len, cn);
-    }
-}
-
-// square accumulate optimized by avx
-void accSqr_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 8;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256 v_src = _mm256_loadu_ps(src + x);
-            __m256 v_dst = _mm256_loadu_ps(dst + x);
-            v_src = _mm256_mul_ps(v_src, v_src);
-            v_dst = _mm256_add_ps(v_src, v_dst);
-            _mm256_storeu_ps(dst + x, v_dst);
-        }
-        accSqr_general_(src, dst, mask, len, cn, x);
-    }
-    else
-    {
-        accSqr_simd_(src, dst, mask, len, cn);
-    }
-}
-
-void accSqr_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 8;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256 v_src = _mm256_loadu_ps(src + x);
-            __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
-            __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
-            __m256d v_dst0 = _mm256_loadu_pd(dst + x);
-            __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
-            v_src0 = _mm256_mul_pd(v_src0, v_src0);
-            v_src1 = _mm256_mul_pd(v_src1, v_src1);
-            v_dst0 = _mm256_add_pd(v_src0, v_dst0);
-            v_dst1 = _mm256_add_pd(v_src1, v_dst1);
-            _mm256_storeu_pd(dst + x, v_dst0);
-            _mm256_storeu_pd(dst + x + 4, v_dst1);
-        }
-        accSqr_general_(src, dst, mask, len, cn, x);
-    }
-    else
-    {
-        accSqr_simd_(src, dst, mask, len, cn);
-    }
-}
-
-void accSqr_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 4;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256d v_src = _mm256_loadu_pd(src + x);
-            __m256d v_dst = _mm256_loadu_pd(dst + x);
-            v_src = _mm256_mul_pd(v_src, v_src);
-            v_dst = _mm256_add_pd(v_dst, v_src);
-            _mm256_storeu_pd(dst + x, v_dst);
-        }
-        accSqr_general_(src, dst, mask, len, cn, x);
-    }
-    else
-    {
-        accSqr_simd_(src, dst, mask, len, cn);
-    }
-}
-
-// product accumulate optimized by avx
-void accProd_avx_32f(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 8;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256 v_src0 = _mm256_loadu_ps(src1 + x);
-            __m256 v_src1 = _mm256_loadu_ps(src2 + x);
-            __m256 v_dst = _mm256_loadu_ps(dst + x);
-            __m256 v_src = _mm256_mul_ps(v_src0, v_src1);
-            v_dst = _mm256_add_ps(v_src, v_dst);
-            _mm256_storeu_ps(dst + x, v_dst);
-        }
-        accProd_general_(src1, src2, dst, mask, len, cn, x);
-    }
-    else
-    {
-        accProd_simd_(src1, src2, dst, mask, len, cn);
-    }
-}
-
-void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 8;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256 v_1src = _mm256_loadu_ps(src1 + x);
-            __m256 v_2src = _mm256_loadu_ps(src2 + x);
-            __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0));
-            __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1));
-            __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0));
-            __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1));
-            __m256d v_dst0 = _mm256_loadu_pd(dst + x);
-            __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
-            __m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
-            __m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
-            v_dst0 = _mm256_add_pd(v_src0, v_dst0);
-            v_dst1 = _mm256_add_pd(v_src1, v_dst1);
-            _mm256_storeu_pd(dst + x, v_dst0);
-            _mm256_storeu_pd(dst + x + 4, v_dst1);
-        }
-        accProd_general_(src1, src2, dst, mask, len, cn, x);
-    }
-    else
-    {
-        accProd_simd_(src1, src2, dst, mask, len, cn);
-    }
-}
-
-void accProd_avx_64f(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
-{
-    int x = 0;
-    const int cVectorWidth = 4;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            __m256d v_src0 = _mm256_loadu_pd(src1 + x);
-            __m256d v_src1 = _mm256_loadu_pd(src2 + x);
-            __m256d v_dst = _mm256_loadu_pd(dst + x);
-            v_src0 = _mm256_mul_pd(v_src0, v_src1);
-            v_dst = _mm256_add_pd(v_dst, v_src0);
-            _mm256_storeu_pd(dst + x, v_dst);
-        }
-        accProd_general_(src1, src2, dst, mask, len, cn, x);
-    }
-    else
-    {
-        accProd_simd_(src1, src2, dst, mask, len, cn);
-    }
-}
-
-// running weight accumulate optimized by avx
-void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha)
-{
-    int x = 0;
-    const __m256 v_alpha = _mm256_set1_ps((float)alpha);
-    const __m256 v_beta = _mm256_set1_ps((float)(1.0f - alpha));
-    const int cVectorWidth = 16;
-
-    if (!mask)
-    {
-        int size = len * cn;
-        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
-        {
-            _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
-            _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
-        }
-        accW_general_(src, dst, mask, len, cn, alpha, x);
-    }
-    else
-    {
-        accW_simd_(src, dst, mask, len, cn, alpha);
-    }
-
-}
-
-void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha)
-{
      int x = 0;
+#if CV_AVX && !CV_AVX2
      const __m256d v_alpha = _mm256_set1_pd(alpha);
      const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
      const int cVectorWidth = 16;
@@ -3093,17 +2825,49 @@ void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len,
              _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha)));
              _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
          }
-        accW_general_(src, dst, mask, len, cn, alpha, x);
      }
-    else
+#elif CV_SIMD_64F
+    const v_float64 v_alpha = vx_setall_f64(alpha);
+    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
+    const int cVectorWidth = v_float32::nlanes * 2;
+    const int step = v_float64::nlanes;
+
+    if (!mask)
      {
-        accW_simd_(src, dst, mask, len, cn, alpha);
+        int size = len * cn;
+        for (; x <= size - cVectorWidth; x += cVectorWidth)
+        {
+            v_float32 v_src0 = vx_load(src + x);
+            v_float32 v_src1 = vx_load(src + x + v_float32::nlanes);
+            v_float64 v_src00 = v_cvt_f64(v_src0);
+            v_float64 v_src01 = v_cvt_f64_high(v_src0);
+            v_float64 v_src10 = v_cvt_f64(v_src1);
+            v_float64 v_src11 = v_cvt_f64_high(v_src1);
+
+            v_float64 v_dst00 = vx_load(dst + x);
+            v_float64 v_dst01 = vx_load(dst + x + step);
+            v_float64 v_dst10 = vx_load(dst + x + step * 2);
+            v_float64 v_dst11 = vx_load(dst + x + step * 3);
+
+            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
+            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
+            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
+            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);
+
+            v_store(dst + x, v_dst00);
+            v_store(dst + x + step, v_dst01);
+            v_store(dst + x + step * 2, v_dst10);
+            v_store(dst + x + step * 3, v_dst11);
+        }
      }
+#endif // CV_SIMD_64F
+    accW_general_(src, dst, mask, len, cn, alpha, x);
  }
  
-void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha)
+void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha)
  {
      int x = 0;
+#if CV_AVX && !CV_AVX2
      const __m256d v_alpha = _mm256_set1_pd(alpha);
      const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
      const int cVectorWidth = 8;
@@ -3119,14 +2883,35 @@ void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, in
              _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha)));
              _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
          }
-        accW_general_(src, dst, mask, len, cn, alpha, x);
      }
-    else
+#elif CV_SIMD_64F
+    const v_float64 v_alpha = vx_setall_f64(alpha);
+    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
+    const int cVectorWidth = v_float64::nlanes * 2;
+    const int step = v_float64::nlanes;
+
+    if (!mask)
      {
-        accW_simd_(src, dst, mask, len, cn, alpha);
+        int size = len * cn;
+        for (; x <= size - cVectorWidth; x += cVectorWidth)
+        {
+            v_float64 v_src0 = vx_load(src + x);
+            v_float64 v_src1 = vx_load(src + x + step);
+
+            v_float64 v_dst0 = vx_load(dst + x);
+            v_float64 v_dst1 = vx_load(dst + x + step);
+
+            v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
+            v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
+
+            v_store(dst + x, v_dst0);
+            v_store(dst + x + step, v_dst1);
+        }
      }
+#endif // CV_SIMD_64F
+    accW_general_(src, dst, mask, len, cn, alpha, x);
  }
-#endif
+
  #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
  
  CV_CPU_OPTIMIZATION_NAMESPACE_END
author	Sayed Adel <seiko@imavr.com>
	Tue, 2 Oct 2018 18:44:03 +0000 (20:44 +0200)
committer	Sayed Adel <seiko@imavr.com>
	Thu, 11 Oct 2018 02:37:12 +0000 (04:37 +0200)
modules/imgproc/CMakeLists.txt		patch \| blob \| history
modules/imgproc/perf/perf_accumulate.cpp		patch \| blob \| history
modules/imgproc/src/accum.simd.hpp		patch \| blob \| history