From: Anna Khakimova
Date: Tue, 9 Nov 2021 09:44:11 +0000 (+0300)
Subject: Fluid: SIMD multiply kernel
X-Git-Tag: accepted/tizen/unified/20230127.161057~1^2~519^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c47673bf10ab620d28de8fe1d1d9b4e264221f0f;p=platform%2Fupstream%2Fopencv.git

Fluid: SIMD multiply kernel
---

diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
index 0ae0210..f5916a6 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -32,7 +32,7 @@ namespace opencv_test
     class SubPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
     class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
     class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
-    class MulPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class MulPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
     class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
     class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
     class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};

diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
index 937d49f..fbbda1a 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -208,19 +208,23 @@ PERF_TEST_P_(SubRCPerfTest, TestPerformance)
 PERF_TEST_P_(MulPerfTest, TestPerformance)
 {
-    Size sz = get<0>(GetParam());
-    MatType type = get<1>(GetParam());
-    int dtype = get<2>(GetParam());
-    cv::GCompileArgs compile_args = get<3>(GetParam());
+    compare_f cmpF;
+    cv::Size sz;
+    MatType type = -1;
+    int dtype = -1;
+    double scale = 1.0;
+    cv::GCompileArgs compile_args;
+
+    std::tie(cmpF, sz, type, dtype, scale, compile_args) = GetParam();

     initMatsRandU(type, sz, dtype, false);

     // OpenCV code ///////////////////////////////////////////////////////////
-    cv::multiply(in_mat1, in_mat2, out_mat_ocv, 1.0, dtype);
+    cv::multiply(in_mat1, in_mat2, out_mat_ocv, scale, dtype);

     // G-API code ////////////////////////////////////////////////////////////
     cv::GMat in1, in2, out;
-    out = cv::gapi::mul(in1, in2, 1.0, dtype);
+    out = cv::gapi::mul(in1, in2, scale, dtype);
     cv::GComputation c(GIn(in1, in2), GOut(out));

     // Warm-up graph engine:
@@ -234,8 +238,9 @@ PERF_TEST_P_(MulPerfTest, TestPerformance)
     }

     // Comparison ////////////////////////////////////////////////////////////
-    // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
-    EXPECT_EQ(out_mat_gapi.size(), sz);
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }

     SANITY_CHECK_NOTHING();
 }

diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
index 51b76fb..09196fd 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@@ -46,9 +46,11 @@ INSTANTIATE_TEST_CASE_P(SubRCPerfTestCPU, SubRCPerfTest,
         Values(cv::compile_args(CORE_CPU))));

 INSTANTIATE_TEST_CASE_P(MulPerfTestCPU, MulPerfTest,
-    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
         Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(2.0),
         Values(cv::compile_args(CORE_CPU))));

 INSTANTIATE_TEST_CASE_P(MulDoublePerfTestCPU, MulDoublePerfTest,
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
index ffb46d1..6c80231 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -42,11 +42,13 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
 //        Values(-1, CV_8U, CV_16U, CV_32F),
 //        Values(cv::compile_args(CORE_FLUID))));

-// INSTANTIATE_TEST_CASE_P(MulPerfTestFluid, MulPerfTest,
-//     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-//         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-//         Values(-1, CV_8U, CV_16U, CV_32F),
-//         Values(cv::compile_args(CORE_FLUID))));
+INSTANTIATE_TEST_CASE_P(MulPerfTestFluid, MulPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
+        Values(2.0),
+        Values(cv::compile_args(CORE_FLUID))));

 // INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest,
 //     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),

diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
index d2269c0..0b260bf 100644
--- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
+++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@@ -44,9 +44,11 @@ INSTANTIATE_TEST_CASE_P(SubRCPerfTestGPU, SubRCPerfTest,
         Values(cv::compile_args(CORE_GPU))));

 INSTANTIATE_TEST_CASE_P(MulPerfTestGPU, MulPerfTest,
-    Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+    Combine(Values(AbsExact().to_compare_f()),
+        Values( szSmall128, szVGA, sz720p, sz1080p ),
         Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
         Values( -1, CV_8U, CV_16U, CV_32F ),
+        Values(2.0),
         Values(cv::compile_args(CORE_GPU))));

 INSTANTIATE_TEST_CASE_P(MulDoublePerfTestGPU, MulDoublePerfTest,
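With the new parameterization, the G-API output is validated against the OpenCV reference through the compare_f functor passed as the first tuple element, replacing the old size-only assertion. A minimal sketch of such a functor, assuming AbsExact().to_compare_f() demands bit-exact results (illustration only; the real helper lives in G-API's test support code):

    #include <functional>
    #include <opencv2/core.hpp>

    using compare_f = std::function<bool(const cv::Mat&, const cv::Mat&)>;

    // Accept the G-API result only if no element differs from the reference.
    inline compare_f abs_exact_compare()
    {
        return [](const cv::Mat& out_gapi, const cv::Mat& out_ocv) {
            return cv::norm(out_gapi, out_ocv, cv::NORM_INF) == 0;
        };
    }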
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index 3e81dfc..d68ae73 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -684,9 +684,14 @@ static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm a
         break;
     }
     case ARITHM_MULTIPLY:
+    {
+#if CV_SIMD
+        x = mul_simd(in1, in2, out, length, scale);
+#endif
         for (; x < length; ++x)
             out[x] = mul<DST>(in1[x], in2[x], _scale);
         break;
+    }
     case ARITHM_DIVIDE:
     {
 #if CV_SIMD
@@ -745,13 +750,22 @@ GAPI_FLUID_KERNEL(GFluidMul, cv::gapi::core::GMul, false)
     static void run(const View &src1, const View &src2, double scale, int /*dtype*/, Buffer &dst)
     {
         //       DST     SRC1    SRC2    OP          __VA_ARGS__
-        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_(uchar , short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_(uchar , float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_( short, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_( float, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_( float, float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar,  uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar,  short,  short,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar,  float,  float,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(short,  short,  short,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(short,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(short,  uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(short,  float,  float,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(ushort, uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(ushort, short,  short,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(ushort, float,  float,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(float,  uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(float,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(float,  short,  short,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(float,  float,  float,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);

         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }

diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index 814c881..297c065 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -56,6 +56,35 @@ DIV_SIMD(float, float)

 #undef DIV_SIMD

+
+#define MUL_SIMD(SRC, DST)                                          \
+int mul_simd(const SRC in1[], const SRC in2[], DST out[],           \
+             const int length, double _scale)                       \
+{                                                                   \
+    CV_CPU_DISPATCH(mul_simd, (in1, in2, out, length, _scale),      \
+                    CV_CPU_DISPATCH_MODES_ALL);                     \
+}
+
+
+MUL_SIMD(uchar, uchar)
+MUL_SIMD(ushort, uchar)
+MUL_SIMD(short, uchar)
+MUL_SIMD(float, uchar)
+MUL_SIMD(short, short)
+MUL_SIMD(ushort, short)
+MUL_SIMD(uchar, short)
+MUL_SIMD(float, short)
+MUL_SIMD(ushort, ushort)
+MUL_SIMD(uchar, ushort)
+MUL_SIMD(short, ushort)
+MUL_SIMD(float, ushort)
+MUL_SIMD(uchar, float)
+MUL_SIMD(ushort, float)
+MUL_SIMD(short, float)
+MUL_SIMD(float, float)
+
+#undef MUL_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv

diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index f66aeeb..3ae41c6 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -37,6 +37,29 @@ DIV_SIMD(float, float)

 #undef DIV_SIMD

+#define MUL_SIMD(SRC, DST)                                          \
+int mul_simd(const SRC in1[], const SRC in2[], DST out[],           \
+             const int length, double _scale);
+
+MUL_SIMD(uchar, uchar)
+MUL_SIMD(ushort, uchar)
+MUL_SIMD(short, uchar)
+MUL_SIMD(float, uchar)
+MUL_SIMD(short, short)
+MUL_SIMD(ushort, short)
+MUL_SIMD(uchar, short)
+MUL_SIMD(float, short)
+MUL_SIMD(ushort, ushort)
+MUL_SIMD(uchar, ushort)
+MUL_SIMD(short, ushort)
+MUL_SIMD(float, ushort)
+MUL_SIMD(uchar, float)
+MUL_SIMD(ushort, float)
+MUL_SIMD(short, float)
+MUL_SIMD(float, float)
+
+#undef MUL_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
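The gfluidcore.cpp hunk above fixes the division of labour: mul_simd returns how many elements it processed (0 when the row is shorter than one vector, or in builds without CV_SIMD), and the scalar loop finishes whatever remains. A self-contained sketch of that contract, with a simplified stub standing in for the dispatched SIMD routine (illustrative names, not OpenCV code; the real kernels also vectorize the tail by overlapping, as shown further below):

    // Stand-in for a dispatched mul_simd: handles as many full 8-lane blocks
    // as fit and reports how many elements it covered.
    static int mul_simd_stub(const float in1[], const float in2[], float out[],
                             int length, double scale)
    {
        const int nlanes = 8;
        if (length < nlanes) return 0;
        int x = 0;
        for (; x <= length - nlanes; x += nlanes)
            for (int i = 0; i < nlanes; ++i)      // imagine one vector op here
                out[x + i] = static_cast<float>(in1[x + i] * in2[x + i] * scale);
        return x;
    }

    static void mul_row(const float in1[], const float in2[], float out[],
                        int length, double scale)
    {
        int x = mul_simd_stub(in1, in2, out, length, scale);
        for (; x < length; ++x)                   // scalar tail and/or fallback
            out[x] = static_cast<float>(in1[x] * in2[x] * scale);
    }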
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index b6fd645..5139d54 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -58,6 +58,29 @@ DIV_SIMD(float, float)

 #undef DIV_SIMD

+#define MUL_SIMD(SRC, DST)                                          \
+int mul_simd(const SRC in1[], const SRC in2[], DST out[],           \
+             const int length, double _scale);
+
+MUL_SIMD(uchar, uchar)
+MUL_SIMD(ushort, uchar)
+MUL_SIMD(short, uchar)
+MUL_SIMD(float, uchar)
+MUL_SIMD(short, short)
+MUL_SIMD(ushort, short)
+MUL_SIMD(uchar, short)
+MUL_SIMD(float, short)
+MUL_SIMD(ushort, ushort)
+MUL_SIMD(uchar, ushort)
+MUL_SIMD(short, ushort)
+MUL_SIMD(float, ushort)
+MUL_SIMD(uchar, float)
+MUL_SIMD(ushort, float)
+MUL_SIMD(short, float)
+MUL_SIMD(float, float)
+
+#undef MUL_SIMD
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 struct scale_tag {};
@@ -93,6 +116,16 @@ CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in)
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
 }

+CV_ALWAYS_INLINE v_float32 mul_op(scale_tag, const v_float32& a, const v_float32& b, const v_float32& scale)
+{
+    return (scale*a * b);
+}
+
+CV_ALWAYS_INLINE v_float32 mul_op(not_scale_tag, const v_float32& a, const v_float32& b, const v_float32&)
+{
+    return a * b;
+}
+
 CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale)
 {
     return (a*scale/div);
@@ -103,12 +136,12 @@ CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_flo
 {
     return a / div;
 }

-CV_ALWAYS_INLINE void v_store_div(short* dst, v_int32& res1, v_int32& res2)
+CV_ALWAYS_INLINE void v_store_i16(short* dst, v_int32& res1, v_int32& res2)
 {
     vx_store(dst, v_pack(res1, res2));
 }

-CV_ALWAYS_INLINE void v_store_div(ushort* dst, v_int32& res1, v_int32& res2)
+CV_ALWAYS_INLINE void v_store_i16(ushort* dst, v_int32& res1, v_int32& res2)
 {
     vx_store(dst, v_pack_u(res1, res2));
 }
@@ -360,7 +393,7 @@ div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const in
         v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));
         v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));

-        v_store_div(&out[x], res1, res2);
+        v_store_i16(&out[x], res1, res2);
     }

     if (x < length)
@@ -467,6 +500,327 @@ DIV_SIMD(float, float)

 #undef DIV_SIMD

+//-------------------------
+//
+// Fluid kernels: Multiply
+//
+//-------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, short>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value) ||
+                        (std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value), int>::type
+mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_int16 a = v_reinterpret_as_s16(vx_load(&in1[x]));
+            v_int16 b = v_reinterpret_as_s16(vx_load(&in2[x]));
+
+            v_float32 a1 = v_cvt_f32(v_expand_low(a));
+            v_float32 a2 = v_cvt_f32(v_expand_high(a));
+
+            v_float32 b1 = v_cvt_f32(v_expand_low(b));
+            v_float32 b2 = v_cvt_f32(v_expand_high(b));
+
+            v_int32 r1 = v_round(mul_op(t, a1, b1, scale));
+            v_int32 r2 = v_round(mul_op(t, a2, b2, scale));
+
+            v_store_i16(&out[x], r1, r2);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
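Every mul_hal variant below shares the loop shape introduced above, and its tail handling deserves a note: instead of a scalar epilogue, the last vector is simply re-run over the final nlanes elements, overlapping output that was already written. A self-contained illustration of the idiom (plain C++ with illustrative names, not OpenCV code):

    constexpr int kLanes = 8;                    // stand-in for v_int16::nlanes

    void process_block(const int* src, int* dst) // stand-in for one SIMD iteration
    {
        for (int i = 0; i < kLanes; ++i) dst[i] = src[i] * 2;
    }

    int process(const int* src, int* dst, int length)
    {
        if (length < kLanes) return 0;           // too short for even one vector
        int x = 0;
        for (;;)
        {
            for (; x <= length - kLanes; x += kLanes)
                process_block(&src[x], &dst[x]);
            if (x < length)                      // leftover tail: back up so the
            {                                    // last (overlapping) block ends
                x = length - kLanes;             // exactly at `length`, then run
                continue;                        // one more pass
            }
            break;
        }
        return x;                                // == length here
    }

Because the overlapped elements are recomputed to the same values, the double store is harmless, and it avoids a per-element scalar tail.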
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<SRC, short>::value ||
+                        std::is_same<SRC, ushort>::value, int>::type
+mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_int16 a1 = v_reinterpret_as_s16(vx_load(&in1[x]));
+            v_int16 a2 = v_reinterpret_as_s16(vx_load(&in1[x + nlanes / 2]));
+
+            v_float32 fa1 = v_cvt_f32(v_expand_low(a1));
+            v_float32 fa2 = v_cvt_f32(v_expand_high(a1));
+            v_float32 fa3 = v_cvt_f32(v_expand_low(a2));
+            v_float32 fa4 = v_cvt_f32(v_expand_high(a2));
+
+            v_int16 b1 = v_reinterpret_as_s16(vx_load(&in2[x]));
+            v_int16 b2 = v_reinterpret_as_s16(vx_load(&in2[x + nlanes / 2]));
+
+            v_float32 fb1 = v_cvt_f32(v_expand_low(b1));
+            v_float32 fb2 = v_cvt_f32(v_expand_high(b1));
+            v_float32 fb3 = v_cvt_f32(v_expand_low(b2));
+            v_float32 fb4 = v_cvt_f32(v_expand_high(b2));
+
+            v_int32 sum1 = v_round(mul_op(t, fa1, fb1, scale)),
+                    sum2 = v_round(mul_op(t, fa2, fb2, scale)),
+                    sum3 = v_round(mul_op(t, fa3, fb3, scale)),
+                    sum4 = v_round(mul_op(t, fa4, fb4, scale));
+
+            v_int16 res1 = v_pack(sum1, sum2);
+            v_int16 res2 = v_pack(sum3, sum4);
+
+            vx_store(&out[x], v_pack_u(res1, res2));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
+                             const int length, double _scale)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in1[x]);
+            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
+            v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
+            v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
+
+            v_float32 b1 = vg_load_f32(&in2[x]);
+            v_float32 b2 = vg_load_f32(&in2[x + nlanes / 4]);
+            v_float32 b3 = vg_load_f32(&in2[x + nlanes / 2]);
+            v_float32 b4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);
+
+            v_int32 res1 = v_round(mul_op(t, a1, b1, scale));
+            v_int32 res2 = v_round(mul_op(t, a2, b2, scale));
+            v_int32 res3 = v_round(mul_op(t, a3, b3, scale));
+            v_int32 res4 = v_round(mul_op(t, a4, b4, scale));
+
+            vx_store(&out[x], v_pack_u(v_pack(res1, res2), v_pack(res3, res4)));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, int>::type
+mul_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_int16 a = v_reinterpret_as_s16(vx_load_expand(&in1[x]));
+            v_int16 b = v_reinterpret_as_s16(vx_load_expand(&in2[x]));
+
+            v_float32 a1 = v_cvt_f32(v_expand_low(a));
+            v_float32 a2 = v_cvt_f32(v_expand_high(a));
+
+            v_float32 b1 = v_cvt_f32(v_expand_low(b));
+            v_float32 b2 = v_cvt_f32(v_expand_high(b));
+
+            v_int32 r1 = v_round(mul_op(t, a1, b1, scale));
+            v_int32 r2 = v_round(mul_op(t, a2, b2, scale));
+
+            v_store_i16(&out[x], r1, r2);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
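The v_pack / v_pack_u stores in the kernels above narrow the rounded int32 products with saturation rather than modular wrap-around, matching cv::saturate_cast semantics. A scalar analogue, for intuition (illustration only, not part of the patch):

    #include <opencv2/core/saturate.hpp>

    void saturation_examples()
    {
        short  s = cv::saturate_cast<short>(70000);   // -> 32767, like v_pack
        ushort u = cv::saturate_cast<ushort>(-5);     // -> 0,     like v_pack_u
        uchar  c = cv::saturate_cast<uchar>(300);     // -> 255
        (void)s; (void)u; (void)c;
    }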
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, int>::type
+mul_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in1[x]);
+            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
+
+            v_float32 b1 = vg_load_f32(&in2[x]);
+            v_float32 b2 = vg_load_f32(&in2[x + nlanes / 2]);
+
+            v_int32 res1 = v_round(mul_op(t, a1, b1, scale));
+            v_int32 res2 = v_round(mul_op(t, a2, b2, scale));
+
+            v_store_i16(&out[x], res1, res2);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
+                             const int length, double _scale)
+{
+    constexpr int nlanes = v_float32::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in1[x]);
+            v_float32 b1 = vg_load_f32(&in2[x]);
+
+            vx_store(&out[x], mul_op(t, a1, b1, scale));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE int mul_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[],
+                             const int length, double scale)
+{
+    hal::mul8u(in1, static_cast<size_t>(length), in2, static_cast<size_t>(length),
+               out, static_cast<size_t>(length), length, 1, &scale);
+    return length;
+}
+
+#define MUL_SIMD(SRC, DST)                                          \
+int mul_simd(const SRC in1[], const SRC in2[], DST out[],           \
+             const int length, double _scale)                       \
+{                                                                   \
+    int x = 0;                                                      \
+    float fscale = static_cast<float>(_scale);                      \
+    if (std::fabs(fscale - 1.0f) <= FLT_EPSILON)                    \
+    {                                                               \
+        not_scale_tag t;                                            \
+        x = mul_hal(t, in1, in2, out, length, _scale);              \
+    }                                                               \
+    else                                                            \
+    {                                                               \
+        scale_tag t;                                                \
+        x = mul_hal(t, in1, in2, out, length, _scale);              \
+    }                                                               \
+    return x;                                                       \
+}
+
+MUL_SIMD(uchar, uchar)
+MUL_SIMD(ushort, uchar)
+MUL_SIMD(short, uchar)
+MUL_SIMD(float, uchar)
+MUL_SIMD(short, short)
+MUL_SIMD(ushort, short)
+MUL_SIMD(uchar, short)
+MUL_SIMD(float, short)
+MUL_SIMD(ushort, ushort)
+MUL_SIMD(uchar, ushort)
+MUL_SIMD(short, ushort)
+MUL_SIMD(float, ushort)
+MUL_SIMD(uchar, float)
+MUL_SIMD(ushort, float)
+MUL_SIMD(short, float)
+MUL_SIMD(float, float)
+
+#undef MUL_SIMD
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 CV_CPU_OPTIMIZATION_NAMESPACE_END
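Taken together, each mul_simd specialization above computes round(in1[x] * in2[x] * scale) saturated to the destination type. A scalar reference, as a reading aid (written for this note, not code from the patch):

    #include <opencv2/core/saturate.hpp>

    template<typename SRC, typename DST>
    void mul_scalar_ref(const SRC in1[], const SRC in2[], DST out[],
                        int length, double scale)
    {
        for (int x = 0; x < length; ++x)
            out[x] = cv::saturate_cast<DST>(in1[x] * in2[x] * scale);
    }

The uchar-to-uchar specialization is the one variant that takes a different route: it delegates to hal::mul8u, which performs the same scaled multiply through OpenCV's HAL. The scale_tag / not_scale_tag dispatch exists so that the common scale == 1.0 case skips the multiply by scale entirely inside the inner loop.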