GAPI Fluid: Enable dynamic dispatching for the Sub kernel.
author Anna Khakimova <anna.khakimova@intel.com>
Thu, 17 Mar 2022 18:33:50 +0000 (21:33 +0300)
committer Anna Khakimova <anna.khakimova@intel.com>
Fri, 18 Mar 2022 09:34:19 +0000 (12:34 +0300)
modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
modules/gapi/src/backends/fluid/gfluidcore.cpp
modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
modules/gapi/src/backends/fluid/gfluidcore_func.hpp
modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp

index c644fd1..83ef130 100644 (file)
@@ -436,7 +436,7 @@ PERF_TEST_P_(DivPerfTest, TestPerformance)
     // FIXIT Unstable input data for divide
     initMatsRandU(type, sz, dtype, false);
 
-    //This condition need to workaround issue in the OpenCV.
+    // This condition is needed to work around OpenCV issue #21044.
     //It reinitializes divider matrix without zero values for CV_16S DST type.
     if (dtype == CV_16S && dtype != type)
         cv::randu(in_mat2, cv::Scalar::all(1), cv::Scalar::all(255));
@@ -482,7 +482,7 @@ PERF_TEST_P_(DivCPerfTest, TestPerformance)
     // FIXIT Unstable input data for divide
     initMatsRandU(type, sz, dtype, false);
 
-    //This condition need as workaround the issue in the OpenCV.
+    // This condition is needed to work around OpenCV issue #21044.
     //It reinitializes divider scalar without zero values for CV_16S DST type.
     if (dtype == CV_16S || (type == CV_16S && dtype == -1))
         cv::randu(sc, cv::Scalar::all(1), cv::Scalar::all(SHRT_MAX));
@@ -528,7 +528,7 @@ PERF_TEST_P_(DivRCPerfTest, TestPerformance)
 
     // FIXIT Unstable input data for divide
     initMatsRandU(type, sz, dtype, false);
-    //This condition need as workaround the bug in the OpenCV.
+    // This condition is needed to work around OpenCV issue #21044.
     //It reinitializes divider matrix without zero values for CV_16S DST type.
     if (dtype == CV_16S || (type == CV_16S && dtype == -1))
         cv::randu(in_mat1, cv::Scalar::all(1), cv::Scalar::all(255));
index d91ce65..e4b8c0b 100644 (file)
@@ -40,10 +40,10 @@ INSTANTIATE_TEST_CASE_P(AddCPerfTestFluid, AddCPerfTest,
             Values(cv::compile_args(CORE_FLUID))));
 
 INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
-    Combine(Values(AbsExact().to_compare_f()),
+    Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 0).to_compare_f()),
             Values(szSmall128, szVGA, sz720p, sz1080p),
-            Values(CV_8UC1, CV_8UC3, CV_16SC1, CV_32FC1),
-            Values(-1, CV_8U, CV_32F),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
             Values(cv::compile_args(CORE_FLUID))));
 
 INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest,
index 866381f..c5cfc19 100644 (file)
@@ -378,141 +378,11 @@ CV_ALWAYS_INLINE int absdiff_simd(const T in1[], const T in2[], T out[], int len
 
     return 0;
 }
-
-template<typename T, typename VT>
-CV_ALWAYS_INLINE int sub_simd_sametype(const T in1[], const T in2[], T out[], int length)
-{
-    constexpr int nlanes = static_cast<int>(VT::nlanes);
-
-    if (length < nlanes)
-        return 0;
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            VT a = vx_load(&in1[x]);
-            VT b = vx_load(&in2[x]);
-            vx_store(&out[x], a - b);
-        }
-
-        if (x < length && (in1 != out) && (in2 != out))
-        {
-            x = length - nlanes;
-            continue;  // process one more time (unaligned tail)
-        }
-        break;
-    }
-
-    return x;
-}
-
-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int sub_simd(const SRC in1[], const SRC in2[], DST out[], int length)
-{
-    if (std::is_same<DST, float>::value && !std::is_same<SRC, float>::value)
-        return 0;
-
-    if (std::is_same<DST, SRC>::value)
-    {
-        if (std::is_same<DST, uchar>::value)
-        {
-            return sub_simd_sametype<uchar, v_uint8>(reinterpret_cast<const uchar*>(in1),
-                                                     reinterpret_cast<const uchar*>(in2),
-                                                     reinterpret_cast<uchar*>(out), length);
-        }
-        else if (std::is_same<DST, short>::value)
-        {
-            return sub_simd_sametype<short, v_int16>(reinterpret_cast<const short*>(in1),
-                                                     reinterpret_cast<const short*>(in2),
-                                                     reinterpret_cast<short*>(out), length);
-        }
-        else if (std::is_same<DST, float>::value)
-        {
-            return sub_simd_sametype<float, v_float32>(reinterpret_cast<const float*>(in1),
-                                                       reinterpret_cast<const float*>(in2),
-                                                       reinterpret_cast<float*>(out), length);
-        }
-    }
-    else if (std::is_same<SRC, short>::value && std::is_same<DST, uchar>::value)
-    {
-        constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
-
-        if (length < nlanes)
-            return 0;
-
-        int x = 0;
-        for (;;)
-        {
-            for (; x <= length - nlanes; x += nlanes)
-            {
-                v_int16 a1 = vx_load(reinterpret_cast<const short*>(&in1[x]));
-                v_int16 a2 = vx_load(reinterpret_cast<const short*>(&in1[x + nlanes / 2]));
-                v_int16 b1 = vx_load(reinterpret_cast<const short*>(&in2[x]));
-                v_int16 b2 = vx_load(reinterpret_cast<const short*>(&in2[x + nlanes / 2]));
-
-                vx_store(reinterpret_cast<uchar*>(&out[x]), v_pack_u(a1 - b1, a2 - b2));
-            }
-
-            if (x < length)
-            {
-                CV_DbgAssert((reinterpret_cast<const short*>(in1) != reinterpret_cast<const short*>(out)) &&
-                             (reinterpret_cast<const short*>(in2) != reinterpret_cast<const short*>(out)));
-                x = length - nlanes;
-                continue;  // process one more time (unaligned tail)
-            }
-            break;
-        }
-
-        return x;
-    }
-    else if (std::is_same<SRC, float>::value && std::is_same<DST, uchar>::value)
-    {
-        constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
-
-        if (length < nlanes)
-            return 0;
-
-        int x = 0;
-        for (;;)
-        {
-            for (; x <= length - nlanes; x += nlanes)
-            {
-                v_float32 a1 = vx_load(reinterpret_cast<const float*>(&in1[x]));
-                v_float32 a2 = vx_load(reinterpret_cast<const float*>(&in1[x + nlanes / 4]));
-                v_float32 a3 = vx_load(reinterpret_cast<const float*>(&in1[x + 2 * nlanes / 4]));
-                v_float32 a4 = vx_load(reinterpret_cast<const float*>(&in1[x + 3 * nlanes / 4]));
-
-                v_float32 b1 = vx_load(reinterpret_cast<const float*>(&in2[x]));
-                v_float32 b2 = vx_load(reinterpret_cast<const float*>(&in2[x + nlanes / 4]));
-                v_float32 b3 = vx_load(reinterpret_cast<const float*>(&in2[x + 2 * nlanes / 4]));
-                v_float32 b4 = vx_load(reinterpret_cast<const float*>(&in2[x + 3 * nlanes / 4]));
-
-                vx_store(reinterpret_cast<uchar*>(&out[x]), v_pack_u(v_pack(v_round(a1 - b1), v_round(a2 - b2)),
-                                                                     v_pack(v_round(a3 - b3), v_round(a4 - b4))));
-            }
-
-            if (x < length)
-            {
-                CV_DbgAssert((reinterpret_cast<const float*>(in1) != reinterpret_cast<const float*>(out)) &&
-                             (reinterpret_cast<const float*>(in2) != reinterpret_cast<const float*>(out)));
-                x = length - nlanes;
-                continue;  // process one more time (unaligned tail)
-            }
-            break;
-        }
-
-        return x;
-    }
-
-    return 0;
-}
 #endif // CV_SIMD
 
 template<typename DST, typename SRC1, typename SRC2>
 CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2,
-                                        Arithm arithm, double scale=1)
+                                 Arithm arithm, double scale=1)
 {
     static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
 
@@ -607,10 +477,19 @@ GAPI_FLUID_KERNEL(GFluidSub, cv::gapi::core::GSub, false)
     {
         //      DST     SRC1    SRC2    OP          __VA_ARGS__
         BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
-        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
-        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
-        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
-        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(uchar,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(uchar,  short,  short,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(uchar,  float,  float,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(short,  short,  short,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(short,  uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(short,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(short,  float,  float,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(ushort, uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(ushort, short,  short,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(ushort, float,  float,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(float,  uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(float,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
         BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
         BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
 
index d80a6b2..c235991 100644 (file)
@@ -317,6 +317,33 @@ ADD_SIMD(float, float)
 
 #undef ADD_SIMD
 
+#define SUB_SIMD(SRC, DST)                                                    \
+int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length)   \
+{                                                                             \
+                                                                              \
+        CV_CPU_DISPATCH(sub_simd, (in1, in2, out, length),                    \
+                        CV_CPU_DISPATCH_MODES_ALL);                           \
+}
+
+SUB_SIMD(uchar, uchar)
+SUB_SIMD(ushort, uchar)
+SUB_SIMD(short, uchar)
+SUB_SIMD(float, uchar)
+SUB_SIMD(short, short)
+SUB_SIMD(ushort, short)
+SUB_SIMD(uchar, short)
+SUB_SIMD(float, short)
+SUB_SIMD(ushort, ushort)
+SUB_SIMD(uchar, ushort)
+SUB_SIMD(short, ushort)
+SUB_SIMD(float, ushort)
+SUB_SIMD(uchar, float)
+SUB_SIMD(ushort, float)
+SUB_SIMD(short, float)
+SUB_SIMD(float, float)
+
+#undef SUB_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
index 052adbe..3a5d70a 100644 (file)
@@ -244,6 +244,28 @@ ADD_SIMD(float, float)
 
 #undef ADD_SIMD
 
+#define SUB_SIMD(SRC, DST)                                                     \
+int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length);
+
+SUB_SIMD(uchar, uchar)
+SUB_SIMD(ushort, uchar)
+SUB_SIMD(short, uchar)
+SUB_SIMD(float, uchar)
+SUB_SIMD(short, short)
+SUB_SIMD(ushort, short)
+SUB_SIMD(uchar, short)
+SUB_SIMD(float, short)
+SUB_SIMD(ushort, ushort)
+SUB_SIMD(uchar, ushort)
+SUB_SIMD(short, ushort)
+SUB_SIMD(float, ushort)
+SUB_SIMD(uchar, float)
+SUB_SIMD(ushort, float)
+SUB_SIMD(short, float)
+SUB_SIMD(float, float)
+
+#undef SUB_SIMD
+
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
index 4c324da..c148f81 100644 (file)
@@ -253,6 +253,28 @@ ADD_SIMD(float, float)
 
 #undef ADD_SIMD
 
+#define SUB_SIMD(SRC, DST)                                                      \
+int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length);
+
+SUB_SIMD(uchar, uchar)
+SUB_SIMD(ushort, uchar)
+SUB_SIMD(short, uchar)
+SUB_SIMD(float, uchar)
+SUB_SIMD(short, short)
+SUB_SIMD(ushort, short)
+SUB_SIMD(uchar, short)
+SUB_SIMD(float, short)
+SUB_SIMD(ushort, ushort)
+SUB_SIMD(uchar, ushort)
+SUB_SIMD(short, ushort)
+SUB_SIMD(float, ushort)
+SUB_SIMD(uchar, float)
+SUB_SIMD(ushort, float)
+SUB_SIMD(short, float)
+SUB_SIMD(float, float)
+
+#undef SUB_SIMD
+
 int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], const int width);
 
@@ -2530,32 +2552,43 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
 // Fluid kernels: Add
 //
 //-------------------------
+template<typename VT>
+CV_ALWAYS_INLINE VT oper(add_tag, const VT& a, const VT& b)
+{
+    return a + b;
+}
 
-CV_ALWAYS_INLINE void add_uchar_store(uchar* outx, const v_uint16& c1, const v_uint16& c2)
+template<typename VT>
+CV_ALWAYS_INLINE VT oper(sub_tag, const VT& a, const VT& b)
+{
+    return a - b;
+}
+
+CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_uint16& c1, const v_uint16& c2)
 {
     vx_store(outx, v_pack(c1, c2));
 }
 
-CV_ALWAYS_INLINE void add_uchar_store(uchar* outx, const v_int16& c1, const v_int16& c2)
+CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_int16& c1, const v_int16& c2)
 {
     vx_store(outx, v_pack_u(c1, c2));
 }
 
-template<typename SRC, typename DST>
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<SRC, DST>::value, void>::type
-add_simd_impl(const SRC* in1x, const SRC* in2x, DST* outx)
+arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, DST* outx)
 {
     vector_type_of_t<SRC> a = vx_load(in1x);
     vector_type_of_t<SRC> b = vx_load(in2x);
-    vx_store(outx, a + b);
+    vx_store(outx, oper(op, a, b));
 }
 
-template<typename SRC>
+template<typename oper_tag, typename SRC>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<SRC, short>::value ||
                         std::is_same<SRC, ushort>::value, void>::type
-add_simd_impl(const SRC* in1x, const SRC* in2x, uchar* outx)
+arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, uchar* outx)
 {
     constexpr int nlanes = v_uint8::nlanes;
 
@@ -2564,10 +2597,12 @@ add_simd_impl(const SRC* in1x, const SRC* in2x, uchar* outx)
     vector_type_of_t<SRC> b1 = vx_load(in2x);
     vector_type_of_t<SRC> b2 = vx_load(&in2x[nlanes / 2]);
 
-    add_uchar_store(outx, a1 + b1, a2 + b2);
+    pack_store_uchar(outx, oper(op, a1, b1), oper(op, a2, b2));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const float* in1x, const float* in2x, uchar* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const float* in1x,
+                                         const float* in2x, uchar* outx)
 {
     constexpr int nlanes = v_uint8::nlanes;
 
@@ -2581,31 +2616,35 @@ CV_ALWAYS_INLINE void add_simd_impl(const float* in1x, const float* in2x, uchar*
     v_float32 b3 = vx_load(&in2x[2 * nlanes / 4]);
     v_float32 b4 = vx_load(&in2x[3 * nlanes / 4]);
 
-    vx_store(outx, v_pack_u(v_pack(v_round(a1 + b1), v_round(a2 + b2)),
-                            v_pack(v_round(a3 + b3), v_round(a4 + b4))));
+    vx_store(outx, v_pack_u(v_pack(v_round(oper(op, a1, b1)), v_round(oper(op, a2, b2))),
+                            v_pack(v_round(oper(op, a3, b3)), v_round(oper(op, a4, b4)))));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const uchar* in1x, const uchar* in2x, short* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const uchar* in1x,
+                                         const uchar* in2x, short* outx)
 {
     v_int16 a = v_reinterpret_as_s16(vx_load_expand(in1x));
     v_int16 b = v_reinterpret_as_s16(vx_load_expand(in2x));
 
-    vx_store(outx, a + b);
+    vx_store(outx, oper(op, a, b));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const uchar* in1x, const uchar* in2x, ushort* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const uchar* in1x,
+                                         const uchar* in2x, ushort* outx)
 {
     v_uint16 a = vx_load_expand(in1x);
     v_uint16 b = vx_load_expand(in2x);
 
-    vx_store(outx, a + b);
+    vx_store(outx, oper(op, a, b));
 }
 
-template<typename DST>
+template<typename oper_tag, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
                         std::is_same<DST, ushort>::value, void>::type
-add_simd_impl(const float* in1x, const float* in2x, DST* outx)
+arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, DST* outx)
 {
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;
     v_float32 a1 = vx_load(in1x);
@@ -2613,10 +2652,12 @@ add_simd_impl(const float* in1x, const float* in2x, DST* outx)
     v_float32 b1 = vx_load(in2x);
     v_float32 b2 = vx_load(&in2x[nlanes/2]);
 
-    v_store_i16(outx, v_round(a1 + b1), v_round(a2 + b2));
+    v_store_i16(outx, v_round(oper(op, a1, b1)), v_round(oper(op, a2, b2)));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const short* in1x, const short* in2x, ushort* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const short* in1x,
+                                         const short* in2x, ushort* outx)
 {
     v_int16 a = vx_load(in1x);
     v_int32 a1 = v_expand_low(a);
@@ -2626,57 +2667,66 @@ CV_ALWAYS_INLINE void add_simd_impl(const short* in1x, const short* in2x, ushort
     v_int32 b1 = v_expand_low(b);
     v_int32 b2 = v_expand_high(b);
 
-    vx_store(outx, v_pack_u(a1 + b1, a2 + b2));
+    vx_store(outx, v_pack_u(oper(op, a1, b1), oper(op, a2, b2)));
 }
 
-CV_ALWAYS_INLINE void add_simd_impl(const ushort* in1x, const ushort* in2x, short* outx)
+template<typename oper_tag>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const ushort* in1x,
+                                         const ushort* in2x, short* outx)
 {
-    v_uint16 a = vx_load(in1x);
-    v_uint32 a1 = v_expand_low(a);
-    v_uint32 a2 = v_expand_high(a);
+    v_int16 a = v_reinterpret_as_s16(vx_load(in1x));
+    v_int32 a1 = v_expand_low(a);
+    v_int32 a2 = v_expand_high(a);
 
-    v_uint16 b = vx_load(in2x);
-    v_uint32 b1 = v_expand_low(b);
-    v_uint32 b2 = v_expand_high(b);
+    v_int16 b = v_reinterpret_as_s16(vx_load(in2x));
+    v_int32 b1 = v_expand_low(b);
+    v_int32 b2 = v_expand_high(b);
 
-    vx_store(outx, v_reinterpret_as_s16(v_pack(a1 + b1, a2 + b2)));
+    vx_store(outx, v_pack(oper(op, a1, b1), oper(op, a2, b2)));
 }
 
-template<typename SRC>
-CV_ALWAYS_INLINE void add_simd_impl(const SRC* in1x, const SRC* in2x, float* outx)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, float* outx)
 {
     v_float32 a = vg_load_f32(in1x);
     v_float32 b = vg_load_f32(in2x);
 
-    vx_store(outx, a + b);
+    vx_store(outx, oper(op, a, b));
+}
+
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOp_simd(oper_tag op, const SRC in1[], const SRC in2[],
+                                   DST out[], const int length)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            arithmOp_simd_impl(op, &in1[x], &in2[x], &out[x]);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;
+        }
+        break;
+    }
+
+    return x;
 }
 
 #define ADD_SIMD(SRC, DST)                                                      \
 int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length)     \
 {                                                                               \
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;                       \
-                                                                                \
-    if (length < nlanes)                                                        \
-        return 0;                                                               \
-                                                                                \
-    int x = 0;                                                                  \
-    for (;;)                                                                    \
-    {                                                                           \
-        for (; x <= length - nlanes; x += nlanes)                               \
-        {                                                                       \
-            add_simd_impl(&in1[x], &in2[x], &out[x]);                           \
-        }                                                                       \
-                                                                                \
-        if (x < length)                                                         \
-        {                                                                       \
-            x = length - nlanes;                                                \
-            continue;                                                           \
-        }                                                                       \
-        break;                                                                  \
-    }                                                                           \
-                                                                                \
-    return x;                                                                   \
-}
+    return arithmOp_simd(add_tag{}, in1, in2, out, length);                     \
+}                                                                               \
 
 ADD_SIMD(uchar, uchar)
 ADD_SIMD(ushort, uchar)
@@ -2697,6 +2747,37 @@ ADD_SIMD(float, float)
 
 #undef ADD_SIMD
 
+//-------------------------
+//
+// Fluid kernels: Sub
+//
+//-------------------------
+
+#define SUB_SIMD(SRC, DST)                                                      \
+int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length)     \
+{                                                                               \
+    return arithmOp_simd(sub_tag{}, in1, in2, out, length);                     \
+}                                                                               \
+
+SUB_SIMD(uchar, uchar)
+SUB_SIMD(ushort, uchar)
+SUB_SIMD(short, uchar)
+SUB_SIMD(float, uchar)
+SUB_SIMD(short, short)
+SUB_SIMD(ushort, short)
+SUB_SIMD(uchar, short)
+SUB_SIMD(float, short)
+SUB_SIMD(ushort, ushort)
+SUB_SIMD(uchar, ushort)
+SUB_SIMD(short, ushort)
+SUB_SIMD(float, ushort)
+SUB_SIMD(uchar, float)
+SUB_SIMD(ushort, float)
+SUB_SIMD(short, float)
+SUB_SIMD(float, float)
+
+#undef SUB_SIMD
+
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END