Merge pull request #13319 from elatkin:el/gapi_perf_erdilate_2
author Evgeny Latkin <evgeny.latkin@intel.com>
Wed, 28 Nov 2018 16:50:39 +0000 (19:50 +0300)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
Wed, 28 Nov 2018 16:50:39 +0000 (19:50 +0300)
GAPI (fluid): Erode/Dilate optimization, part 2 (#13319)

* GAPI (fluid): Erode/Dilate optimization: hard-code 3x3 case

* GAPI (fluid): Erode/Dilate optimization: CPU dispatcher

* GAPI (fluid): Erode/Dilate optimization: 10-15x speed-up with CV_SIMD

* GAPI (fluid): Erode/Dilate optimization: 20-30% speed-up

modules/gapi/src/backends/fluid/gfluidimgproc.cpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp

index 27e3562..df44528 100644 (file)
@@ -1164,12 +1164,34 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true)
 //
 //-----------------------------
 
+static MorphShape detect_morph3x3_shape(const uchar kernel[])
+{
+    const uchar k[3][3] = {
+        { kernel[0], kernel[1], kernel[2]},
+        { kernel[3], kernel[4], kernel[5]},
+        { kernel[6], kernel[7], kernel[8]}
+    };
+
+    if (k[0][0] && k[0][1] && k[0][2] &&
+        k[1][0] && k[1][1] && k[1][2] &&
+        k[2][0] && k[2][1] && k[2][2])
+        return M_FULL;
+
+    if (!k[0][0] && k[0][1] && !k[0][2] &&
+         k[1][0] && k[1][1] &&  k[1][2] &&
+        !k[2][0] && k[2][1] && !k[2][2])
+        return M_CROSS;
+
+    return M_UNDEF;
+}
+
 template<typename DST, typename SRC>
 static void run_morphology(          Buffer&    dst,
                            const     View  &    src,
                            const     uchar      k[],
                                      int        k_rows,
                                      int        k_cols,
+                                     MorphShape k_type,
                            const cv::Point & /* anchor */,
                                      Morphology morphology)
 {
@@ -1199,7 +1221,7 @@ static void run_morphology(          Buffer&    dst,
     // call optimized code, if 3x3
     if (3 == k_rows && 3 == k_cols)
     {
-        run_morphology3x3_impl(out, in, width, chan, k, morphology);
+        run_morphology3x3_impl(out, in, width, chan, k, k_type, morphology);
         return;
     }
 
@@ -1261,14 +1283,16 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
 
         int k_rows = kernel.rows;
         int k_cols = kernel.cols;
+        int k_size = k_rows * k_cols;
 
         auto *k = scratch.OutLine<uchar>(); // copy of kernel.data
+        auto k_type = static_cast<MorphShape>(k[k_size]);
 
         //     DST     SRC     OP              __VA_ARGS__
-        UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
-        UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
-        UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
-        UNARY_( float,  float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
+        UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_ERODE);
+        UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_ERODE);
+        UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_ERODE);
+        UNARY_( float,  float, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_ERODE);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
@@ -1283,8 +1307,9 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
     {
         int k_rows = kernel.rows;
         int k_cols = kernel.cols;
+        int k_size = k_rows * k_cols;
 
-        cv::gapi::own::Size bufsize(k_rows * k_cols, 1);
+        cv::gapi::own::Size bufsize(k_size + 1, 1);
         GMatDesc bufdesc = {CV_8U, 1, bufsize};
         Buffer buffer(bufdesc);
         scratch = std::move(buffer);
@@ -1292,6 +1317,11 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
         // FIXME: move to resetScratch stage ?
         auto *k = scratch.OutLine<uchar>();
         getKernel(k, kernel);
+
+        if (3 == k_rows && 3 == k_cols)
+            k[k_size] = static_cast<uchar>(detect_morph3x3_shape(k));
+        else
+            k[k_size] = static_cast<uchar>(M_UNDEF);
     }
 
     static void resetScratch(Buffer& /* scratch */)
@@ -1339,14 +1369,16 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true)
 
         int k_rows = kernel.rows;
         int k_cols = kernel.cols;
+        int k_size = k_rows * k_cols;
 
         auto *k = scratch.OutLine<uchar>(); // copy of kernel.data
+        auto k_type = static_cast<MorphShape>(k[k_size]);
 
         //     DST     SRC     OP              __VA_ARGS__
-        UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
-        UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
-        UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
-        UNARY_( float,  float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
+        UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_DILATE);
+        UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_DILATE);
+        UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_DILATE);
+        UNARY_( float,  float, run_morphology, dst, src, k, k_rows, k_cols, k_type, anchor, M_DILATE);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
@@ -1361,8 +1393,9 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true)
     {
         int k_rows = kernel.rows;
         int k_cols = kernel.cols;
+        int k_size = k_rows * k_cols;
 
-        cv::gapi::own::Size bufsize(k_rows * k_cols, 1);
+        cv::gapi::own::Size bufsize(k_size + 1, 1);
         GMatDesc bufdesc = {CV_8U, 1, bufsize};
         Buffer buffer(bufdesc);
         scratch = std::move(buffer);
@@ -1370,6 +1403,11 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true)
         // FIXME: move to resetScratch stage ?
         auto *k = scratch.OutLine<uchar>();
         getKernel(k, kernel);
+
+        if (3 == k_rows && 3 == k_cols)
+            k[k_size] = static_cast<uchar>(detect_morph3x3_shape(k));
+        else
+            k[k_size] = static_cast<uchar>(M_UNDEF);
     }
 
     static void resetScratch(Buffer& /* scratch */)
index e9eebfa..ccebc3f 100644 (file)
@@ -119,10 +119,11 @@ RUN_FILTER2D_3X3_IMPL( float,  float)
 
 #define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
 void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
-                            const uchar k[], Morphology morphology)      \
+                            const uchar k[], MorphShape k_type,          \
+                            Morphology morphology)                       \
 {                                                                        \
     CV_CPU_DISPATCH(run_morphology3x3_impl,                              \
-        (out, in, width, chan, k, morphology),                           \
+        (out, in, width, chan, k, k_type, morphology),                   \
         CV_CPU_DISPATCH_MODES_ALL);                                      \
 }
 
index 6116c4b..0fd8b65 100644 (file)
@@ -85,9 +85,12 @@ RUN_FILTER2D_3X3_IMPL( float,  float)
 
 enum Morphology { M_ERODE, M_DILATE };
 
+enum MorphShape { M_FULL, M_CROSS, M_UNDEF };
+
 #define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
 void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
-                            const uchar k[], Morphology morphology);
+                            const uchar k[], MorphShape k_type,          \
+                            Morphology morphology);
 
 RUN_MORPHOLOGY3X3_IMPL(uchar )
 RUN_MORPHOLOGY3X3_IMPL(ushort)
index cdd5e82..79b474e 100644 (file)
@@ -107,7 +107,8 @@ RUN_FILTER2D_3X3_IMPL( float,  float)
 
 #define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
 void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
-                            const uchar k[], Morphology morphology);
+                            const uchar k[], MorphShape k_type,          \
+                            Morphology morphology);
 
 RUN_MORPHOLOGY3X3_IMPL(uchar )
 RUN_MORPHOLOGY3X3_IMPL(ushort)
@@ -1124,9 +1125,10 @@ RUN_FILTER2D_3X3_IMPL( float,  float)
 //
 //-----------------------------
 
-template<Morphology morphology, typename T>
+template<typename T>
 static void run_morphology3x3_reference(T out[], const T *in[], int width, int chan,
-                                        const uchar k[])
+                                        const uchar k[], MorphShape k_type,
+                                        Morphology morphology)
 {
     constexpr int k_size = 3;
     constexpr int border = (k_size - 1) / 2;
@@ -1136,13 +1138,58 @@ static void run_morphology3x3_reference(T out[], const T *in[], int width, int c
     const int length = width * chan;
     const int shift = border * chan;
 
-    for (int l=0; l < length; l++)
+    if (M_ERODE == morphology)
     {
-        T result = M_ERODE == morphology? std::numeric_limits<T>::max():
-                                          std::numeric_limits<T>::min();
+        if (M_FULL == k_type)
+        {
+            for (int l=0; l < length; l++)
+            {
+                T result = std::numeric_limits<T>::max();
+
+                result = (std::min)(result, in[0][l - shift]);
+                result = (std::min)(result, in[0][l        ]);
+                result = (std::min)(result, in[0][l + shift]);
+
+                result = (std::min)(result, in[1][l - shift]);
+                result = (std::min)(result, in[1][l        ]);
+                result = (std::min)(result, in[1][l + shift]);
+
+                result = (std::min)(result, in[2][l - shift]);
+                result = (std::min)(result, in[2][l        ]);
+                result = (std::min)(result, in[2][l + shift]);
+
+                out[l] = result;
+            }
+            return;
+        }
+
+        if (M_CROSS == k_type)
+        {
+            for (int l=0; l < length; l++)
+            {
+                T result = std::numeric_limits<T>::max();
+
+            //  result = (std::min)(result, in[0][l - shift]);
+                result = (std::min)(result, in[0][l        ]);
+            //  result = (std::min)(result, in[0][l + shift]);
+
+                result = (std::min)(result, in[1][l - shift]);
+                result = (std::min)(result, in[1][l        ]);
+                result = (std::min)(result, in[1][l + shift]);
 
-        if (M_ERODE == morphology)
+            //  result = (std::min)(result, in[2][l - shift]);
+                result = (std::min)(result, in[2][l        ]);
+            //  result = (std::min)(result, in[2][l + shift]);
+
+                out[l] = result;
+            }
+            return;
+        }
+
+        for (int l=0; l < length; l++)
         {
+            T result = std::numeric_limits<T>::max();
+
             result = kernel[0][0]? (std::min)(result, in[0][l - shift]): result;
             result = kernel[0][1]? (std::min)(result, in[0][l        ]): result;
             result = kernel[0][2]? (std::min)(result, in[0][l + shift]): result;
@@ -1154,9 +1201,64 @@ static void run_morphology3x3_reference(T out[], const T *in[], int width, int c
             result = kernel[2][0]? (std::min)(result, in[2][l - shift]): result;
             result = kernel[2][1]? (std::min)(result, in[2][l        ]): result;
             result = kernel[2][2]? (std::min)(result, in[2][l + shift]): result;
+
+            out[l] = result;
         }
-        else // if (M_DILATE == morphology)
+        return;
+    }
+
+    if (M_DILATE == morphology)
+    {
+        if (M_FULL == k_type)
         {
+            for (int l=0; l < length; l++)
+            {
+                T result = std::numeric_limits<T>::min();
+
+                result = (std::max)(result, in[0][l - shift]);
+                result = (std::max)(result, in[0][l        ]);
+                result = (std::max)(result, in[0][l + shift]);
+
+                result = (std::max)(result, in[1][l - shift]);
+                result = (std::max)(result, in[1][l        ]);
+                result = (std::max)(result, in[1][l + shift]);
+
+                result = (std::max)(result, in[2][l - shift]);
+                result = (std::max)(result, in[2][l        ]);
+                result = (std::max)(result, in[2][l + shift]);
+
+                out[l] = result;
+            }
+            return;
+        }
+
+        if (M_CROSS == k_type)
+        {
+            for (int l=0; l < length; l++)
+            {
+                T result = std::numeric_limits<T>::min();
+
+            //  result = (std::max)(result, in[0][l - shift]);
+                result = (std::max)(result, in[0][l        ]);
+            //  result = (std::max)(result, in[0][l + shift]);
+
+                result = (std::max)(result, in[1][l - shift]);
+                result = (std::max)(result, in[1][l        ]);
+                result = (std::max)(result, in[1][l + shift]);
+
+            //  result = (std::max)(result, in[2][l - shift]);
+                result = (std::max)(result, in[2][l        ]);
+            //  result = (std::max)(result, in[2][l + shift]);
+
+                out[l] = result;
+            }
+            return;
+        }
+
+        for (int l=0; l < length; l++)
+        {
+            T result = std::numeric_limits<T>::min();
+
             result = kernel[0][0]? (std::max)(result, in[0][l - shift]): result;
             result = kernel[0][1]? (std::max)(result, in[0][l        ]): result;
             result = kernel[0][2]? (std::max)(result, in[0][l + shift]): result;
@@ -1168,16 +1270,21 @@ static void run_morphology3x3_reference(T out[], const T *in[], int width, int c
             result = kernel[2][0]? (std::max)(result, in[2][l - shift]): result;
             result = kernel[2][1]? (std::max)(result, in[2][l        ]): result;
             result = kernel[2][2]? (std::max)(result, in[2][l + shift]): result;
-        }
 
-        out[l] = result;
+            out[l] = result;
+        }
+        return;
     }
+
+    CV_Error(cv::Error::StsBadArg, "unsupported morphology");
 }
 
 #if CV_SIMD
-template<Morphology morphology, typename T, typename VT, typename S>
+template<typename T, typename VT, typename S>
 static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
-                                   const uchar k[], S setall)
+                                   const uchar k[], MorphShape k_type,
+                                   Morphology morphology,
+                                   S setall)
 {
     constexpr int k_size = 3;
     constexpr int border = (k_size - 1) / 2;
@@ -1187,18 +1294,89 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
     const int length = width * chan;
     const int shift = border * chan;
 
-    for (int l=0; l < length;)
+    if (M_ERODE == morphology)
     {
-        constexpr int nlanes = VT::nlanes;
+        if (M_FULL == k_type)
+        {
+            for (int l=0; l < length;)
+            {
+                constexpr int nlanes = VT::nlanes;
+
+                // main part of output row
+                for (; l <= length - nlanes; l += nlanes)
+                {
+                    VT r = setall(std::numeric_limits<T>::max());
+
+                    r = v_min(r, vx_load(&in[0][l - shift]));
+                    r = v_min(r, vx_load(&in[0][l        ]));
+                    r = v_min(r, vx_load(&in[0][l + shift]));
+
+                    r = v_min(r, vx_load(&in[1][l - shift]));
+                    r = v_min(r, vx_load(&in[1][l        ]));
+                    r = v_min(r, vx_load(&in[1][l + shift]));
+
+                    r = v_min(r, vx_load(&in[2][l - shift]));
+                    r = v_min(r, vx_load(&in[2][l        ]));
+                    r = v_min(r, vx_load(&in[2][l + shift]));
+
+                    v_store(&out[l], r);
+                }
+
+                // tail (if any)
+                if (l < length)
+                {
+                    GAPI_DbgAssert(length >= nlanes);
+                    l = length - nlanes;
+                }
+            }
+            return;
+        }
 
-        // main part of output row
-        for (; l <= length - nlanes; l += nlanes)
+        if (M_CROSS == k_type)
+        {
+            for (int l=0; l < length;)
+            {
+                constexpr int nlanes = VT::nlanes;
+
+                // main part of output row
+                for (; l <= length - nlanes; l += nlanes)
+                {
+                    VT r = setall(std::numeric_limits<T>::max());
+
+                //  r = v_min(r, vx_load(&in[0][l - shift]));
+                    r = v_min(r, vx_load(&in[0][l        ]));
+                //  r = v_min(r, vx_load(&in[0][l + shift]));
+
+                    r = v_min(r, vx_load(&in[1][l - shift]));
+                    r = v_min(r, vx_load(&in[1][l        ]));
+                    r = v_min(r, vx_load(&in[1][l + shift]));
+
+                //  r = v_min(r, vx_load(&in[2][l - shift]));
+                    r = v_min(r, vx_load(&in[2][l        ]));
+                //  r = v_min(r, vx_load(&in[2][l + shift]));
+
+                    v_store(&out[l], r);
+                }
+
+                // tail (if any)
+                if (l < length)
+                {
+                    GAPI_DbgAssert(length >= nlanes);
+                    l = length - nlanes;
+                }
+            }
+            return;
+        }
+
+        for (int l=0; l < length;)
         {
-            VT r = M_ERODE == morphology? setall(std::numeric_limits<T>::max()):
-                                          setall(std::numeric_limits<T>::min());
+            constexpr int nlanes = VT::nlanes;
 
-            if (M_ERODE == morphology)
+            // main part of output row
+            for (; l <= length - nlanes; l += nlanes)
             {
+                VT r = setall(std::numeric_limits<T>::max());
+
                 if (kernel[0][0]) r = v_min(r, vx_load(&in[0][l - shift]));
                 if (kernel[0][1]) r = v_min(r, vx_load(&in[0][l        ]));
                 if (kernel[0][2]) r = v_min(r, vx_load(&in[0][l + shift]));
@@ -1210,9 +1388,103 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
                 if (kernel[2][0]) r = v_min(r, vx_load(&in[2][l - shift]));
                 if (kernel[2][1]) r = v_min(r, vx_load(&in[2][l        ]));
                 if (kernel[2][2]) r = v_min(r, vx_load(&in[2][l + shift]));
+
+                v_store(&out[l], r);
+            }
+
+            // tail (if any)
+            if (l < length)
+            {
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
             }
-            else // if (M_DILATE == morphology)
+        }
+        return;
+    }
+
+    if (M_DILATE == morphology)
+    {
+        if (M_FULL == k_type)
+        {
+            for (int l=0; l < length;)
             {
+                constexpr int nlanes = VT::nlanes;
+
+                // main part of output row
+                for (; l <= length - nlanes; l += nlanes)
+                {
+                    VT r = setall(std::numeric_limits<T>::min());
+
+                    r = v_max(r, vx_load(&in[0][l - shift]));
+                    r = v_max(r, vx_load(&in[0][l        ]));
+                    r = v_max(r, vx_load(&in[0][l + shift]));
+
+                    r = v_max(r, vx_load(&in[1][l - shift]));
+                    r = v_max(r, vx_load(&in[1][l        ]));
+                    r = v_max(r, vx_load(&in[1][l + shift]));
+
+                    r = v_max(r, vx_load(&in[2][l - shift]));
+                    r = v_max(r, vx_load(&in[2][l        ]));
+                    r = v_max(r, vx_load(&in[2][l + shift]));
+
+                    v_store(&out[l], r);
+                }
+
+                // tail (if any)
+                if (l < length)
+                {
+                    GAPI_DbgAssert(length >= nlanes);
+                    l = length - nlanes;
+                }
+            }
+            return;
+        }
+
+        if (M_CROSS == k_type)
+        {
+            for (int l=0; l < length;)
+            {
+                constexpr int nlanes = VT::nlanes;
+
+                // main part of output row
+                for (; l <= length - nlanes; l += nlanes)
+                {
+                    VT r = setall(std::numeric_limits<T>::min());
+
+                //  r = v_max(r, vx_load(&in[0][l - shift]));
+                    r = v_max(r, vx_load(&in[0][l        ]));
+                //  r = v_max(r, vx_load(&in[0][l + shift]));
+
+                    r = v_max(r, vx_load(&in[1][l - shift]));
+                    r = v_max(r, vx_load(&in[1][l        ]));
+                    r = v_max(r, vx_load(&in[1][l + shift]));
+
+                //  r = v_max(r, vx_load(&in[2][l - shift]));
+                    r = v_max(r, vx_load(&in[2][l        ]));
+                //  r = v_max(r, vx_load(&in[2][l + shift]));
+
+                    v_store(&out[l], r);
+                }
+
+                // tail (if any)
+                if (l < length)
+                {
+                    GAPI_DbgAssert(length >= nlanes);
+                    l = length - nlanes;
+                }
+            }
+            return;
+        }
+
+        for (int l=0; l < length;)
+        {
+            constexpr int nlanes = VT::nlanes;
+
+            // main part of output row
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                VT r = setall(std::numeric_limits<T>::min());
+
                 if (kernel[0][0]) r = v_max(r, vx_load(&in[0][l - shift]));
                 if (kernel[0][1]) r = v_max(r, vx_load(&in[0][l        ]));
                 if (kernel[0][2]) r = v_max(r, vx_load(&in[0][l + shift]));
@@ -1224,24 +1496,28 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
                 if (kernel[2][0]) r = v_max(r, vx_load(&in[2][l - shift]));
                 if (kernel[2][1]) r = v_max(r, vx_load(&in[2][l        ]));
                 if (kernel[2][2]) r = v_max(r, vx_load(&in[2][l + shift]));
-            }
 
-            v_store(&out[l], r);
-        }
+                v_store(&out[l], r);
+            }
 
-        // tail (if any)
-        if (l < length)
-        {
-            GAPI_DbgAssert(length >= nlanes);
-            l = length - nlanes;
+            // tail (if any)
+            if (l < length)
+            {
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
         }
+        return;
     }
+
+    CV_Error(cv::Error::StsBadArg, "unsupported morphology");
 }
 #endif
 
-template<Morphology morphology, typename T>
+template<typename T>
 static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
-                                   const uchar k[])
+                                   const uchar k[], MorphShape k_type,
+                                   Morphology morphology)
 {
 #if CV_SIMD
     int length = width * chan;
@@ -1251,54 +1527,50 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
 
     if (std::is_same<T, float>::value && length >= v_float32::nlanes)
     {
-        run_morphology3x3_simd<morphology, float, v_float32>(reinterpret_cast<float*>(out),
-                                                             reinterpret_cast<const float**>(in),
-                                                             width, chan, k, vx_setall_f32);
+        run_morphology3x3_simd<float, v_float32>(reinterpret_cast<float*>(out),
+                                                 reinterpret_cast<const float**>(in),
+                                                 width, chan, k, k_type, morphology,
+                                                 vx_setall_f32);
         return;
     }
 
     if (std::is_same<T, short>::value && length >= v_int16::nlanes)
     {
-        run_morphology3x3_simd<morphology, short, v_int16>(reinterpret_cast<short*>(out),
-                                                           reinterpret_cast<const short**>(in),
-                                                           width, chan, k, vx_setall_s16);
+        run_morphology3x3_simd<short, v_int16>(reinterpret_cast<short*>(out),
+                                               reinterpret_cast<const short**>(in),
+                                               width, chan, k, k_type, morphology,
+                                               vx_setall_s16);
         return;
     }
 
     if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
     {
-        run_morphology3x3_simd<morphology, ushort, v_uint16>(reinterpret_cast<ushort*>(out),
-                                                             reinterpret_cast<const ushort**>(in),
-                                                             width, chan, k, vx_setall_u16);
+        run_morphology3x3_simd<ushort, v_uint16>(reinterpret_cast<ushort*>(out),
+                                                 reinterpret_cast<const ushort**>(in),
+                                                 width, chan, k, k_type, morphology,
+                                                 vx_setall_u16);
         return;
     }
 
     if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
     {
-        run_morphology3x3_simd<morphology, uchar, v_uint8>(reinterpret_cast<uchar*>(out),
-                                                           reinterpret_cast<const uchar**>(in),
-                                                           width, chan, k, vx_setall_u8);
+        run_morphology3x3_simd<uchar, v_uint8>(reinterpret_cast<uchar*>(out),
+                                               reinterpret_cast<const uchar**>(in),
+                                               width, chan, k, k_type, morphology,
+                                               vx_setall_u8);
         return;
     }
 #endif  // CV_SIMD
 
-    run_morphology3x3_reference<morphology>(out, in, width, chan, k);
+    run_morphology3x3_reference(out, in, width, chan, k, k_type, morphology);
 }
 
 #define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
 void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
-                            const uchar k[], Morphology morphology)      \
+                            const uchar k[], MorphShape k_type,          \
+                            Morphology morphology)                       \
 {                                                                        \
-    if (M_ERODE == morphology)                                           \
-    {                                                                    \
-        run_morphology3x3_code<M_ERODE>(out, in, width, chan, k);        \
-    }                                                                    \
-    else if (M_DILATE == morphology)                                     \
-    {                                                                    \
-        run_morphology3x3_code<M_DILATE>(out, in, width, chan, k);       \
-    }                                                                    \
-    else                                                                 \
-        CV_Error(cv::Error::StsBadArg, "unsupported morphology operation"); \
+    run_morphology3x3_code(out, in, width, chan, k, k_type, morphology); \
 }
 
 RUN_MORPHOLOGY3X3_IMPL(uchar )