Merge pull request #13315 from elatkin:el/gapi_perf_erdilate
author Evgeny Latkin <evgeny.latkin@intel.com>
Wed, 28 Nov 2018 15:20:31 +0000 (18:20 +0300)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
Wed, 28 Nov 2018 15:20:31 +0000 (18:20 +0300)
GAPI (fluid): Erode/Dilate optimization (#13315)

* GAPI (fluid): Erode/Dilate optimization: hard-code 3x3 case

* GAPI (fluid): Erode/Dilate optimization: CPU dispatcher

* GAPI (fluid): Erode/Dilate optimization: 10-15x speed-up with CV_SIMD

modules/gapi/src/backends/fluid/gfluidimgproc.cpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp

index 60d6636..27e3562 100644 (file)
@@ -1164,8 +1164,6 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true)
 //
 //-----------------------------
 
-enum Morphology { M_ERODE, M_DILATE };
-
 template<typename DST, typename SRC>
 static void run_morphology(          Buffer&    dst,
                            const     View  &    src,
@@ -1175,6 +1173,10 @@ static void run_morphology(          Buffer&    dst,
                            const cv::Point & /* anchor */,
                                      Morphology morphology)
 {
+    static_assert(std::is_same<DST, SRC>::value, "unsupported combination of types");
+
+    GAPI_Assert(M_ERODE == morphology || M_DILATE == morphology);
+
     static const int maxLines = 9;
     GAPI_Assert(k_rows <= maxLines);
 
@@ -1194,43 +1196,44 @@ static void run_morphology(          Buffer&    dst,
     int width = dst.length();
     int chan  = dst.meta().chan;
 
-    for (int w=0; w < width; w++)
+    // call optimized code, if 3x3
+    if (3 == k_rows && 3 == k_cols)
     {
-        // TODO: make this cycle innermost
-        for (int c=0; c < chan; c++)
+        run_morphology3x3_impl(out, in, width, chan, k, morphology);
+        return;
+    }
+
+    // reference: any size of k[]
+    int length = width * chan;
+    for (int l=0; l < length; l++)
+    {
+        SRC result;
+        if (M_ERODE == morphology)
         {
-            SRC result=0;
-            if (M_ERODE == morphology)
-            {
-                result = std::numeric_limits<SRC>::max();
-            }
-            else if (M_DILATE == morphology)
-            {
-                result = std::numeric_limits<SRC>::min();
-            }
-            else
-                CV_Error(cv::Error::StsBadArg, "unsupported morphology operation");
+            result = std::numeric_limits<SRC>::max();
+        }
+        else // if (M_DILATE == morphology)
+        {
+            result = std::numeric_limits<SRC>::min();
+        }
 
-            for (int i=0; i < k_rows; i++)
-            for (int j=0; j < k_cols; j++)
+        for (int i=0; i < k_rows; i++)
+        for (int j=0; j < k_cols; j++)
+        {
+            if ( k[k_cols*i + j] )
             {
-                if ( k[k_cols*i + j] )
+                if (M_ERODE == morphology)
+                {
+                    result = (std::min)(result, in[i][l + (j - border_x)*chan]);
+                }
+                else // if (M_DILATE == morphology)
                 {
-                    if (M_ERODE == morphology)
-                    {
-                        result = std::min(result, in[i][(w + j - border_x)*chan + c]);
-                    }
-                    else if (M_DILATE == morphology)
-                    {
-                        result = std::max(result, in[i][(w + j - border_x)*chan + c]);
-                    }
-                    else
-                        CV_Error(cv::Error::StsBadArg, "unsupported morphology operation");
+                    result = (std::max)(result, in[i][l + (j - border_x)*chan]);
                 }
             }
-
-            out[w*chan + c] = saturate<DST>(result, rintf);
         }
+
+        out[l] = saturate<DST>(result, rintf);
     }
 }
 
index 0227a0a..e9eebfa 100644 (file)
@@ -111,6 +111,28 @@ RUN_FILTER2D_3X3_IMPL( float,  float)
 
 #undef RUN_FILTER2D_3X3_IMPL
 
+//-----------------------------
+//
+// Fluid kernels: Erode, Dilate
+//
+//-----------------------------
+
+// Defines the run_morphology3x3_impl() overload for element type T.
+// The body only forwards its arguments through CV_CPU_DISPATCH, using
+// CV_CPU_DISPATCH_MODES_ALL as the list of dispatch modes.
+// Instantiated below for uchar, ushort, short and float, then undefined.
+#define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
+void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
+                            const uchar k[], Morphology morphology)      \
+{                                                                        \
+    CV_CPU_DISPATCH(run_morphology3x3_impl,                              \
+        (out, in, width, chan, k, morphology),                           \
+        CV_CPU_DISPATCH_MODES_ALL);                                      \
+}
+
+RUN_MORPHOLOGY3X3_IMPL(uchar )
+RUN_MORPHOLOGY3X3_IMPL(ushort)
+RUN_MORPHOLOGY3X3_IMPL( short)
+RUN_MORPHOLOGY3X3_IMPL( float)
+
+#undef RUN_MORPHOLOGY3X3_IMPL
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
index db5aeda..6116c4b 100644 (file)
@@ -77,6 +77,25 @@ RUN_FILTER2D_3X3_IMPL( float,  float)
 
 #undef RUN_FILTER2D_3X3_IMPL
 
+//-----------------------------
+//
+// Fluid kernels: Erode, Dilate
+//
+//-----------------------------
+
+// Selects the morphological operation performed by the Erode/Dilate kernels.
+enum Morphology { M_ERODE, M_DILATE };
+
+// Declares the run_morphology3x3_impl() overload for element type T:
+//   out        - output row
+//   in         - array of input row pointers
+//   width      - row width, chan - number of channels
+//   k          - 3x3 kernel mask (9 values)
+//   morphology - M_ERODE or M_DILATE
+// Declared below for uchar, ushort, short and float, then undefined.
+#define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
+void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
+                            const uchar k[], Morphology morphology);
+
+RUN_MORPHOLOGY3X3_IMPL(uchar )
+RUN_MORPHOLOGY3X3_IMPL(ushort)
+RUN_MORPHOLOGY3X3_IMPL( short)
+RUN_MORPHOLOGY3X3_IMPL( float)
+
+#undef RUN_MORPHOLOGY3X3_IMPL
+
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
index 821a0ad..cdd5e82 100644 (file)
@@ -19,6 +19,8 @@
 #include <cstdint>
 #include <cstring>
 
+#include <algorithm>
+#include <limits>
 #include <vector>
 
 #ifdef __GNUC__
@@ -97,6 +99,23 @@ RUN_FILTER2D_3X3_IMPL( float,  float)
 
 #undef RUN_FILTER2D_3X3_IMPL
 
+//-----------------------------
+//
+// Fluid kernels: Erode, Dilate
+//
+//-----------------------------
+
+// Forward declaration of the run_morphology3x3_impl() overload for element
+// type T; emitted below for uchar, ushort, short and float, then undefined.
+#define RUN_MORPHOLOGY3X3_IMPL(T)                                        \
+void run_morphology3x3_impl(T out[], const T *in[], int width, int chan, \
+                            const uchar k[], Morphology morphology);
+
+RUN_MORPHOLOGY3X3_IMPL(uchar )
+RUN_MORPHOLOGY3X3_IMPL(ushort)
+RUN_MORPHOLOGY3X3_IMPL( short)
+RUN_MORPHOLOGY3X3_IMPL( float)
+
+#undef RUN_MORPHOLOGY3X3_IMPL
+
 //----------------------------------------------------------------------
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@@ -1099,6 +1118,196 @@ RUN_FILTER2D_3X3_IMPL( float,  float)
 
 #undef RUN_FILTER2D_3X3_IMPL
 
+//-----------------------------
+//
+// Fluid kernels: Erode, Dilate
+//
+//-----------------------------
+
+// Scalar (reference) 3x3 erode/dilate for one output row.
+//
+//   out    - output row, width*chan elements are written
+//   in     - three input rows, in[0]..in[2]; each row is read at element
+//            offsets from -chan up to width*chan - 1 + chan
+//   width  - row width, chan - number of channels
+//   k      - 3x3 kernel mask in row-major order; a non-zero entry includes
+//            the corresponding neighbor in the min/max reduction
+//
+// The operation is fixed at compile time: minimum for M_ERODE, maximum otherwise.
+template<Morphology morphology, typename T>
+static void run_morphology3x3_reference(T out[], const T *in[], int width, int chan,
+                                        const uchar k[])
+{
+    constexpr int k_size = 3;
+    constexpr int border = (k_size - 1) / 2;
+
+    const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}};
+
+    const int length = width * chan;
+    const int shift = border * chan;  // distance between horizontal neighbors
+
+    for (int l=0; l < length; l++)
+    {
+        // Identity element of the reduction. For dilation it must be lowest():
+        // for floating-point T, numeric_limits<T>::min() is the smallest positive
+        // normalized value and would win over an all-negative neighborhood.
+        // For integer T, lowest() and min() are the same value.
+        T result = M_ERODE == morphology? std::numeric_limits<T>::max():
+                                          std::numeric_limits<T>::lowest();
+
+        if (M_ERODE == morphology)
+        {
+            result = kernel[0][0]? (std::min)(result, in[0][l - shift]): result;
+            result = kernel[0][1]? (std::min)(result, in[0][l        ]): result;
+            result = kernel[0][2]? (std::min)(result, in[0][l + shift]): result;
+
+            result = kernel[1][0]? (std::min)(result, in[1][l - shift]): result;
+            result = kernel[1][1]? (std::min)(result, in[1][l        ]): result;
+            result = kernel[1][2]? (std::min)(result, in[1][l + shift]): result;
+
+            result = kernel[2][0]? (std::min)(result, in[2][l - shift]): result;
+            result = kernel[2][1]? (std::min)(result, in[2][l        ]): result;
+            result = kernel[2][2]? (std::min)(result, in[2][l + shift]): result;
+        }
+        else // if (M_DILATE == morphology)
+        {
+            result = kernel[0][0]? (std::max)(result, in[0][l - shift]): result;
+            result = kernel[0][1]? (std::max)(result, in[0][l        ]): result;
+            result = kernel[0][2]? (std::max)(result, in[0][l + shift]): result;
+
+            result = kernel[1][0]? (std::max)(result, in[1][l - shift]): result;
+            result = kernel[1][1]? (std::max)(result, in[1][l        ]): result;
+            result = kernel[1][2]? (std::max)(result, in[1][l + shift]): result;
+
+            result = kernel[2][0]? (std::max)(result, in[2][l - shift]): result;
+            result = kernel[2][1]? (std::max)(result, in[2][l        ]): result;
+            result = kernel[2][2]? (std::max)(result, in[2][l + shift]): result;
+        }
+
+        out[l] = result;
+    }
+}
+
+#if CV_SIMD
+// Vectorized 3x3 erode/dilate for one output row.
+//
+//   T      - element type, VT - vector type holding VT::nlanes elements of T
+//   out    - output row, width*chan elements are written
+//   in     - three input rows, in[0]..in[2]; read at element offsets from
+//            -chan up to width*chan - 1 + chan
+//   k      - 3x3 kernel mask in row-major order; non-zero entries take part
+//   setall - callable that builds a VT with every lane set to a given T value
+//
+// Requires width*chan >= VT::nlanes (checked by a debug assert): the tail is
+// handled by stepping back and recomputing the last full vector, which
+// overlaps outputs that were already written.
+template<Morphology morphology, typename T, typename VT, typename S>
+static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
+                                   const uchar k[], S setall)
+{
+    constexpr int k_size = 3;
+    constexpr int border = (k_size - 1) / 2;
+
+    const uchar kernel[3][3] = {{k[0], k[1], k[2]}, {k[3], k[4], k[5]}, {k[6], k[7], k[8]}};
+
+    const int length = width * chan;
+    const int shift = border * chan;  // distance between horizontal neighbors
+
+    // Identity element of the reduction, the same for every vector of the row.
+    // Dilation must start from lowest(): for float, numeric_limits<T>::min() is
+    // the smallest positive normalized value and would win over all-negative
+    // inputs. For integer T, lowest() and min() are the same value.
+    const VT r_init = M_ERODE == morphology? setall(std::numeric_limits<T>::max()):
+                                             setall(std::numeric_limits<T>::lowest());
+
+    for (int l=0; l < length;)
+    {
+        constexpr int nlanes = VT::nlanes;
+
+        // main part of output row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            VT r = r_init;
+
+            if (M_ERODE == morphology)
+            {
+                if (kernel[0][0]) r = v_min(r, vx_load(&in[0][l - shift]));
+                if (kernel[0][1]) r = v_min(r, vx_load(&in[0][l        ]));
+                if (kernel[0][2]) r = v_min(r, vx_load(&in[0][l + shift]));
+
+                if (kernel[1][0]) r = v_min(r, vx_load(&in[1][l - shift]));
+                if (kernel[1][1]) r = v_min(r, vx_load(&in[1][l        ]));
+                if (kernel[1][2]) r = v_min(r, vx_load(&in[1][l + shift]));
+
+                if (kernel[2][0]) r = v_min(r, vx_load(&in[2][l - shift]));
+                if (kernel[2][1]) r = v_min(r, vx_load(&in[2][l        ]));
+                if (kernel[2][2]) r = v_min(r, vx_load(&in[2][l + shift]));
+            }
+            else // if (M_DILATE == morphology)
+            {
+                if (kernel[0][0]) r = v_max(r, vx_load(&in[0][l - shift]));
+                if (kernel[0][1]) r = v_max(r, vx_load(&in[0][l        ]));
+                if (kernel[0][2]) r = v_max(r, vx_load(&in[0][l + shift]));
+
+                if (kernel[1][0]) r = v_max(r, vx_load(&in[1][l - shift]));
+                if (kernel[1][1]) r = v_max(r, vx_load(&in[1][l        ]));
+                if (kernel[1][2]) r = v_max(r, vx_load(&in[1][l + shift]));
+
+                if (kernel[2][0]) r = v_max(r, vx_load(&in[2][l - shift]));
+                if (kernel[2][1]) r = v_max(r, vx_load(&in[2][l        ]));
+                if (kernel[2][2]) r = v_max(r, vx_load(&in[2][l + shift]));
+            }
+
+            v_store(&out[l], r);
+        }
+
+        // tail (if any): step back so that one more full vector ends exactly
+        // at the end of the row, then run the main loop once more
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+}
+#endif
+
+// Picks the implementation of 3x3 erode/dilate for one row: the vectorized
+// routine when CV_SIMD is enabled and the row holds at least one full vector
+// of T, otherwise the scalar reference routine.
+template<Morphology morphology, typename T>
+static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
+                                   const uchar k[])
+{
+#if CV_SIMD
+    const int row_elems = width * chan;
+
+    // may be reported as unused when a type test below is false
+    (void) row_elems;
+
+    // Every branch is compiled for every T, so the pointers are cast to the
+    // branch's own element type; only the branch matching T can be taken.
+    const bool is_f32 = std::is_same<T, float>::value;
+    if (is_f32 && row_elems >= v_float32::nlanes)
+    {
+        float        *dst = reinterpret_cast<float*>(out);
+        const float **src = reinterpret_cast<const float**>(in);
+        run_morphology3x3_simd<morphology, float, v_float32>(dst, src, width, chan, k,
+                                                             vx_setall_f32);
+        return;
+    }
+
+    const bool is_s16 = std::is_same<T, short>::value;
+    if (is_s16 && row_elems >= v_int16::nlanes)
+    {
+        short        *dst = reinterpret_cast<short*>(out);
+        const short **src = reinterpret_cast<const short**>(in);
+        run_morphology3x3_simd<morphology, short, v_int16>(dst, src, width, chan, k,
+                                                           vx_setall_s16);
+        return;
+    }
+
+    const bool is_u16 = std::is_same<T, ushort>::value;
+    if (is_u16 && row_elems >= v_uint16::nlanes)
+    {
+        ushort        *dst = reinterpret_cast<ushort*>(out);
+        const ushort **src = reinterpret_cast<const ushort**>(in);
+        run_morphology3x3_simd<morphology, ushort, v_uint16>(dst, src, width, chan, k,
+                                                             vx_setall_u16);
+        return;
+    }
+
+    const bool is_u8 = std::is_same<T, uchar>::value;
+    if (is_u8 && row_elems >= v_uint8::nlanes)
+    {
+        uchar        *dst = reinterpret_cast<uchar*>(out);
+        const uchar **src = reinterpret_cast<const uchar**>(in);
+        run_morphology3x3_simd<morphology, uchar, v_uint8>(dst, src, width, chan, k,
+                                                           vx_setall_u8);
+        return;
+    }
+#endif  // CV_SIMD
+
+    // no SIMD, or the row is shorter than one vector
+    run_morphology3x3_reference<morphology>(out, in, width, chan, k);
+}
+
+// Defines run_morphology3x3_impl() for element type T: turns the run-time
+// `morphology` value into the compile-time template argument of
+// run_morphology3x3_code(), and raises StsBadArg for any other value.
+#define RUN_MORPHOLOGY3X3_IMPL(T)                                           \
+void run_morphology3x3_impl(T out[], const T *in[], int width, int chan,    \
+                            const uchar k[], Morphology morphology)         \
+{                                                                           \
+    switch (morphology)                                                     \
+    {                                                                       \
+    case M_ERODE:                                                           \
+        run_morphology3x3_code<M_ERODE>(out, in, width, chan, k);           \
+        break;                                                              \
+    case M_DILATE:                                                          \
+        run_morphology3x3_code<M_DILATE>(out, in, width, chan, k);          \
+        break;                                                              \
+    default:                                                                \
+        CV_Error(cv::Error::StsBadArg, "unsupported morphology operation"); \
+    }                                                                       \
+}
+
+RUN_MORPHOLOGY3X3_IMPL(uchar )
+RUN_MORPHOLOGY3X3_IMPL(ushort)
+RUN_MORPHOLOGY3X3_IMPL( short)
+RUN_MORPHOLOGY3X3_IMPL( float)
+
+#undef RUN_MORPHOLOGY3X3_IMPL
+
 //------------------------------------------------------------------------------
 
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY