Merge pull request #13329 from elatkin:el/gapi_perf_medblur
author: Evgeny Latkin <evgeny.latkin@intel.com>
Thu, 29 Nov 2018 15:02:29 +0000 (18:02 +0300)
committer: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Thu, 29 Nov 2018 15:02:29 +0000 (18:02 +0300)
GAPI (fluid): Median blur optimization (#13329)

* GAPI (fluid): Median blur optimization: reference 3x3

* GAPI (fluid): Median blur optimization: CPU dispatcher

* GAPI (fluid): Median blur optimization: manual CV_SIMD

modules/gapi/src/backends/fluid/gfluidimgproc.cpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp

index df44528..de8beb1 100644 (file)
@@ -1442,7 +1442,9 @@ static void run_medianblur(      Buffer& dst,
                            const View  & src,
                                  int     ksize)
 {
-    static const int kmax = 9;
+    static_assert(std::is_same<DST, SRC>::value, "unsupported combination of types");
+
+    constexpr int kmax = 9;
     GAPI_Assert(ksize <= kmax);
 
     const SRC *in[ kmax ];
@@ -1460,24 +1462,33 @@ static void run_medianblur(      Buffer& dst,
     int width = dst.length();
     int chan  = dst.meta().chan;
 
-    for (int w=0; w < width; w++)
+    // optimized: if 3x3
+
+    if (3 == ksize)
     {
-        // TODO: make this cycle innermost
-        for (int c=0; c < chan; c++)
-        {
-            SRC neighbours[kmax * kmax];
+        run_medblur3x3_impl(out, in, width, chan);
+        return;
+    }
 
-            for (int i=0; i < ksize; i++)
-            for (int j=0; j < ksize; j++)
-            {
-                neighbours[i*ksize + j] = in[i][(w + j - border)*chan + c];
-            }
+    // reference: any ksize
+
+    int length = width * chan;
+    int klength = ksize * ksize;
+    int klenhalf = klength / 2;
 
-            int length = ksize * ksize;
-            std::nth_element(neighbours, neighbours + length/2, neighbours + length);
+    for (int l=0; l < length; l++)
+    {
+        SRC neighbours[kmax * kmax];
 
-            out[w*chan + c] = saturate<DST>(neighbours[length/2], rintf);
+        for (int i=0; i < ksize; i++)
+        for (int j=0; j < ksize; j++)
+        {
+            neighbours[i*ksize + j] = in[i][l + (j - border)*chan];
         }
+
+        std::nth_element(neighbours, neighbours + klenhalf, neighbours + klength);
+
+        out[l] = saturate<DST>(neighbours[klenhalf], rintf);
     }
 }
 
index ccebc3f..835fb82 100644 (file)
@@ -134,6 +134,26 @@ RUN_MORPHOLOGY3X3_IMPL( float)
 
 #undef RUN_MORPHOLOGY3X3_IMPL
 
+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+#define RUN_MEDBLUR3X3_IMPL(T)                                        \
+void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
+{                                                                     \
+    CV_CPU_DISPATCH(run_medblur3x3_impl, (out, in, width, chan),      \
+        CV_CPU_DISPATCH_MODES_ALL);                                   \
+}  // each expansion defines one overload that passes its arguments unchanged to CV_CPU_DISPATCH (presumably selecting a CPU-specific build at run time -- confirm in the OpenCV dispatch docs)
+
+RUN_MEDBLUR3X3_IMPL(uchar )  // defines run_medblur3x3_impl(uchar out[], const uchar *in[], int, int)
+RUN_MEDBLUR3X3_IMPL(ushort)  // same for ushort
+RUN_MEDBLUR3X3_IMPL( short)  // same for short
+RUN_MEDBLUR3X3_IMPL( float)  // same for float
+
+#undef RUN_MEDBLUR3X3_IMPL  // helper macro is only needed for the four expansions above
+
 } // namespace fliud
 } // namespace gapi
 } // namespace cv
index 0fd8b65..191ac08 100644 (file)
@@ -99,6 +99,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)
 
 #undef RUN_MORPHOLOGY3X3_IMPL
 
+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+#define RUN_MEDBLUR3X3_IMPL(T) \
+void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);  // declaration only: one overload per element type expanded below
+
+RUN_MEDBLUR3X3_IMPL(uchar )  // declares the uchar overload
+RUN_MEDBLUR3X3_IMPL(ushort)  // declares the ushort overload
+RUN_MEDBLUR3X3_IMPL( short)  // declares the short overload
+RUN_MEDBLUR3X3_IMPL( float)  // declares the float overload
+
+#undef RUN_MEDBLUR3X3_IMPL  // helper macro is only needed for the four expansions above
+
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
index 79b474e..397d3b0 100644 (file)
@@ -117,6 +117,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)
 
 #undef RUN_MORPHOLOGY3X3_IMPL
 
+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+#define RUN_MEDBLUR3X3_IMPL(T) \
+void run_medblur3x3_impl(T out[], const T *in[], int width, int chan);  // declaration only; the matching definitions sit further down, inside the #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY section
+
+RUN_MEDBLUR3X3_IMPL(uchar )  // declares the uchar overload
+RUN_MEDBLUR3X3_IMPL(ushort)  // declares the ushort overload
+RUN_MEDBLUR3X3_IMPL( short)  // declares the short overload
+RUN_MEDBLUR3X3_IMPL( float)  // declares the float overload
+
+#undef RUN_MEDBLUR3X3_IMPL  // helper macro is only needed for the four expansions above
+
 //----------------------------------------------------------------------
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@@ -1580,6 +1596,180 @@ RUN_MORPHOLOGY3X3_IMPL( float)
 
 #undef RUN_MORPHOLOGY3X3_IMPL
 
+//---------------------------
+//
+// Fluid kernels: Median blur
+//
+//---------------------------
+
+template<typename T>
+static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan)  // scalar 3x3 median: out[l] receives the median of the nine values read around element l from rows in[0], in[1], in[2]
+{
+    constexpr int ksize = 3;
+    constexpr int border = (ksize - 1) / 2;  // evaluates to 1 for the 3x3 window
+
+    const int length = width * chan;  // number of elements written to out[]
+    const int shift = border * chan;  // index distance to the left/right neighbour (assumes channels are interleaved, so the same channel of the next pixel is chan elements away -- TODO confirm against caller)
+
+    for (int l=0; l < length; l++)
+    {
+        T t[3][3];
+
+        // neighbourhood 3x3: t[r][c] is taken from row in[r]; indices run from l - shift to l + shift, so every row must stay readable shift elements beyond both ends of the output range
+        t[0][0] = in[0][l - shift];    t[0][1] = in[0][l];    t[0][2] = in[0][l + shift];
+        t[1][0] = in[1][l - shift];    t[1][1] = in[1][l];    t[1][2] = in[1][l + shift];
+        t[2][0] = in[2][l - shift];    t[2][1] = in[2][l];    t[2][2] = in[2][l + shift];
+
+        // sort 2 values in place (compare-exchange): afterwards a holds the smaller and b the larger one
+        auto sort = [](T& a, T& b)
+        {
+            T u=a, v=b;
+            a = (std::min)(u, v);
+            b = (std::max)(u, v);
+        };
+
+        // horizontal: 3-elements bubble-sort per each row, giving t[r][0] <= t[r][1] <= t[r][2]
+        sort(t[0][0], t[0][1]);    sort(t[0][1], t[0][2]);    sort(t[0][0], t[0][1]);
+        sort(t[1][0], t[1][1]);    sort(t[1][1], t[1][2]);    sort(t[1][0], t[1][1]);
+        sort(t[2][0], t[2][1]);    sort(t[2][1], t[2][2]);    sort(t[2][0], t[2][1]);
+
+        // vertical: columns bubble-sort (although partial) -- only the maximum of column 0 (t[2][0]), the median of column 1 (t[1][1]) and the minimum of column 2 (t[0][2]) are used later, hence the two commented-out steps
+        sort(t[0][0], t[1][0]);    sort(t[0][1], t[1][1]);  /*sort(t[0][2], t[1][2]);*/
+        sort(t[1][0], t[2][0]);    sort(t[1][1], t[2][1]);    sort(t[1][2], t[2][2]);
+      /*sort(t[0][0], t[1][0]);*/  sort(t[0][1], t[1][1]);    sort(t[0][2], t[1][2]);
+
+        // diagonal: bubble-sort (in opposite order!) of t[2][0], t[1][1], t[0][2]; the middle one of these three ends up in t[1][1]
+        sort(t[1][1], t[0][2]);    sort(t[2][0], t[1][1]);    sort(t[1][1], t[0][2]);
+
+        out[l] = t[1][1];  // median of the nine neighbourhood values
+    }
+}
+
+#if CV_SIMD
+template<typename VT, typename T>
+static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)  // vector variant of the 3x3 median: same compare-exchange sequence as the scalar reference, applied to VT::nlanes elements at a time; requires width * chan >= VT::nlanes
+{
+    constexpr int ksize = 3;
+    constexpr int border = (ksize - 1) / 2;  // evaluates to 1 for the 3x3 window
+
+    const int length = width * chan;  // number of elements written to out[]
+    const int shift = border * chan;  // index distance to the left/right neighbour (assumes interleaved channels -- TODO confirm against caller)
+
+    for (int l=0; l < length;)  // no increment here: l is advanced by the inner loop and rewound once by the tail branch
+    {
+        constexpr int nlanes = VT::nlanes;
+
+        // main part of output row: runs while a whole vector of nlanes elements still fits before length
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            VT t00, t01, t02, t10, t11, t12, t20, t21, t22;
+
+            // neighbourhood 3x3: tRC is loaded from row in[R] at offsets l - shift, l, l + shift (vx_load presumably reads nlanes consecutive elements, possibly unaligned -- see universal intrinsics docs)
+
+            t00 = vx_load(&in[0][l - shift]);
+            t01 = vx_load(&in[0][l        ]);
+            t02 = vx_load(&in[0][l + shift]);
+
+            t10 = vx_load(&in[1][l - shift]);
+            t11 = vx_load(&in[1][l        ]);
+            t12 = vx_load(&in[1][l + shift]);
+
+            t20 = vx_load(&in[2][l - shift]);
+            t21 = vx_load(&in[2][l        ]);
+            t22 = vx_load(&in[2][l + shift]);
+
+            // sort 2 values in place via v_min / v_max: afterwards a holds the smaller and b the larger ones (presumably lane by lane)
+            auto sort = [](VT& a, VT& b)
+            {
+                VT u=a, v=b;
+                a = v_min(u, v);
+                b = v_max(u, v);
+            };
+
+            // horizontal: 3-elements bubble-sort per each row, giving tR0 <= tR1 <= tR2
+            sort(t00, t01);    sort(t01, t02);    sort(t00, t01);
+            sort(t10, t11);    sort(t11, t12);    sort(t10, t11);
+            sort(t20, t21);    sort(t21, t22);    sort(t20, t21);
+
+            // vertical: columns bubble-sort (although partial) -- only the maximum of column 0 (t20), the median of column 1 (t11) and the minimum of column 2 (t02) are used later, hence the two commented-out steps
+            sort(t00, t10);    sort(t01, t11);  /*sort(t02, t12);*/
+            sort(t10, t20);    sort(t11, t21);    sort(t12, t22);
+          /*sort(t00, t10);*/  sort(t01, t11);    sort(t02, t12);
+
+            // diagonal: bubble-sort (in opposite order!) of t20, t11, t02; the middle one of these three ends up in t11
+            sort(t11, t02);    sort(t20, t11);    sort(t11, t02);
+
+            v_store(&out[l], t11);  // t11 now holds the medians for this group of elements
+        }
+
+        // tail (if any): rewind l so that one more full vector ends exactly at length; the overlapping elements are simply computed and stored a second time
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);  // otherwise the rewind below would make l negative
+            l = length - nlanes;
+        }
+    }
+}
+#endif
+
+template<typename T>
+static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)  // picks the implementation: the SIMD routine when CV_SIMD is enabled, T is one of the four handled types and the row holds at least one full vector; the scalar reference otherwise
+{
+#if CV_SIMD
+    int length = width * chan;  // element count of the row, compared against the vector width below
+
+    // length variable may be unused if types do not match at 'if' statements below
+    (void) length;
+
+    if (std::is_same<T, float>::value && length >= v_float32::nlanes)  // plain 'if' on a compile-time trait: every branch is compiled for every T, so the reinterpret_casts below only keep mismatching instantiations well-formed; in the branch actually taken T already equals the target type
+    {
+        run_medblur3x3_simd<v_float32>(reinterpret_cast<float*>(out),
+                                       reinterpret_cast<const float**>(in),
+                                       width, chan);
+        return;
+    }
+
+    if (std::is_same<T, short>::value && length >= v_int16::nlanes)  // same pattern for short
+    {
+        run_medblur3x3_simd<v_int16>(reinterpret_cast<short*>(out),
+                                     reinterpret_cast<const short**>(in),
+                                     width, chan);
+        return;
+    }
+
+    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)  // same pattern for ushort
+    {
+        run_medblur3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
+                                      reinterpret_cast<const ushort**>(in),
+                                      width, chan);
+        return;
+    }
+
+    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)  // same pattern for uchar
+    {
+        run_medblur3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
+                                     reinterpret_cast<const uchar**>(in),
+                                     width, chan);
+        return;
+    }
+#endif
+
+    run_medblur3x3_reference(out, in, width, chan);  // fallback: reached when CV_SIMD is off or none of the branches above returned (e.g. row shorter than one vector)
+}
+
+#define RUN_MEDBLUR3X3_IMPL(T)                                        \
+void run_medblur3x3_impl(T out[], const T *in[], int width, int chan) \
+{                                                                     \
+    run_medblur3x3_code(out, in, width, chan);                        \
+}  // each expansion defines a non-template overload that forwards to the run_medblur3x3_code template
+
+RUN_MEDBLUR3X3_IMPL(uchar )  // defines run_medblur3x3_impl(uchar out[], const uchar *in[], int, int)
+RUN_MEDBLUR3X3_IMPL(ushort)  // same for ushort
+RUN_MEDBLUR3X3_IMPL( short)  // same for short
+RUN_MEDBLUR3X3_IMPL( float)  // same for float
+
+#undef RUN_MEDBLUR3X3_IMPL  // helper macro is only needed for the four expansions above
+
 //------------------------------------------------------------------------------
 
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY