From 6808d33b2f2829ea081cd1cef1c2fec40fcaf63e Mon Sep 17 00:00:00 2001
From: Evgeny Latkin <evgeny.latkin@intel.com>
Date: Tue, 27 Nov 2018 19:12:14 +0300
Subject: [PATCH] Merge pull request #13290 from elatkin:el/gapi_perf_filter2d

GAPI (fluid): Filter 2D optimization (#13290)

* GAPI (fluid): Filter 2D optimization: speedup 13x if float, 2x if integral

* GAPI (fluid): Filter 2D speedup 8x if output is short/ushort

* GAPI (fluid): Filter 2D speedup 7x if output is uchar

* GAPI (fluid): Filter 2D optimization: fixed compiler warnings

* GAPI (fluid): fix compiler warnings on Mac

* GAPI (fluid): fix compiler warnings on Mac

* GAPI (fluid): fix compiler errors on VS2015

* GAPI (fluid): fix compiler errors on VS2015

* GAPI (fluid): fix compiler errors on VS2015
---
 modules/gapi/src/backends/fluid/gfluidimgproc.cpp  |  40 ++--
 .../backends/fluid/gfluidimgproc_func.dispatch.cpp |  25 ++
 .../gapi/src/backends/fluid/gfluidimgproc_func.hpp |  20 ++
 .../src/backends/fluid/gfluidimgproc_func.simd.hpp | 256 +++++++++++++++++++++
 4 files changed, 327 insertions(+), 14 deletions(-)
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
index 6b27b13..60d6636 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@@ -1052,24 +1052,30 @@ static void run_filter2d(Buffer& dst, const View& src,
 
     int width = dst.length();
     int chan  = dst.meta().chan;
+    int length = width * chan;
 
-    for (int w=0; w < width; w++)
+    // manually optimized for 3x3
+    if (k_rows == 3 && k_cols == 3)
     {
-        // TODO: make this cycle innermost
-        for (int c=0; c < chan; c++)
-        {
-            float sum = 0;
-
-            for (int i=0; i < k_rows; i++)
-            for (int j=0; j < k_cols; j++)
-            {
-                sum += in[i][(w + j - border_x)*chan + c] * k[k_cols*i + j];
-            }
+        float scale = 1;
+        run_filter2d_3x3_impl(out, in, width, chan, k, scale, delta);
+        return;
+    }
 
-            float result = sum + delta;
+    // reference: any kernel size
+    for (int l=0; l < length; l++)
+    {
+        float sum = 0;
 
-            out[w*chan + c] = saturate<DST>(result, rintf);
+        for (int i=0; i < k_rows; i++)
+        for (int j=0; j < k_cols; j++)
+        {
+            sum += in[i][l + (j - border_x)*chan] * k[k_cols*i + j];
         }
+
+        float result = sum + delta;
+
+        out[l] = saturate<DST>(result, rintf);
     }
 }
 
@@ -1097,6 +1103,7 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true)
 
         int k_rows = kernel.rows;
         int k_cols = kernel.cols;
+
         const float *k = scratch.OutLine<float>(); // copy of kernel.data
 
         //     DST     SRC     OP            __VA_ARGS__
@@ -1120,7 +1127,12 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true)
                             const cv::Scalar  & /* borderValue */,
                                       Buffer  &    scratch)
     {
-        cv::gapi::own::Size bufsize(kernel.rows * kernel.cols, 1);
+        int krows = kernel.rows;
+        int kcols = kernel.cols;
+
+        int buflen = krows * kcols;  // kernel size
+
+        cv::gapi::own::Size bufsize(buflen, 1);
         GMatDesc bufdesc = {CV_32F, 1, bufsize};
         Buffer buffer(bufdesc);
         scratch = std::move(buffer);
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
index b536bbf..0227a0a 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@@ -86,6 +86,31 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+//-------------------------
+//
+// Fluid kernels: Filter 2D
+// (one run_filter2d_3x3_impl() overload per DST/SRC pair, each forwarding via CV_CPU_DISPATCH)
+//-------------------------
+
+#define RUN_FILTER2D_3X3_IMPL(DST, SRC)                                     \
+void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kernel[], float scale, float delta)  \
+{                                                                           \
+    CV_CPU_DISPATCH(run_filter2d_3x3_impl,                                  \
+        (out, in, width, chan, kernel, scale, delta),                       \
+        CV_CPU_DISPATCH_MODES_ALL);                                         \
+}
+
+RUN_FILTER2D_3X3_IMPL(uchar , uchar )
+RUN_FILTER2D_3X3_IMPL(ushort, ushort)
+RUN_FILTER2D_3X3_IMPL( short,  short)
+RUN_FILTER2D_3X3_IMPL( float, uchar )
+RUN_FILTER2D_3X3_IMPL( float, ushort)
+RUN_FILTER2D_3X3_IMPL( float,  short)
+RUN_FILTER2D_3X3_IMPL( float,  float)
+
+#undef RUN_FILTER2D_3X3_IMPL  // helper macro is only needed for the list above
+
 } // namespace fliud
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
index 3b41c52..db5aeda 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@@ -57,6 +57,26 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+//-------------------------
+//
+// Fluid kernels: Filter 2D
+// (declarations of run_filter2d_3x3_impl(), one overload per DST/SRC pair listed below)
+//-------------------------
+
+#define RUN_FILTER2D_3X3_IMPL(DST, SRC)                                     \
+void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kernel[], float scale, float delta);
+
+RUN_FILTER2D_3X3_IMPL(uchar , uchar )
+RUN_FILTER2D_3X3_IMPL(ushort, ushort)
+RUN_FILTER2D_3X3_IMPL( short,  short)
+RUN_FILTER2D_3X3_IMPL( float, uchar )
+RUN_FILTER2D_3X3_IMPL( float, ushort)
+RUN_FILTER2D_3X3_IMPL( float,  short)
+RUN_FILTER2D_3X3_IMPL( float,  float)
+
+#undef RUN_FILTER2D_3X3_IMPL  // helper macro is only needed for the list above
+
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
index cd52b66..821a0ad 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@@ -17,6 +17,7 @@
 #include "opencv2/core/hal/intrin.hpp"
 
 #include <cstdint>
+#include <cstring>
 
 #include <vector>
 
@@ -76,6 +77,26 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+//-------------------------
+//
+// Fluid kernels: Filter 2D
+// (declarations only; definitions follow under #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY)
+//-------------------------
+
+#define RUN_FILTER2D_3X3_IMPL(DST, SRC)                                     \
+void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kernel[], float scale, float delta);
+
+RUN_FILTER2D_3X3_IMPL(uchar , uchar )
+RUN_FILTER2D_3X3_IMPL(ushort, ushort)
+RUN_FILTER2D_3X3_IMPL( short,  short)
+RUN_FILTER2D_3X3_IMPL( float, uchar )
+RUN_FILTER2D_3X3_IMPL( float, ushort)
+RUN_FILTER2D_3X3_IMPL( float,  short)
+RUN_FILTER2D_3X3_IMPL( float,  float)
+
+#undef RUN_FILTER2D_3X3_IMPL  // helper macro is only needed for the list above
+
 //----------------------------------------------------------------------
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
@@ -843,6 +864,241 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+//-------------------------
+//
+// Fluid kernels: Filter 2D
+//
+//-------------------------
+
+template<bool noscale, typename DST, typename SRC>  // scalar 3x3 filter; noscale == true skips scale/delta
+static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, int chan,
+                                       const float kernel[], float scale, float delta)
+{
+    static constexpr int ksize = 3;
+    static constexpr int border = (ksize - 1) / 2;  // 1 pixel on each side for a 3x3 kernel
+
+    const int length = width * chan;  // elements per output row (pixels x channels)
+    const int shift = border * chan;  // element offset to the same channel of the neighbouring pixel
+
+    const float k[3][3] = {{ kernel[0], kernel[1], kernel[2] },  // kernel[] is read as row-major 3x3
+                           { kernel[3], kernel[4], kernel[5] },
+                           { kernel[6], kernel[7], kernel[8] }};
+
+    for (int l=0; l < length; l++)  // reads in[0..2][l - shift .. l + shift], i.e. also outside [0, length)
+    {
+        float sum = in[0][l - shift] * k[0][0] + in[0][l] * k[0][1] + in[0][l + shift] * k[0][2]
+                  + in[1][l - shift] * k[1][0] + in[1][l] * k[1][1] + in[1][l + shift] * k[1][2]
+                  + in[2][l - shift] * k[2][0] + in[2][l] * k[2][1] + in[2][l + shift] * k[2][2];
+
+        if (!noscale)  // template constant, so the branch is fixed per instantiation
+        {
+            sum = sum*scale + delta;
+        }
+
+        out[l] = saturate<DST>(sum, rintf);  // saturate() receives rintf as the rounding function
+    }
+}
+
+#if CV_SIMD
+// SIMD 3x3 filter for 16-bit output: assume DST is short or ushort, and length >= v_int16::nlanes
+template<bool noscale, typename DST, typename SRC>
+static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan,
+                                       const float kernel[], float scale, float delta)
+{
+    static constexpr int ksize = 3;
+    static constexpr int border = (ksize - 1) / 2;
+
+    const int length = width * chan;  // elements per output row (pixels x channels)
+    const int shift = border * chan;  // element offset to the same channel of the neighbouring pixel
+
+    const float k[3][3] = {
+        { kernel[0], kernel[1], kernel[2] },
+        { kernel[3], kernel[4], kernel[5] },
+        { kernel[6], kernel[7], kernel[8] }
+    };
+
+    for (int l=0; l < length;)  // no increment here: l moves in the vector loop and the tail rewind
+    {
+        static constexpr int nlanes = v_int16::nlanes;  // outputs written per vector iteration
+
+        // main part of output row: whole vectors of nlanes outputs
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            auto sumx = [in, shift, &k](int i, int j)  // kernel row i applied to input row i at element j
+            {
+                v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
+                    s = v_fma(vx_load_f32(&in[i][j        ]),  vx_setall_f32(k[i][1]), s);
+                    s = v_fma(vx_load_f32(&in[i][j + shift]),  vx_setall_f32(k[i][2]), s);
+                return s;
+            };
+
+            int l0 = l;
+            int l1 = l + nlanes/2;  // second half; assumes a v_float32 holds nlanes/2 values
+            v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
+            v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
+
+            if (!noscale)  // template constant, so the branch is fixed per instantiation
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_int32 res0 = v_round(sum0);
+            v_int32 res1 = v_round(sum1);
+
+            if (std::is_same<DST, ushort>::value)
+            {
+                v_uint16 res = v_pack_u(res0, res1);  // two int32 vectors -> one unsigned 16-bit vector
+                v_store(reinterpret_cast<ushort*>(&out[l]), res);
+            }
+            else // if DST == short
+            {
+                v_int16 res = v_pack(res0, res1);  // two int32 vectors -> one signed 16-bit vector
+                v_store(reinterpret_cast<short*>(&out[l]), res);
+            }
+        }
+
+        // tail (if any): rewind so one more full vector ends exactly at length (overlap is recomputed)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);  // otherwise the rewind below would make l negative
+            l = length - nlanes;
+        }
+    }
+}
+
+template<bool noscale, typename SRC>  // SIMD 3x3 filter with uchar output; needs length >= v_uint8::nlanes
+static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
+                                      const float kernel[], float scale, float delta)
+{
+    static constexpr int ksize = 3;
+    static constexpr int border = (ksize - 1) / 2;
+
+    const int length = width * chan;  // elements per output row (pixels x channels)
+    const int shift = border * chan;  // element offset to the same channel of the neighbouring pixel
+
+    const float k[3][3] = {
+        { kernel[0], kernel[1], kernel[2] },
+        { kernel[3], kernel[4], kernel[5] },
+        { kernel[6], kernel[7], kernel[8] }
+    };
+
+    for (int l=0; l < length;)  // no increment here: l moves in the vector loop and the tail rewind
+    {
+        static constexpr int nlanes = v_uint8::nlanes;  // outputs written per vector iteration
+
+        // main part of output row: whole vectors of nlanes outputs
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            auto sumx = [in, shift, &k](int i, int j)  // kernel row i applied to input row i at element j
+            {
+                v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
+                    s = v_fma(vx_load_f32(&in[i][j        ]),  vx_setall_f32(k[i][1]), s);
+                    s = v_fma(vx_load_f32(&in[i][j + shift]),  vx_setall_f32(k[i][2]), s);
+                return s;
+            };
+
+            int l0 = l;  // four quarters of the output vector; assumes a v_float32 holds nlanes/4 values
+            int l1 = l +   nlanes/4;
+            int l2 = l + 2*nlanes/4;
+            int l3 = l + 3*nlanes/4;
+            v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
+            v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
+            v_float32 sum2 = sumx(0, l2) + sumx(1, l2) + sumx(2, l2);
+            v_float32 sum3 = sumx(0, l3) + sumx(1, l3) + sumx(2, l3);
+
+            if (!noscale)  // template constant, so the branch is fixed per instantiation
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_int32 res0 = v_round(sum0);
+            v_int32 res1 = v_round(sum1);
+            v_int32 res2 = v_round(sum2);
+            v_int32 res3 = v_round(sum3);
+
+            v_int16 resl = v_pack(res0, res1);  // narrow in two steps: 32-bit -> signed 16-bit ...
+            v_int16 resh = v_pack(res2, res3);
+            v_uint8 res = v_pack_u(resl, resh);  // ... then signed 16-bit -> unsigned 8-bit
+
+            v_store(&out[l], res);
+        }
+
+        // tail (if any): rewind so one more full vector ends exactly at length (overlap is recomputed)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);  // otherwise the rewind below would make l negative
+            l = length - nlanes;
+        }
+    }
+}
+#endif
+
+template<bool noscale, typename DST, typename SRC>  // picks a SIMD variant by DST and row length, else scalar
+static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan,
+                                  const float kernel[], float scale, float delta)
+{
+#if CV_SIMD
+    int length = width * chan;  // elements per output row (pixels x channels)
+
+    // length variable may be unused if types do not match at 'if' statements below
+    (void) length;
+
+    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)  // row must hold one full vector
+    {
+        run_filter2d_3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
+                                            width, chan, kernel, scale, delta);
+        return;
+    }
+
+    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)  // row must hold one full vector
+    {
+        run_filter2d_3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
+                                            width, chan, kernel, scale, delta);
+        return;
+    }
+
+
+    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)  // row must hold one full vector
+    {
+        run_filter2d_3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
+                                           width, chan, kernel, scale, delta);
+        return;
+    }
+#endif  // CV_SIMD
+
+    run_filter2d_3x3_reference<noscale>(out, in, width, chan, kernel, scale, delta);  // scalar fallback
+}
+
+#define RUN_FILTER2D_3X3_IMPL(DST, SRC)                                             \
+void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan,         \
+                           const float kernel[], float scale, float delta)          \
+{                                                                                   \
+    if (scale == 1 && delta == 0)                                                   \
+    {                                                                               \
+        constexpr bool noscale = true;                                              \
+        run_filter2d_3x3_code<noscale>(out, in, width, chan, kernel, scale, delta); \
+    }                                                                               \
+    else                                                                            \
+    {                                                                               \
+        constexpr bool noscale = false;                                             \
+        run_filter2d_3x3_code<noscale>(out, in, width, chan, kernel, scale, delta); \
+    }                                                                               \
+}
+// Each overload below tests (scale == 1 && delta == 0) at run time to select the noscale template variant.
+RUN_FILTER2D_3X3_IMPL(uchar , uchar )
+RUN_FILTER2D_3X3_IMPL(ushort, ushort)
+RUN_FILTER2D_3X3_IMPL( short,  short)
+RUN_FILTER2D_3X3_IMPL( float, uchar )
+RUN_FILTER2D_3X3_IMPL( float, ushort)
+RUN_FILTER2D_3X3_IMPL( float,  short)
+RUN_FILTER2D_3X3_IMPL( float,  float)
+
+#undef RUN_FILTER2D_3X3_IMPL  // helper macro is only needed for the list above
+
 //------------------------------------------------------------------------------
 
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-- 
2.7.4