From 6808d33b2f2829ea081cd1cef1c2fec40fcaf63e Mon Sep 17 00:00:00 2001 From: Evgeny Latkin Date: Tue, 27 Nov 2018 19:12:14 +0300 Subject: [PATCH] Merge pull request #13290 from elatkin:el/gapi_perf_filter2d GAPI (fluid): Filter 2D optimization (#13290) * GAPI (fluid): Filter 2D optimization: speedup 13x if float, 2x if integral * GAPI (fluid): Filter 2D speedup 8x if output is short/ushort * GAPI (fluid): Filter 2D speedup 7x if output is uchar * GAPI (fluid): Filter 2D optimization: fixed compiler warnings * GAPI (fluid): fix compiler warnings on Mac * GAPI (fluid): fix compiler warnings on Mac * GAPI (fluid): fix compiler errors on VS2015 * GAPI (fluid): fix compiler errors on VS2015 * GAPI (fluid): fix compiler errors on VS2015 --- modules/gapi/src/backends/fluid/gfluidimgproc.cpp | 40 ++-- .../backends/fluid/gfluidimgproc_func.dispatch.cpp | 25 ++ .../gapi/src/backends/fluid/gfluidimgproc_func.hpp | 20 ++ .../src/backends/fluid/gfluidimgproc_func.simd.hpp | 256 +++++++++++++++++++++ 4 files changed, 327 insertions(+), 14 deletions(-) diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp index 6b27b13..60d6636 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp @@ -1052,24 +1052,30 @@ static void run_filter2d(Buffer& dst, const View& src, int width = dst.length(); int chan = dst.meta().chan; + int length = width * chan; - for (int w=0; w < width; w++) + // manually optimized for 3x3 + if (k_rows == 3 && k_cols == 3) { - // TODO: make this cycle innermost - for (int c=0; c < chan; c++) - { - float sum = 0; - - for (int i=0; i < k_rows; i++) - for (int j=0; j < k_cols; j++) - { - sum += in[i][(w + j - border_x)*chan + c] * k[k_cols*i + j]; - } + float scale = 1; + run_filter2d_3x3_impl(out, in, width, chan, k, scale, delta); + return; + } - float result = sum + delta; + // reference: any kernel size + for (int l=0; l < length; l++) + { + float sum = 0; - out[w*chan + c] = saturate(result, rintf); + for (int i=0; i < k_rows; i++) + for (int j=0; j < k_cols; j++) + { + sum += in[i][l + (j - border_x)*chan] * k[k_cols*i + j]; } + + float result = sum + delta; + + out[l] = saturate(result, rintf); } } @@ -1097,6 +1103,7 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true) int k_rows = kernel.rows; int k_cols = kernel.cols; + const float *k = scratch.OutLine(); // copy of kernel.data // DST SRC OP __VA_ARGS__ @@ -1120,7 +1127,12 @@ GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true) const cv::Scalar & /* borderValue */, Buffer & scratch) { - cv::gapi::own::Size bufsize(kernel.rows * kernel.cols, 1); + int krows = kernel.rows; + int kcols = kernel.cols; + + int buflen = krows * kcols; // kernel size + + cv::gapi::own::Size bufsize(buflen, 1); GMatDesc bufdesc = {CV_32F, 1, bufsize}; Buffer buffer(bufdesc); scratch = std::move(buffer); diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp index b536bbf..0227a0a 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp @@ -86,6 +86,31 @@ RUN_SEPFILTER3X3_IMPL( float, float) #undef RUN_SEPFILTER3X3_IMPL +//------------------------- +// +// Fluid kernels: Filter 2D +// +//------------------------- + +#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \ +void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kernel[], float scale, float delta) \ +{ \ + CV_CPU_DISPATCH(run_filter2d_3x3_impl, \ + (out, in, width, chan, kernel, scale, delta), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +RUN_FILTER2D_3X3_IMPL(uchar , uchar ) +RUN_FILTER2D_3X3_IMPL(ushort, ushort) +RUN_FILTER2D_3X3_IMPL( short, short) +RUN_FILTER2D_3X3_IMPL( float, uchar ) +RUN_FILTER2D_3X3_IMPL( float, ushort) +RUN_FILTER2D_3X3_IMPL( float, short) +RUN_FILTER2D_3X3_IMPL( float, float) + +#undef RUN_FILTER2D_3X3_IMPL + } // namespace fliud } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp index 3b41c52..db5aeda 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp @@ -57,6 +57,26 @@ RUN_SEPFILTER3X3_IMPL( float, float) #undef RUN_SEPFILTER3X3_IMPL +//------------------------- +// +// Fluid kernels: Filter 2D +// +//------------------------- + +#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \ +void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kernel[], float scale, float delta); + +RUN_FILTER2D_3X3_IMPL(uchar , uchar ) +RUN_FILTER2D_3X3_IMPL(ushort, ushort) +RUN_FILTER2D_3X3_IMPL( short, short) +RUN_FILTER2D_3X3_IMPL( float, uchar ) +RUN_FILTER2D_3X3_IMPL( float, ushort) +RUN_FILTER2D_3X3_IMPL( float, short) +RUN_FILTER2D_3X3_IMPL( float, float) + +#undef RUN_FILTER2D_3X3_IMPL + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp index cd52b66..821a0ad 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp @@ -17,6 +17,7 @@ #include "opencv2/core/hal/intrin.hpp" #include +#include #include @@ -76,6 +77,26 @@ RUN_SEPFILTER3X3_IMPL( float, float) #undef RUN_SEPFILTER3X3_IMPL +//------------------------- +// +// Fluid kernels: Filter 2D +// +//------------------------- + +#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \ +void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kernel[], float scale, float delta); + +RUN_FILTER2D_3X3_IMPL(uchar , uchar ) +RUN_FILTER2D_3X3_IMPL(ushort, ushort) +RUN_FILTER2D_3X3_IMPL( short, short) +RUN_FILTER2D_3X3_IMPL( float, uchar ) +RUN_FILTER2D_3X3_IMPL( float, ushort) +RUN_FILTER2D_3X3_IMPL( float, short) +RUN_FILTER2D_3X3_IMPL( float, float) + +#undef RUN_FILTER2D_3X3_IMPL + //---------------------------------------------------------------------- #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY @@ -843,6 +864,241 @@ RUN_SEPFILTER3X3_IMPL( float, float) #undef RUN_SEPFILTER3X3_IMPL +//------------------------- +// +// Fluid kernels: Filter 2D +// +//------------------------- + +template +static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, int chan, + const float kernel[], float scale, float delta) +{ + static constexpr int ksize = 3; + static constexpr int border = (ksize - 1) / 2; + + const int length = width * chan; + const int shift = border * chan; + + const float k[3][3] = {{ kernel[0], kernel[1], kernel[2] }, + { kernel[3], kernel[4], kernel[5] }, + { kernel[6], kernel[7], kernel[8] }}; + + for (int l=0; l < length; l++) + { + float sum = in[0][l - shift] * k[0][0] + in[0][l] * k[0][1] + in[0][l + shift] * k[0][2] + + in[1][l - shift] * k[1][0] + in[1][l] * k[1][1] + in[1][l + shift] * k[1][2] + + in[2][l - shift] * k[2][0] + in[2][l] * k[2][1] + in[2][l + shift] * k[2][2]; + + if (!noscale) + { + sum = sum*scale + delta; + } + + out[l] = saturate(sum, rintf); + } +} + +#if CV_SIMD +// assume DST is short or ushort +template +static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan, + const float kernel[], float scale, float delta) +{ + static constexpr int ksize = 3; + static constexpr int border = (ksize - 1) / 2; + + const int length = width * chan; + const int shift = border * chan; + + const float k[3][3] = { + { kernel[0], kernel[1], kernel[2] }, + { kernel[3], kernel[4], kernel[5] }, + { kernel[6], kernel[7], kernel[8] } + }; + + for (int l=0; l < length;) + { + static constexpr int nlanes = v_int16::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + auto sumx = [in, shift, &k](int i, int j) + { + v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]); + s = v_fma(vx_load_f32(&in[i][j ]), vx_setall_f32(k[i][1]), s); + s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s); + return s; + }; + + int l0 = l; + int l1 = l + nlanes/2; + v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0); + v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1); + + if (!noscale) + { + sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); + sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); + } + + v_int32 res0 = v_round(sum0); + v_int32 res1 = v_round(sum1); + + if (std::is_same::value) + { + v_uint16 res = v_pack_u(res0, res1); + v_store(reinterpret_cast(&out[l]), res); + } + else // if DST == short + { + v_int16 res = v_pack(res0, res1); + v_store(reinterpret_cast(&out[l]), res); + } + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } +} + +template +static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, int chan, + const float kernel[], float scale, float delta) +{ + static constexpr int ksize = 3; + static constexpr int border = (ksize - 1) / 2; + + const int length = width * chan; + const int shift = border * chan; + + const float k[3][3] = { + { kernel[0], kernel[1], kernel[2] }, + { kernel[3], kernel[4], kernel[5] }, + { kernel[6], kernel[7], kernel[8] } + }; + + for (int l=0; l < length;) + { + static constexpr int nlanes = v_uint8::nlanes; + + // main part of output row + for (; l <= length - nlanes; l += nlanes) + { + auto sumx = [in, shift, &k](int i, int j) + { + v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]); + s = v_fma(vx_load_f32(&in[i][j ]), vx_setall_f32(k[i][1]), s); + s = v_fma(vx_load_f32(&in[i][j + shift]), vx_setall_f32(k[i][2]), s); + return s; + }; + + int l0 = l; + int l1 = l + nlanes/4; + int l2 = l + 2*nlanes/4; + int l3 = l + 3*nlanes/4; + v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0); + v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1); + v_float32 sum2 = sumx(0, l2) + sumx(1, l2) + sumx(2, l2); + v_float32 sum3 = sumx(0, l3) + sumx(1, l3) + sumx(2, l3); + + if (!noscale) + { + sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); + sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); + sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta)); + sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta)); + } + + v_int32 res0 = v_round(sum0); + v_int32 res1 = v_round(sum1); + v_int32 res2 = v_round(sum2); + v_int32 res3 = v_round(sum3); + + v_int16 resl = v_pack(res0, res1); + v_int16 resh = v_pack(res2, res3); + v_uint8 res = v_pack_u(resl, resh); + + v_store(&out[l], res); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } +} +#endif + +template +static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan, + const float kernel[], float scale, float delta) +{ +#if CV_SIMD + int length = width * chan; + + // length variable may be unused if types do not match at 'if' statements below + (void) length; + + if (std::is_same::value && length >= v_int16::nlanes) + { + run_filter2d_3x3_any2short(reinterpret_cast(out), in, + width, chan, kernel, scale, delta); + return; + } + + if (std::is_same::value && length >= v_uint16::nlanes) + { + run_filter2d_3x3_any2short(reinterpret_cast(out), in, + width, chan, kernel, scale, delta); + return; + } + + + if (std::is_same::value && length >= v_uint8::nlanes) + { + run_filter2d_3x3_any2char(reinterpret_cast(out), in, + width, chan, kernel, scale, delta); + return; + } +#endif // CV_SIMD + + run_filter2d_3x3_reference(out, in, width, chan, kernel, scale, delta); +} + +#define RUN_FILTER2D_3X3_IMPL(DST, SRC) \ +void run_filter2d_3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kernel[], float scale, float delta) \ +{ \ + if (scale == 1 && delta == 0) \ + { \ + constexpr bool noscale = true; \ + run_filter2d_3x3_code(out, in, width, chan, kernel, scale, delta); \ + } \ + else \ + { \ + constexpr bool noscale = false; \ + run_filter2d_3x3_code(out, in, width, chan, kernel, scale, delta); \ + } \ +} + +RUN_FILTER2D_3X3_IMPL(uchar , uchar ) +RUN_FILTER2D_3X3_IMPL(ushort, ushort) +RUN_FILTER2D_3X3_IMPL( short, short) +RUN_FILTER2D_3X3_IMPL( float, uchar ) +RUN_FILTER2D_3X3_IMPL( float, ushort) +RUN_FILTER2D_3X3_IMPL( float, short) +RUN_FILTER2D_3X3_IMPL( float, float) + +#undef RUN_FILTER2D_3X3_IMPL + //------------------------------------------------------------------------------ #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY -- 2.7.4