From b19697e3ac21309a61b9c145cd396fe1a9dd71ad Mon Sep 17 00:00:00 2001 From: Anna Khakimova Date: Mon, 15 Nov 2021 20:16:25 +0300 Subject: [PATCH] Merge pull request #20914 from anna-khakimova:ak/simd_div GAPI Fluid: SIMD Div kernel. * HAL implementation for Div kernel * Removed dbg lines * Applied comments. * Reworked * Final version --- modules/gapi/CMakeLists.txt | 2 + modules/gapi/include/opencv2/gapi/core.hpp | 5 +- modules/gapi/perf/common/gapi_core_perf_tests.hpp | 2 +- .../gapi/perf/common/gapi_core_perf_tests_inl.hpp | 17 +- modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp | 3 +- .../gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp | 13 +- modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp | 3 +- modules/gapi/src/backends/fluid/gfluidcore.cpp | 36 +- .../backends/fluid/gfluidcore_func.dispatch.cpp | 63 +++ .../gapi/src/backends/fluid/gfluidcore_func.hpp | 44 ++ .../src/backends/fluid/gfluidcore_func.simd.hpp | 478 +++++++++++++++++++++ 11 files changed, 648 insertions(+), 18 deletions(-) create mode 100644 modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp create mode 100644 modules/gapi/src/backends/fluid/gfluidcore_func.hpp create mode 100644 modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt index d6b02f5..9a3cfa1 100644 --- a/modules/gapi/CMakeLists.txt +++ b/modules/gapi/CMakeLists.txt @@ -123,6 +123,7 @@ set(gapi_srcs src/backends/fluid/gfluidimgproc.cpp src/backends/fluid/gfluidimgproc_func.dispatch.cpp src/backends/fluid/gfluidcore.cpp + src/backends/fluid/gfluidcore_func.dispatch.cpp # OCL Backend (currently built-in) src/backends/ocl/goclbackend.cpp @@ -188,6 +189,7 @@ set(gapi_srcs ) ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2) +ocv_add_dispatched_file(backends/fluid/gfluidcore_func SSE4_1 AVX2) ocv_list_add_prefix(gapi_srcs "${CMAKE_CURRENT_LIST_DIR}/") diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp index e6077df..35f875a 100644 --- a/modules/gapi/include/opencv2/gapi/core.hpp +++ b/modules/gapi/include/opencv2/gapi/core.hpp @@ -770,7 +770,10 @@ GAPI_EXPORTS GMat mulC(const GScalar& multiplier, const GMat& src, int ddepth = The function divides one matrix by another: \f[\texttt{dst(I) = saturate(src1(I)*scale/src2(I))}\f] -When src2(I) is zero, dst(I) will also be zero. Different channels of +For integer types when src2(I) is zero, dst(I) will also be zero. +Floating point case returns Inf/NaN (according to IEEE). + +Different channels of multi-channel matrices are processed independently. The matrices can be single or multi channel. Output matrix must have the same size and depth as src. diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp index eb6d99d..0ae0210 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp @@ -35,7 +35,7 @@ namespace opencv_test class MulPerfTest : public TestPerfParams> {}; class MulDoublePerfTest : public TestPerfParams> {}; class MulCPerfTest : public TestPerfParams> {}; - class DivPerfTest : public TestPerfParams> {}; + class DivPerfTest : public TestPerfParams> {}; class DivCPerfTest : public TestPerfParams> {}; class DivRCPerfTest : public TestPerfParams> {}; class MaskPerfTest : public TestPerfParams> {}; diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp index f8f309a..937d49f 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp @@ -323,17 +323,23 @@ PERF_TEST_P_(DivPerfTest, TestPerformance) Size sz = get<1>(GetParam()); MatType type = get<2>(GetParam()); int dtype = get<3>(GetParam()); - cv::GCompileArgs compile_args = get<4>(GetParam()); + double scale = get<4>(GetParam()); + cv::GCompileArgs compile_args = get<5>(GetParam()); // FIXIT Unstable input data for divide initMatsRandU(type, sz, dtype, false); + //This condition need to workaround bug in OpenCV. + //It reinitializes divider matrix without zero values. + if (dtype == CV_16S && dtype != type) + cv::randu(in_mat2, cv::Scalar::all(1), cv::Scalar::all(255)); + // OpenCV code /////////////////////////////////////////////////////////// - cv::divide(in_mat1, in_mat2, out_mat_ocv, dtype); + cv::divide(in_mat1, in_mat2, out_mat_ocv, scale, dtype); // G-API code //////////////////////////////////////////////////////////// cv::GMat in1, in2, out; - out = cv::gapi::div(in1, in2, dtype); + out = cv::gapi::div(in1, in2, scale, dtype); cv::GComputation c(GIn(in1, in2), GOut(out)); // Warm-up graph engine: @@ -347,8 +353,9 @@ PERF_TEST_P_(DivPerfTest, TestPerformance) } // Comparison //////////////////////////////////////////////////////////// - // FIXIT unrealiable check: EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); - EXPECT_EQ(out_mat_gapi.size(), sz); + { + EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); + } SANITY_CHECK_NOTHING(); } diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp index 121b8ac..51b76fb 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp @@ -67,7 +67,8 @@ INSTANTIATE_TEST_CASE_P(DivPerfTestCPU, DivPerfTest, Combine(Values(AbsExact().to_compare_f()), Values(szSmall128, szVGA, sz720p, sz1080p), Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), - Values(-1, CV_8U, CV_16U, CV_32F), + Values(-1, CV_8U, CV_16U, CV_16S, CV_32F), + Values(2.3), Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(DivCPerfTestCPU, DivCPerfTest, diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp index df3a2ea..ffb46d1 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp @@ -60,12 +60,13 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest, // Values(-1, CV_8U, CV_16U, CV_32F), // Values(cv::compile_args(CORE_FLUID)))); -// INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest, -// Combine(Values(AbsExact().to_compare_f()), -// Values(szSmall128, szVGA, sz720p, sz1080p), -// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), -// Values(-1, CV_8U, CV_16U, CV_32F), -// Values(cv::compile_args(CORE_FLUID)))); + INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(-1, CV_8U, CV_16U, CV_16S, CV_32F), + Values(2.3), + Values(cv::compile_args(CORE_FLUID)))); // INSTANTIATE_TEST_CASE_P(DivCPerfTestFluid, DivCPerfTest, // Combine(Values(szSmall128, szVGA, sz720p, sz1080p), diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp index 4a38fdb..d2269c0 100644 --- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp +++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp @@ -62,10 +62,11 @@ INSTANTIATE_TEST_CASE_P(MulCPerfTestGPU, MulCPerfTest, Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(DivPerfTestGPU, DivPerfTest, - Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()), + Combine(Values(AbsTolerance(2).to_compare_f()), Values( szSmall128, szVGA, sz720p, sz1080p ), Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values( -1, CV_8U, CV_16U, CV_32F ), + Values(2.3), Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(DivCPerfTestGPU, DivCPerfTest, diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index 6cf76b2..3e81dfc 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -13,6 +13,10 @@ #include #include +#if CV_SIMD +#include "gfluidcore_func.hpp" +#endif + #include #include @@ -82,14 +86,26 @@ static inline DST mul(SRC1 x, SRC2 y, float scale=1) } template -static inline DST div(SRC1 x, SRC2 y, float scale=1) +static inline +typename std::enable_if::value, DST>::type +div(SRC1 x, SRC2 y, float scale=1) { - // like OpenCV: returns 0, if y=0 + // like OpenCV: returns 0, if DST type=uchar/short/ushort and divider(y)=0 auto result = y? scale * x / y: 0; return saturate(result, rintf); } template +static inline +typename std::enable_if::value, DST>::type +div(SRC1 x, SRC2 y, float scale = 1) +{ + // like OpenCV: returns inf/nan, if DST type=float and divider(y)=0 + auto result = scale * x / y; + return saturate(result, rintf); +} + +template static inline DST divr(SRC1 x, SRC2 y, float scale=1) { auto result = x? scale * y / x: 0; // reverse: y / x @@ -626,7 +642,7 @@ CV_ALWAYS_INLINE int sub_simd(const SRC in1[], const SRC in2[], DST out[], int l return 0; } -#endif +#endif // CV_SIMD template static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm, @@ -672,9 +688,14 @@ static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm a out[x] = mul(in1[x], in2[x], _scale); break; case ARITHM_DIVIDE: + { +#if CV_SIMD + x = div_simd(in1, in2, out, length, scale); +#endif for (; x < length; ++x) out[x] = div(in1[x], in2[x], _scale); break; + } default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation"); } } @@ -744,10 +765,19 @@ GAPI_FLUID_KERNEL(GFluidDiv, cv::gapi::core::GDiv, false) { // DST SRC1 SRC2 OP __VA_ARGS__ BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_(uchar, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_(uchar , short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_(uchar , float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_( short, short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_( short, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_( short, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_( short, float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_(ushort, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_(ushort, short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_(ushort, float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); + BINARY_( float, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_( float, short, short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); BINARY_( float, float, float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale); diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp new file mode 100644 index 0000000..814c881 --- /dev/null +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -0,0 +1,63 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2021 Intel Corporation + +#if !defined(GAPI_STANDALONE) + +#include "gfluidcore_func.hpp" +#include "gfluidcore_func.simd.hpp" + +#include "backends/fluid/gfluidcore_func.simd_declarations.hpp" + +#include "gfluidutils.hpp" + +#include +#include + +#include +#include + +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wstrict-overflow" +#endif + +namespace cv { +namespace gapi { +namespace fluid { + +#define DIV_SIMD(SRC, DST) \ +int div_simd(const SRC in1[], const SRC in2[], DST out[], \ + const int length, double _scale) \ +{ \ + CV_CPU_DISPATCH(div_simd, (in1, in2, out, length, _scale), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + + +DIV_SIMD(uchar, uchar) +DIV_SIMD(ushort, uchar) +DIV_SIMD(short, uchar) +DIV_SIMD(float, uchar) +DIV_SIMD(short, short) +DIV_SIMD(ushort, short) +DIV_SIMD(uchar, short) +DIV_SIMD(float, short) +DIV_SIMD(ushort, ushort) +DIV_SIMD(uchar, ushort) +DIV_SIMD(short, ushort) +DIV_SIMD(float, ushort) +DIV_SIMD(uchar, float) +DIV_SIMD(ushort, float) +DIV_SIMD(short, float) +DIV_SIMD(float, float) + +#undef DIV_SIMD + +} // namespace fluid +} // namespace gapi +} // namespace cv + +#endif // !defined(GAPI_STANDALONE) diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp new file mode 100644 index 0000000..f66aeeb --- /dev/null +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -0,0 +1,44 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2021 Intel Corporation + +#pragma once + +#if !defined(GAPI_STANDALONE) + +#include + +namespace cv { +namespace gapi { +namespace fluid { + +#define DIV_SIMD(SRC, DST) \ +int div_simd(const SRC in1[], const SRC in2[], DST out[], \ + const int length, double _scale); + +DIV_SIMD(uchar, uchar) +DIV_SIMD(ushort, uchar) +DIV_SIMD(short, uchar) +DIV_SIMD(float, uchar) +DIV_SIMD(short, short) +DIV_SIMD(ushort, short) +DIV_SIMD(uchar, short) +DIV_SIMD(float, short) +DIV_SIMD(ushort, ushort) +DIV_SIMD(uchar, ushort) +DIV_SIMD(short, ushort) +DIV_SIMD(float, ushort) +DIV_SIMD(uchar, float) +DIV_SIMD(ushort, float) +DIV_SIMD(short, float) +DIV_SIMD(float, float) + +#undef DIV_SIMD + +} // namespace fluid +} // namespace gapi +} // namespace cv + +#endif // !defined(GAPI_STANDALONE) diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp new file mode 100644 index 0000000..b6fd645 --- /dev/null +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -0,0 +1,478 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2021 Intel Corporation + +// NB: allow including this *.hpp several times! +// #pragma once -- don't: this file is NOT once! + +#if !defined(GAPI_STANDALONE) + +#include "opencv2/gapi/own/saturate.hpp" + +#include "opencv2/core.hpp" +#include +#include + +#include +#include + +#include +#include +#include + +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wstrict-overflow" +#endif + +using cv::gapi::own::saturate; + +namespace cv { +namespace gapi { +namespace fluid { + +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +#define DIV_SIMD(SRC, DST) \ +int div_simd(const SRC in1[], const SRC in2[], DST out[], \ + const int length, double _scale); + +DIV_SIMD(uchar, uchar) +DIV_SIMD(ushort, uchar) +DIV_SIMD(short, uchar) +DIV_SIMD(float, uchar) +DIV_SIMD(short, short) +DIV_SIMD(ushort, short) +DIV_SIMD(uchar, short) +DIV_SIMD(float, short) +DIV_SIMD(ushort, ushort) +DIV_SIMD(uchar, ushort) +DIV_SIMD(short, ushort) +DIV_SIMD(float, ushort) +DIV_SIMD(uchar, float) +DIV_SIMD(ushort, float) +DIV_SIMD(short, float) +DIV_SIMD(float, float) + +#undef DIV_SIMD + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +struct scale_tag {}; +struct not_scale_tag {}; + +template +struct vector_type_of; + +template +using vector_type_of_t = typename vector_type_of::type; + +template<> struct vector_type_of { using type = v_uint8; }; +template<> struct vector_type_of { using type = v_uint16; }; +template<> struct vector_type_of { using type = v_int16; }; + +CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in) +{ + return vx_load(in); +} + +CV_ALWAYS_INLINE v_float32 vg_load_f32(const ushort* in) +{ + return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in))); +} + +CV_ALWAYS_INLINE v_float32 vg_load_f32(const short* in) +{ + return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in))); +} + +CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in) +{ + return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in))); +} + +CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale) +{ + return (a*scale/div); +} + +CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_float32& div, const v_float32&) +{ + return a / div; +} + +CV_ALWAYS_INLINE void v_store_div(short* dst, v_int32& res1, v_int32& res2) +{ + vx_store(dst, v_pack(res1, res2)); +} + +CV_ALWAYS_INLINE void v_store_div(ushort* dst, v_int32& res1, v_int32& res2) +{ + vx_store(dst, v_pack_u(res1, res2)); +} + +CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int16& v_zero, + const v_int32& res1, const v_int32& res2) +{ + vx_store(dst, v_select(div == v_zero, v_zero, v_pack(res1, res2))); +} + +CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero, + const v_int32& res1, const v_int32& res2) +{ + v_uint16 sel = v_reinterpret_as_u16(v_select(div == v_zero, v_zero, v_pack(res1, res2))); + vx_store(dst, sel); +} + +//================================================================================================= + +template +CV_ALWAYS_INLINE +typename std::enable_if<(std::is_same::value && std::is_same::value) || + (std::is_same::value && std::is_same::value) || + (std::is_same::value && std::is_same::value), int>::type +div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale) +{ + constexpr int nlanes = vector_type_of_t::nlanes; + + if (length < nlanes) + return 0; + + v_int16 v_zero = vx_setall_s16(0); + v_float32 scale = vx_setall_f32(static_cast(_scale)); + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in1[x]); + v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]); + + v_int16 div = v_reinterpret_as_s16(vx_load(&in2[x])); + + v_float32 fdiv1 = v_cvt_f32(v_expand_low(div)); + v_float32 fdiv2 = v_cvt_f32(v_expand_high(div)); + + v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale)); + v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale)); + + v_store_select(&out[x], div, v_zero, r1, r2); + } + + if (x < length) + { + x = length - nlanes; + continue; // process one more time (unaligned tail) + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE +typename std::enable_if::value || + std::is_same::value, int>::type +div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale) +{ + constexpr int nlanes = v_uint8::nlanes; + + if (length < nlanes) + return 0; + + v_float32 scale = vx_setall_f32(static_cast(_scale)); + v_int16 v_zero = vx_setall_s16(0); + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in1[x]); + v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]); + v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]); + v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]); + + v_int16 div1 = v_reinterpret_as_s16(vx_load(&in2[x])); + v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2[x + nlanes/2])); + + v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1)); + v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1)); + v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2)); + v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2)); + + v_int32 sum1 = v_round(div_op(t, a1, fdiv1, scale)), + sum2 = v_round(div_op(t, a2, fdiv2, scale)), + sum3 = v_round(div_op(t, a3, fdiv3, scale)), + sum4 = v_round(div_op(t, a4, fdiv4, scale)); + + v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2)); + v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4)); + + vx_store(&out[x], v_pack_u(res1, res2)); + } + + if (x < length) + { + x = length - nlanes; + continue; // process one more time (unaligned tail) + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[], + const int length, double _scale) +{ + constexpr int nlanes = v_uint8::nlanes; + + if (length < nlanes) + return 0; + + v_float32 scale = vx_setall_f32(static_cast(_scale)); + v_float32 v_zero = vx_setall_f32(0); + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in1[x]); + v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]); + v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]); + v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]); + + v_float32 div1 = vg_load_f32(&in2[x]); + v_float32 div2 = vg_load_f32(&in2[x + nlanes / 4]); + v_float32 div3 = vg_load_f32(&in2[x + nlanes / 2]); + v_float32 div4 = vg_load_f32(&in2[x + 3 * nlanes / 4]); + + v_float32 r1 = div_op(t, a1, div1, scale); + v_float32 r2 = div_op(t, a2, div2, scale); + v_float32 r3 = div_op(t, a3, div3, scale); + v_float32 r4 = div_op(t, a4, div4, scale); + + v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1); + v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2); + v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3); + v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4); + + v_int32 res1 = v_round(sel1); + v_int32 res2 = v_round(sel2); + v_int32 res3 = v_round(sel3); + v_int32 res4 = v_round(sel4); + + vx_store(&out[x], v_pack_u(v_pack(res1, res2), v_pack(res3, res4))); + } + + if (x < length) + { + x = length - nlanes; + continue; // process one more time (unaligned tail) + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE +typename std::enable_if::value || + std::is_same::value, int>::type +div_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale) +{ + constexpr int nlanes = vector_type_of_t::nlanes; + + if (length < nlanes) + return 0; + + v_float32 scale = vx_setall_f32(static_cast(_scale)); + v_int16 v_zero = vx_setall_s16(0); + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in1[x]); + v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]); + + v_int16 div = v_reinterpret_as_s16(vx_load_expand(&in2[x])); + + v_float32 fdiv1 = v_cvt_f32(v_expand_low(div)); + v_float32 fdiv2 = v_cvt_f32(v_expand_high(div)); + + v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale)); + v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale)); + + v_store_select(&out[x], div, v_zero, r1, r2); + } + + if (x < length) + { + x = length - nlanes; + continue; // process one more time (unaligned tail) + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE +typename std::enable_if::value || + std::is_same::value, int>::type +div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale) +{ + constexpr int nlanes = vector_type_of_t::nlanes; + + if (length < nlanes) + return 0; + + v_float32 scale = vx_setall_f32(static_cast(_scale)); + v_float32 v_zero = vx_setall_f32(0); + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in1[x]); + v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]); + + v_float32 fdiv1 = vg_load_f32(&in2[x]); + v_float32 fdiv2 = vg_load_f32(&in2[x + nlanes / 2]); + + v_float32 r1 = div_op(t, a1, fdiv1, scale); + v_float32 r2 = div_op(t, a2, fdiv2, scale); + + v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1)); + v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2)); + + v_store_div(&out[x], res1, res2); + } + + if (x < length) + { + x = length - nlanes; + continue; // process one more time (unaligned tail) + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[], + const int length, double _scale) +{ + constexpr int nlanes = v_float32::nlanes; + + if (length < nlanes) + return 0; + + v_float32 scale = vx_setall_f32(static_cast(_scale)); + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + v_float32 a1 = vg_load_f32(&in1[x]); + v_float32 b1 = vg_load_f32(&in2[x]); + + vx_store(&out[x], div_op(t, a1, b1, scale)); + } + + if (x < length) + { + x = length - nlanes; + continue; // process one more time (unaligned tail) + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int div_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[], + const int length, double scale) +{ + hal::div8u(in1, static_cast(length), in2, static_cast(length), + out, static_cast(length), length, 1, &scale); + return length; +} + +template +CV_ALWAYS_INLINE int div_hal(scale_tag_t, const short in1[], const short in2[], short out[], + const int length, double scale) +{ + hal::div16s(in1, static_cast(length), in2, static_cast(length), + out, static_cast(length), length, 1, &scale); + return length; +} + +//------------------------------------------------------------------------------------------------- + +#define DIV_SIMD(SRC, DST) \ +int div_simd(const SRC in1[], const SRC in2[], DST out[], \ + const int length, double _scale) \ +{ \ + int x = 0; \ + float fscale = static_cast(_scale); \ + if (std::fabs(fscale - 1.0f) <= FLT_EPSILON) \ + { \ + not_scale_tag t; \ + x = div_hal(t, in1, in2, out, length, _scale); \ + } \ + else \ + { \ + scale_tag t; \ + x = div_hal(t, in1, in2, out, length, _scale); \ + } \ + return x; \ +} + +DIV_SIMD(uchar, uchar) +DIV_SIMD(ushort, uchar) +DIV_SIMD(short, uchar) +DIV_SIMD(float, uchar) +DIV_SIMD(short, short) +DIV_SIMD(ushort, short) +DIV_SIMD(uchar, short) +DIV_SIMD(float, short) +DIV_SIMD(ushort, ushort) +DIV_SIMD(uchar, ushort) +DIV_SIMD(short, ushort) +DIV_SIMD(float, ushort) +DIV_SIMD(uchar, float) +DIV_SIMD(ushort, float) +DIV_SIMD(short, float) +DIV_SIMD(float, float) + +#undef DIV_SIMD + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END + +} // namespace fluid +} // namespace gapi +} // namespace cv + +#endif // !defined(GAPI_STANDALONE) -- 2.7.4