From: Anna Khakimova
Date: Tue, 9 Nov 2021 09:44:11 +0000 (+0300)
Subject: Fluid: SIMD multiply kernel
X-Git-Tag: accepted/tizen/unified/20230127.161057~1^2~519^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c47673bf10ab620d28de8fe1d1d9b4e264221f0f;p=platform%2Fupstream%2Fopencv.git

Fluid: SIMD multiply kernel
---

diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
index 0ae0210..f5916a6 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -32,7 +32,7 @@ namespace opencv_test
     class SubPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
     class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
     class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
-    class MulPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class MulPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
     class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
     class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
     class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};

diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
index 937d49f..fbbda1a 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -208,19 +208,23 @@ PERF_TEST_P_(SubRCPerfTest, TestPerformance)
 PERF_TEST_P_(MulPerfTest, TestPerformance)
 {
-    Size sz = get<0>(GetParam());
-    MatType type = get<1>(GetParam());
-    int dtype = get<2>(GetParam());
-    cv::GCompileArgs compile_args = get<3>(GetParam());
+    compare_f cmpF;
+    cv::Size sz;
+    MatType type = -1;
+    int dtype = -1;
+    double scale = 1.0;
+    cv::GCompileArgs compile_args;
+
+    std::tie(cmpF, sz, type, dtype, scale, compile_args) = GetParam();

     initMatsRandU(type, sz, dtype, false);

     // OpenCV code ///////////////////////////////////////////////////////////
-    cv::multiply(in_mat1, in_mat2, out_mat_ocv, 1.0, dtype);
+    cv::multiply(in_mat1, in_mat2, out_mat_ocv, scale, dtype);

     // G-API code ////////////////////////////////////////////////////////////
     cv::GMat in1, in2, out;
-    out = cv::gapi::mul(in1, in2, 1.0, dtype);
+    out = cv::gapi::mul(in1, in2, scale, dtype);
     cv::GComputation c(GIn(in1, in2), GOut(out));

     // Warm-up graph engine:
@@ -234,8 +238,9 @@ PERF_TEST_P_(MulPerfTest, TestPerformance)
     }

     // Comparison ////////////////////////////////////////////////////////////
-    // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
-    EXPECT_EQ(out_mat_gapi.size(), sz);
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }

     SANITY_CHECK_NOTHING();
 }

diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
index 51b76fb..09196fd 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@@ -46,9 +46,11 @@ INSTANTIATE_TEST_CASE_P(SubRCPerfTestCPU, SubRCPerfTest,
         Values(cv::compile_args(CORE_CPU))));

 INSTANTIATE_TEST_CASE_P(MulPerfTestCPU, MulPerfTest,
-    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
         Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(2.0),
         Values(cv::compile_args(CORE_CPU))));

 INSTANTIATE_TEST_CASE_P(MulDoublePerfTestCPU, MulDoublePerfTest,
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
index ffb46d1..6c80231 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -42,11 +42,13 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
 //        Values(-1, CV_8U, CV_16U, CV_32F),
 //        Values(cv::compile_args(CORE_FLUID))));

-// INSTANTIATE_TEST_CASE_P(MulPerfTestFluid, MulPerfTest,
-//     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-//         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-//         Values(-1, CV_8U, CV_16U, CV_32F),
-//         Values(cv::compile_args(CORE_FLUID))));
+INSTANTIATE_TEST_CASE_P(MulPerfTestFluid, MulPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
+        Values(2.0),
+        Values(cv::compile_args(CORE_FLUID))));

 // INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest,
 //     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),

diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
index d2269c0..0b260bf 100644
--- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
+++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@@ -44,9 +44,11 @@ INSTANTIATE_TEST_CASE_P(SubRCPerfTestGPU, SubRCPerfTest,
         Values(cv::compile_args(CORE_GPU))));

 INSTANTIATE_TEST_CASE_P(MulPerfTestGPU, MulPerfTest,
-    Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+    Combine(Values(AbsExact().to_compare_f()),
+        Values( szSmall128, szVGA, sz720p, sz1080p ),
         Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
         Values( -1, CV_8U, CV_16U, CV_32F ),
+        Values(2.0),
         Values(cv::compile_args(CORE_GPU))));

 INSTANTIATE_TEST_CASE_P(MulDoublePerfTestGPU, MulDoublePerfTest,
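With the new parameterization, the G-API output is validated against the OpenCV reference through the compare_f functor passed as the first tuple element, replacing the old size-only assertion. A minimal sketch of such a functor, assuming AbsExact().to_compare_f() demands bit-exact results (illustration only; the real helper lives in G-API's test support code):

    #include <functional>
    #include <opencv2/core.hpp>

    using compare_f = std::function<bool(const cv::Mat&, const cv::Mat&)>;

    // Accept the G-API result only if no element differs from the reference.
    inline compare_f abs_exact_compare()
    {
        return [](const cv::Mat& out_gapi, const cv::Mat& out_ocv) {
            return cv::norm(out_gapi, out_ocv, cv::NORM_INF) == 0;
        };
    }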
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index 3e81dfc..d68ae73 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -684,9 +684,14 @@ static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm a
         break;
     }
     case ARITHM_MULTIPLY:
+    {
+#if CV_SIMD
+        x = mul_simd(in1, in2, out, length, scale);
+#endif
         for (; x < length; ++x)
             out[x] = mul<DST>(in1[x], in2[x], _scale);
         break;
+    }
     case ARITHM_DIVIDE:
     {
 #if CV_SIMD
@@ -745,13 +750,22 @@ GAPI_FLUID_KERNEL(GFluidMul, cv::gapi::core::GMul, false)
     static void run(const View &src1, const View &src2, double scale, int /*dtype*/, Buffer &dst)
     {
         //       DST     SRC1    SRC2    OP          __VA_ARGS__
-        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_(uchar , short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_(uchar , float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_( short, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_( float, short, short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
-        BINARY_( float, float, float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar,  uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar,  short,  short,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar,  float,  float,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(short,  short,  short,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(short,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(short,  uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(short,  float,  float,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(ushort, uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(ushort, short,  short,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(ushort, float,  float,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(float,  uchar,  uchar,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(float,  ushort, ushort, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(float,  short,  short,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(float,  float,  float,  run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);

         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }

diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index 814c881..297c065 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -56,6 +56,35 @@ DIV_SIMD(float, float)

 #undef DIV_SIMD

+
+#define MUL_SIMD(SRC, DST)                                          \
+int mul_simd(const SRC in1[], const SRC in2[], DST out[],           \
+             const int length, double _scale)                       \
+{                                                                   \
+    CV_CPU_DISPATCH(mul_simd, (in1, in2, out, length, _scale),      \
+                    CV_CPU_DISPATCH_MODES_ALL);                     \
+}
+
+
+MUL_SIMD(uchar, uchar)
+MUL_SIMD(ushort, uchar)
+MUL_SIMD(short, uchar)
+MUL_SIMD(float, uchar)
+MUL_SIMD(short, short)
+MUL_SIMD(ushort, short)
+MUL_SIMD(uchar, short)
+MUL_SIMD(float, short)
+MUL_SIMD(ushort, ushort)
+MUL_SIMD(uchar, ushort)
+MUL_SIMD(short, ushort)
+MUL_SIMD(float, ushort)
+MUL_SIMD(uchar, float)
+MUL_SIMD(ushort, float)
+MUL_SIMD(short, float)
+MUL_SIMD(float, float)
+
+#undef MUL_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv

diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index f66aeeb..3ae41c6 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -37,6 +37,29 @@ DIV_SIMD(float, float)

 #undef DIV_SIMD

+#define MUL_SIMD(SRC, DST)                                          \
+int mul_simd(const SRC in1[], const SRC in2[], DST out[],           \
+             const int length, double _scale);
+
+MUL_SIMD(uchar, uchar)
+MUL_SIMD(ushort, uchar)
+MUL_SIMD(short, uchar)
+MUL_SIMD(float, uchar)
+MUL_SIMD(short, short)
+MUL_SIMD(ushort, short)
+MUL_SIMD(uchar, short)
+MUL_SIMD(float, short)
+MUL_SIMD(ushort, ushort)
+MUL_SIMD(uchar, ushort)
+MUL_SIMD(short, ushort)
+MUL_SIMD(float, ushort)
+MUL_SIMD(uchar, float)
+MUL_SIMD(ushort, float)
+MUL_SIMD(short, float)
+MUL_SIMD(float, float)
+
+#undef MUL_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
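The gfluidcore.cpp hunk above fixes the division of labour: mul_simd returns how many elements it processed (0 when the row is shorter than one vector, or in builds without CV_SIMD), and the scalar loop finishes whatever remains. A self-contained sketch of that contract, with a simplified stub standing in for the dispatched SIMD routine (illustrative names, not OpenCV code; the real kernels also vectorize the tail by overlapping, as shown further below):

    // Stand-in for a dispatched mul_simd: handles as many full 8-lane blocks
    // as fit and reports how many elements it covered.
    static int mul_simd_stub(const float in1[], const float in2[], float out[],
                             int length, double scale)
    {
        const int nlanes = 8;
        if (length < nlanes) return 0;
        int x = 0;
        for (; x <= length - nlanes; x += nlanes)
            for (int i = 0; i < nlanes; ++i)      // imagine one vector op here
                out[x + i] = static_cast<float>(in1[x + i] * in2[x + i] * scale);
        return x;
    }

    static void mul_row(const float in1[], const float in2[], float out[],
                        int length, double scale)
    {
        int x = mul_simd_stub(in1, in2, out, length, scale);
        for (; x < length; ++x)                   // scalar tail and/or fallback
            out[x] = static_cast<float>(in1[x] * in2[x] * scale);
    }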
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index b6fd645..5139d54 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -58,6 +58,29 @@ DIV_SIMD(float, float)

 #undef DIV_SIMD

+#define MUL_SIMD(SRC, DST)                                          \
+int mul_simd(const SRC in1[], const SRC in2[], DST out[],           \
+             const int length, double _scale);
+
+MUL_SIMD(uchar, uchar)
+MUL_SIMD(ushort, uchar)
+MUL_SIMD(short, uchar)
+MUL_SIMD(float, uchar)
+MUL_SIMD(short, short)
+MUL_SIMD(ushort, short)
+MUL_SIMD(uchar, short)
+MUL_SIMD(float, short)
+MUL_SIMD(ushort, ushort)
+MUL_SIMD(uchar, ushort)
+MUL_SIMD(short, ushort)
+MUL_SIMD(float, ushort)
+MUL_SIMD(uchar, float)
+MUL_SIMD(ushort, float)
+MUL_SIMD(short, float)
+MUL_SIMD(float, float)
+
+#undef MUL_SIMD
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 struct scale_tag {};
@@ -93,6 +116,16 @@ CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in)
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
 }

+CV_ALWAYS_INLINE v_float32 mul_op(scale_tag, const v_float32& a, const v_float32& b, const v_float32& scale)
+{
+    return (scale*a * b);
+}
+
+CV_ALWAYS_INLINE v_float32 mul_op(not_scale_tag, const v_float32& a, const v_float32& b, const v_float32&)
+{
+    return a * b;
+}
+
 CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale)
 {
     return (a*scale/div);
@@ -103,12 +136,12 @@ CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_flo
 {
     return a / div;
 }

-CV_ALWAYS_INLINE void v_store_div(short* dst, v_int32& res1, v_int32& res2)
+CV_ALWAYS_INLINE void v_store_i16(short* dst, v_int32& res1, v_int32& res2)
 {
     vx_store(dst, v_pack(res1, res2));
 }

-CV_ALWAYS_INLINE void v_store_div(ushort* dst, v_int32& res1, v_int32& res2)
+CV_ALWAYS_INLINE void v_store_i16(ushort* dst, v_int32& res1, v_int32& res2)
 {
     vx_store(dst, v_pack_u(res1, res2));
 }
@@ -360,7 +393,7 @@ div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const in
         v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));
         v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));

-        v_store_div(&out[x], res1, res2);
+        v_store_i16(&out[x], res1, res2);
     }

     if (x < length)
@@ -467,6 +500,327 @@ DIV_SIMD(float, float)

 #undef DIV_SIMD

+//-------------------------
+//
+// Fluid kernels: Multiply
+//
+//-------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, short>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value) ||
+                        (std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value), int>::type
+mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_int16 a = v_reinterpret_as_s16(vx_load(&in1[x]));
+            v_int16 b = v_reinterpret_as_s16(vx_load(&in2[x]));
+
+            v_float32 a1 = v_cvt_f32(v_expand_low(a));
+            v_float32 a2 = v_cvt_f32(v_expand_high(a));
+
+            v_float32 b1 = v_cvt_f32(v_expand_low(b));
+            v_float32 b2 = v_cvt_f32(v_expand_high(b));
+
+            v_int32 r1 = v_round(mul_op(t, a1, b1, scale));
+            v_int32 r2 = v_round(mul_op(t, a2, b2, scale));
+
+            v_store_i16(&out[x], r1, r2);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
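Every mul_hal variant below shares the loop shape introduced above, and its tail handling deserves a note: instead of a scalar epilogue, the last vector is simply re-run over the final nlanes elements, overlapping output that was already written. A self-contained illustration of the idiom (plain C++ with illustrative names, not OpenCV code):

    constexpr int kLanes = 8;                    // stand-in for v_int16::nlanes

    void process_block(const int* src, int* dst) // stand-in for one SIMD iteration
    {
        for (int i = 0; i < kLanes; ++i) dst[i] = src[i] * 2;
    }

    int process(const int* src, int* dst, int length)
    {
        if (length < kLanes) return 0;           // too short for even one vector
        int x = 0;
        for (;;)
        {
            for (; x <= length - kLanes; x += kLanes)
                process_block(&src[x], &dst[x]);
            if (x < length)                      // leftover tail: back up so the
            {                                    // last (overlapping) block ends
                x = length - kLanes;             // exactly at `length`, then run
                continue;                        // one more pass
            }
            break;
        }
        return x;                                // == length here
    }

Because the overlapped elements are recomputed to the same values, the double store is harmless, and it avoids a per-element scalar tail.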
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<SRC, short>::value ||
+                        std::is_same<SRC, ushort>::value, int>::type
+mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_int16 a1 = v_reinterpret_as_s16(vx_load(&in1[x]));
+            v_int16 a2 = v_reinterpret_as_s16(vx_load(&in1[x + nlanes / 2]));
+
+            v_float32 fa1 = v_cvt_f32(v_expand_low(a1));
+            v_float32 fa2 = v_cvt_f32(v_expand_high(a1));
+            v_float32 fa3 = v_cvt_f32(v_expand_low(a2));
+            v_float32 fa4 = v_cvt_f32(v_expand_high(a2));
+
+            v_int16 b1 = v_reinterpret_as_s16(vx_load(&in2[x]));
+            v_int16 b2 = v_reinterpret_as_s16(vx_load(&in2[x + nlanes / 2]));
+
+            v_float32 fb1 = v_cvt_f32(v_expand_low(b1));
+            v_float32 fb2 = v_cvt_f32(v_expand_high(b1));
+            v_float32 fb3 = v_cvt_f32(v_expand_low(b2));
+            v_float32 fb4 = v_cvt_f32(v_expand_high(b2));
+
+            v_int32 sum1 = v_round(mul_op(t, fa1, fb1, scale)),
+                    sum2 = v_round(mul_op(t, fa2, fb2, scale)),
+                    sum3 = v_round(mul_op(t, fa3, fb3, scale)),
+                    sum4 = v_round(mul_op(t, fa4, fb4, scale));
+
+            v_int16 res1 = v_pack(sum1, sum2);
+            v_int16 res2 = v_pack(sum3, sum4);
+
+            vx_store(&out[x], v_pack_u(res1, res2));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
+                             const int length, double _scale)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in1[x]);
+            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
+            v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
+            v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
+
+            v_float32 b1 = vg_load_f32(&in2[x]);
+            v_float32 b2 = vg_load_f32(&in2[x + nlanes / 4]);
+            v_float32 b3 = vg_load_f32(&in2[x + nlanes / 2]);
+            v_float32 b4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);
+
+            v_int32 res1 = v_round(mul_op(t, a1, b1, scale));
+            v_int32 res2 = v_round(mul_op(t, a2, b2, scale));
+            v_int32 res3 = v_round(mul_op(t, a3, b3, scale));
+            v_int32 res4 = v_round(mul_op(t, a4, b4, scale));
+
+            vx_store(&out[x], v_pack_u(v_pack(res1, res2), v_pack(res3, res4)));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, int>::type
+mul_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_int16 a = v_reinterpret_as_s16(vx_load_expand(&in1[x]));
+            v_int16 b = v_reinterpret_as_s16(vx_load_expand(&in2[x]));
+
+            v_float32 a1 = v_cvt_f32(v_expand_low(a));
+            v_float32 a2 = v_cvt_f32(v_expand_high(a));
+
+            v_float32 b1 = v_cvt_f32(v_expand_low(b));
+            v_float32 b2 = v_cvt_f32(v_expand_high(b));
+
+            v_int32 r1 = v_round(mul_op(t, a1, b1, scale));
+            v_int32 r2 = v_round(mul_op(t, a2, b2, scale));
+
+            v_store_i16(&out[x], r1, r2);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
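The v_pack / v_pack_u stores in the kernels above narrow the rounded int32 products with saturation rather than modular wrap-around, matching cv::saturate_cast semantics. A scalar analogue, for intuition (illustration only, not part of the patch):

    #include <opencv2/core/saturate.hpp>

    void saturation_examples()
    {
        short  s = cv::saturate_cast<short>(70000);   // -> 32767, like v_pack
        ushort u = cv::saturate_cast<ushort>(-5);     // -> 0,     like v_pack_u
        uchar  c = cv::saturate_cast<uchar>(300);     // -> 255
        (void)s; (void)u; (void)c;
    }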
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, int>::type
+mul_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in1[x]);
+            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
+
+            v_float32 b1 = vg_load_f32(&in2[x]);
+            v_float32 b2 = vg_load_f32(&in2[x + nlanes / 2]);
+
+            v_int32 res1 = v_round(mul_op(t, a1, b1, scale));
+            v_int32 res2 = v_round(mul_op(t, a2, b2, scale));
+
+            v_store_i16(&out[x], res1, res2);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
+                             const int length, double _scale)
+{
+    constexpr int nlanes = v_float32::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = vg_load_f32(&in1[x]);
+            v_float32 b1 = vg_load_f32(&in2[x]);
+
+            vx_store(&out[x], mul_op(t, a1, b1, scale));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE int mul_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[],
+                             const int length, double scale)
+{
+    hal::mul8u(in1, static_cast<size_t>(length), in2, static_cast<size_t>(length),
+               out, static_cast<size_t>(length), length, 1, &scale);
+    return length;
+}
+
+#define MUL_SIMD(SRC, DST)                                          \
+int mul_simd(const SRC in1[], const SRC in2[], DST out[],           \
+             const int length, double _scale)                       \
+{                                                                   \
+    int x = 0;                                                      \
+    float fscale = static_cast<float>(_scale);                      \
+    if (std::fabs(fscale - 1.0f) <= FLT_EPSILON)                    \
+    {                                                               \
+        not_scale_tag t;                                            \
+        x = mul_hal(t, in1, in2, out, length, _scale);              \
+    }                                                               \
+    else                                                            \
+    {                                                               \
+        scale_tag t;                                                \
+        x = mul_hal(t, in1, in2, out, length, _scale);              \
+    }                                                               \
+    return x;                                                       \
+}
+
+MUL_SIMD(uchar, uchar)
+MUL_SIMD(ushort, uchar)
+MUL_SIMD(short, uchar)
+MUL_SIMD(float, uchar)
+MUL_SIMD(short, short)
+MUL_SIMD(ushort, short)
+MUL_SIMD(uchar, short)
+MUL_SIMD(float, short)
+MUL_SIMD(ushort, ushort)
+MUL_SIMD(uchar, ushort)
+MUL_SIMD(short, ushort)
+MUL_SIMD(float, ushort)
+MUL_SIMD(uchar, float)
+MUL_SIMD(ushort, float)
+MUL_SIMD(short, float)
+MUL_SIMD(float, float)
+
+#undef MUL_SIMD
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 CV_CPU_OPTIMIZATION_NAMESPACE_END
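Taken together, each mul_simd specialization above computes round(in1[x] * in2[x] * scale) saturated to the destination type. A scalar reference, as a reading aid (written for this note, not code from the patch):

    #include <opencv2/core/saturate.hpp>

    template<typename SRC, typename DST>
    void mul_scalar_ref(const SRC in1[], const SRC in2[], DST out[],
                        int length, double scale)
    {
        for (int x = 0; x < length; ++x)
            out[x] = cv::saturate_cast<DST>(in1[x] * in2[x] * scale);
    }

The uchar-to-uchar specialization is the one variant that takes a different route: it delegates to hal::mul8u, which performs the same scaled multiply through OpenCV's HAL. The scale_tag / not_scale_tag dispatch exists so that the common scale == 1.0 case skips the multiply by scale entirely inside the inner loop.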