From: Anna Khakimova Date: Fri, 3 Dec 2021 12:30:39 +0000 (+0300) Subject: Merge pull request #21177 from anna-khakimova:ak/simd_mulc X-Git-Tag: accepted/tizen/unified/20230127.161057~1^2~492 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c3910807c5a4ad0b32dac5881bc8054036a9225f;p=platform%2Fupstream%2Fopencv.git Merge pull request #21177 from anna-khakimova:ak/simd_mulc * GAPI Fluid: SIMD for MulC kernel. * Changes for MulDouble kernel. --- diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp index 97b12f8..4084ed3 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp @@ -33,8 +33,8 @@ namespace opencv_test class SubCPerfTest : public TestPerfParams> {}; class SubRCPerfTest : public TestPerfParams> {}; class MulPerfTest : public TestPerfParams> {}; - class MulDoublePerfTest : public TestPerfParams> {}; - class MulCPerfTest : public TestPerfParams> {}; + class MulDoublePerfTest : public TestPerfParams> {}; + class MulCPerfTest : public TestPerfParams> {}; class DivPerfTest : public TestPerfParams> {}; class DivCPerfTest : public TestPerfParams> {}; class DivRCPerfTest : public TestPerfParams> {}; diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp index 6c286a5..d4144cd 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp @@ -257,17 +257,21 @@ PERF_TEST_P_(MulPerfTest, TestPerformance) PERF_TEST_P_(MulDoublePerfTest, TestPerformance) { - Size sz = get<0>(GetParam()); - MatType type = get<1>(GetParam()); - int dtype = get<2>(GetParam()); - cv::GCompileArgs compile_args = get<3>(GetParam()); + compare_f cmpF; + cv::Size sz; + MatType type = -1; + int dtype = -1; + double scale = 1.0; + cv::GCompileArgs compile_args; + + std::tie(cmpF, sz, type, dtype, compile_args) = GetParam(); auto& rng = cv::theRNG(); double d = rng.uniform(0.0, 10.0); initMatrixRandU(type, sz, dtype, false); // OpenCV code /////////////////////////////////////////////////////////// - cv::multiply(in_mat1, d, out_mat_ocv, 1, dtype); + cv::multiply(in_mat1, d, out_mat_ocv, scale, dtype); // G-API code //////////////////////////////////////////////////////////// cv::GMat in1, out; @@ -285,8 +289,9 @@ PERF_TEST_P_(MulDoublePerfTest, TestPerformance) } // Comparison //////////////////////////////////////////////////////////// - // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv)); - EXPECT_EQ(out_mat_gapi.size(), sz); + { + EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); + } SANITY_CHECK_NOTHING(); } @@ -295,15 +300,19 @@ PERF_TEST_P_(MulDoublePerfTest, TestPerformance) PERF_TEST_P_(MulCPerfTest, TestPerformance) { - Size sz = get<0>(GetParam()); - MatType type = get<1>(GetParam()); - int dtype = get<2>(GetParam()); - cv::GCompileArgs compile_args = get<3>(GetParam()); + compare_f cmpF; + cv::Size sz; + MatType type = -1; + int dtype = -1; + double scale = 1.0; + cv::GCompileArgs compile_args; + + std::tie(cmpF, sz, type, dtype, compile_args) = GetParam(); initMatsRandU(type, sz, dtype, false); // OpenCV code /////////////////////////////////////////////////////////// - cv::multiply(in_mat1, sc, out_mat_ocv, 1, dtype); + cv::multiply(in_mat1, sc, out_mat_ocv, scale, dtype); // G-API code //////////////////////////////////////////////////////////// cv::GMat in1, out; @@ -322,8 +331,9 @@ PERF_TEST_P_(MulCPerfTest, TestPerformance) } // Comparison //////////////////////////////////////////////////////////// - // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv)); - EXPECT_EQ(out_mat_gapi.size(), sz); + { + EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); + } SANITY_CHECK_NOTHING(); } diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp index 31e9d25..1255f5c 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp @@ -56,13 +56,15 @@ INSTANTIATE_TEST_CASE_P(MulPerfTestCPU, MulPerfTest, Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(MulDoublePerfTestCPU, MulDoublePerfTest, - Combine(Values(szSmall128, szVGA, sz720p, sz1080p), - Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), - Values(-1, CV_8U, CV_16U, CV_32F), - Values(cv::compile_args(CORE_CPU)))); + Combine(Values(AbsExact().to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(-1, CV_8U, CV_16U, CV_32F), + Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(MulCPerfTestCPU, MulCPerfTest, - Combine(Values(szSmall128, szVGA, sz720p, sz1080p), + Combine(Values(AbsExact().to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), Values(-1, CV_8U, CV_16U, CV_32F), Values(cv::compile_args(CORE_CPU)))); diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp index 6ebd92d..058cff6 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp @@ -52,17 +52,19 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest, Values(2.0), Values(cv::compile_args(CORE_FLUID)))); -// INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest, -// Combine(Values(szSmall128, szVGA, sz720p, sz1080p), -// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), -// Values(-1, CV_8U, CV_16U, CV_32F), -// Values(cv::compile_args(CORE_FLUID)))); + INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest, + Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), + Values(CV_8UC1, CV_8UC3, CV_16SC1, CV_32FC1), + Values(-1, CV_8U, CV_32F), + Values(cv::compile_args(CORE_FLUID)))); -// INSTANTIATE_TEST_CASE_P(MulCPerfTestFluid, MulCPerfTest, -// Combine(Values(szSmall128, szVGA, sz720p, sz1080p), -// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), -// Values(-1, CV_8U, CV_16U, CV_32F), -// Values(cv::compile_args(CORE_FLUID)))); + INSTANTIATE_TEST_CASE_P(MulCPerfTestFluid, MulCPerfTest, + Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(-1, CV_8U, CV_16U, CV_16S, CV_32F), + Values(cv::compile_args(CORE_FLUID)))); INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest, Combine(Values(AbsExact().to_compare_f()), diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp index b4207c2..025ea53 100644 --- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp +++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp @@ -54,13 +54,15 @@ INSTANTIATE_TEST_CASE_P(MulPerfTestGPU, MulPerfTest, Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(MulDoublePerfTestGPU, MulDoublePerfTest, - Combine(Values( szSmall128, szVGA, sz720p, sz1080p ), + Combine(Values(AbsExact().to_compare_f()), + Values( szSmall128, szVGA, sz720p, sz1080p ), Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values( -1, CV_8U, CV_16U, CV_32F ), Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(MulCPerfTestGPU, MulCPerfTest, - Combine(Values( szSmall128, szVGA, sz720p, sz1080p ), + Combine(Values(AbsExact().to_compare_f()), + Values( szSmall128, szVGA, sz720p, sz1080p ), Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values( -1, CV_8U, CV_16U, CV_32F ), Values(cv::compile_args(CORE_GPU)))); diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index a737ad6..a0513a0 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -1265,12 +1265,12 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca { case ARITHM_ADD: { - int w = 0; + int w = 0; #if CV_SIMD - w = addc_simd(in, scalar, out, length, chan); + w = addc_simd(in, scalar, out, length, chan); #endif - for (; w < length; ++w) - out[w] = add(in[w], scalar[w % chan]); + for (; w < length; ++w) + out[w] = add(in[w], scalar[w % chan]); break; } @@ -1284,12 +1284,17 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca out[w] = sub(in[w], scalar[w % chan]); break; } - // TODO: optimize miltiplication and division case ARITHM_MULTIPLY: - for (int w=0; w < width; w++) - for (int c=0; c < chan; c++) - out[chan*w + c] = mul(in[chan*w + c], scalar[c], scale); + { + int w = 0; +#if CV_SIMD + w = mulc_simd(in, scalar, out, length, chan, scale); +#endif + for (; w < width; ++w) + for (int c = 0; c < chan; ++c) + out[chan * w + c] = mul(in[chan * w + c], scalar[c], scale); break; + } case ARITHM_DIVIDE: for (int w=0; w < width; w++) for (int c=0; c < chan; c++) @@ -1539,45 +1544,73 @@ GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false) } }; -GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, false) +GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, true) { static const int Window = 1; - static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst) + static void run(const View& src, const cv::Scalar& _scalar, int /*dtype*/, + Buffer& dst, Buffer& scratch) { - const float scalar[4] = { - static_cast(_scalar[0]), - static_cast(_scalar[1]), - static_cast(_scalar[2]), - static_cast(_scalar[3]) - }; - const float scale = 1.f; + GAPI_Assert(src.meta().chan <= 4); + + if (dst.y() == 0) + { + const int chan = src.meta().chan; + float* sc = scratch.OutLine(); + + for (int i = 0; i < scratch.length(); ++i) + sc[i] = static_cast(_scalar[i % chan]); + } + const float* scalar = scratch.OutLine(); + const float scale = 1.0; // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); - UNARY_(uchar , short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); - UNARY_(uchar , float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); - UNARY_( short, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); - UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); - UNARY_( float, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); - UNARY_( float, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(uchar, uchar, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(uchar, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(uchar, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(uchar, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(ushort, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(ushort, uchar, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(ushort, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(short, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(short, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(short, uchar, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(short, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(float, uchar, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(float, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(float, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); + UNARY_(float, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } + + static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch) + { + initScratchBuffer(scratch); + } + + static void resetScratch(Buffer& /*scratch*/) + { + } }; -GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, false) +GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, true) { static const int Window = 1; - static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst) + static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst, Buffer& scratch) { - const float scalar[4] = { - static_cast(_scalar), - static_cast(_scalar), - static_cast(_scalar), - static_cast(_scalar) - }; + GAPI_Assert(src.meta().chan <= 4); + + if (dst.y() == 0) + { + float* sc = scratch.OutLine(); + + for (int i = 0; i < scratch.length(); ++i) + sc[i] = static_cast(_scalar); + } + const float* scalar = scratch.OutLine(); const float scale = 1.f; // DST SRC OP __VA_ARGS__ @@ -1591,6 +1624,15 @@ GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, false) CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } + + static void initScratch(const GMatDesc&, double, int, Buffer& scratch) + { + initScratchBuffer(scratch); + } + + static void resetScratch(Buffer& /*scratch*/) + { + } }; GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, false) diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index 668ac3a..f596779 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -138,6 +138,33 @@ SUBC_SIMD(float, float) #undef SUBC_SIMD +#define MULC_SIMD(SRC, DST) \ +int mulc_simd(const SRC in[], const float scalar[], DST out[], \ + const int length, const int chan, const float scale) \ +{ \ + CV_CPU_DISPATCH(mulc_simd, (in, scalar, out, length, chan, scale), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +MULC_SIMD(uchar, uchar) +MULC_SIMD(ushort, uchar) +MULC_SIMD(short, uchar) +MULC_SIMD(float, uchar) +MULC_SIMD(short, short) +MULC_SIMD(ushort, short) +MULC_SIMD(uchar, short) +MULC_SIMD(float, short) +MULC_SIMD(ushort, ushort) +MULC_SIMD(uchar, ushort) +MULC_SIMD(short, ushort) +MULC_SIMD(float, ushort) +MULC_SIMD(uchar, float) +MULC_SIMD(ushort, float) +MULC_SIMD(short, float) +MULC_SIMD(float, float) + +#undef MULC_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index e6c0d4f..541870e 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -106,6 +106,29 @@ SUBC_SIMD(float, float) #undef SUBC_SIMD +#define MULC_SIMD(SRC, DST) \ +int mulc_simd(const SRC in[], const float scalar[], DST out[], \ + const int length, const int chan, const float scale); + +MULC_SIMD(uchar, uchar) +MULC_SIMD(ushort, uchar) +MULC_SIMD(short, uchar) +MULC_SIMD(float, uchar) +MULC_SIMD(short, short) +MULC_SIMD(ushort, short) +MULC_SIMD(uchar, short) +MULC_SIMD(float, short) +MULC_SIMD(ushort, ushort) +MULC_SIMD(uchar, ushort) +MULC_SIMD(short, ushort) +MULC_SIMD(float, ushort) +MULC_SIMD(uchar, float) +MULC_SIMD(ushort, float) +MULC_SIMD(short, float) +MULC_SIMD(float, float) + +#undef MULC_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp index aed5359..4597413 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -127,6 +127,30 @@ SUBC_SIMD(float, float) #undef SUBC_SIMD + +#define MULC_SIMD(SRC, DST) \ +int mulc_simd(const SRC in[], const float scalar[], DST out[], \ + const int length, const int chan, const float scale); + +MULC_SIMD(uchar, uchar) +MULC_SIMD(ushort, uchar) +MULC_SIMD(short, uchar) +MULC_SIMD(float, uchar) +MULC_SIMD(short, short) +MULC_SIMD(ushort, short) +MULC_SIMD(uchar, short) +MULC_SIMD(float, short) +MULC_SIMD(ushort, ushort) +MULC_SIMD(uchar, ushort) +MULC_SIMD(short, ushort) +MULC_SIMD(float, ushort) +MULC_SIMD(uchar, float) +MULC_SIMD(ushort, float) +MULC_SIMD(short, float) +MULC_SIMD(float, float) + +#undef MULC_SIMD + #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY struct scale_tag {}; @@ -870,12 +894,13 @@ MUL_SIMD(float, float) //------------------------- // -// Fluid kernels: AddC +// Fluid kernels: AddC, SubC // //------------------------- struct add_tag {}; struct sub_tag {}; +struct mul_tag {}; CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1, const v_int32& c2, const v_int32& c3, @@ -909,6 +934,12 @@ CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc return a - sc; } +CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc) +{ + return a * sc; +} +//------------------------------------------------------------------------------------------------- + template CV_ALWAYS_INLINE typename std::enable_if<(std::is_same::value || @@ -957,7 +988,7 @@ CV_ALWAYS_INLINE typename std::enable_if::value || std::is_same::value, void>::type arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2, - const v_float32& s3, const int nlanes) + const v_float32& s3, const int nlanes) { v_float32 a1 = vg_load_f32(inx); v_float32 a2 = vg_load_f32(&inx[nlanes / 2]); @@ -1089,7 +1120,7 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[], return x; } - +//------------------------------------------------------------------------------------------------- #define ADDC_SIMD(SRC, DST) \ int addc_simd(const SRC in[], const float scalar[], DST out[], \ @@ -1129,6 +1160,8 @@ ADDC_SIMD(float, float) #undef ADDC_SIMD +//------------------------------------------------------------------------------------------------- + #define SUBC_SIMD(SRC, DST) \ int subc_simd(const SRC in[], const float scalar[], DST out[], \ const int length, const int chan) \ @@ -1167,6 +1200,256 @@ SUBC_SIMD(float, float) #undef SUBC_SIMD +//------------------------- +// +// Fluid kernels: MulC +// +//------------------------- + +template +CV_ALWAYS_INLINE +typename std::enable_if::value || + std::is_same::value, void>::type +mulc_scale_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2, + const v_float32& s3, const v_float32& scale, const int nlanes) +{ + v_float32 a1 = vg_load_f32(inx); + v_float32 a2 = vg_load_f32(&inx[nlanes / 2]); + v_float32 a3 = vg_load_f32(&inx[nlanes]); + v_float32 a4 = vg_load_f32(&inx[3 * nlanes / 2]); + v_float32 a5 = vg_load_f32(&inx[2 * nlanes]); + v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]); + + arithmOpScalar_pack_store_c3(outx, v_round(scale*a1*s1), + v_round(scale*a2*s2), + v_round(scale*a3*s3), + v_round(scale*a4*s1), + v_round(scale*a5*s2), + v_round(scale*a6*s3)); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* inx, uchar* outx, + const v_float32& s1, const v_float32& s2, + const v_float32& s3, const v_float32& scale, const int nlanes) +{ + vx_store(outx, + v_pack_u(v_pack(v_round(scale * vg_load_f32(inx)* s1), + v_round(scale * vg_load_f32(&inx[nlanes/4])* s2)), + v_pack(v_round(scale * vg_load_f32(&inx[nlanes/2])* s3), + v_round(scale * vg_load_f32(&inx[3*nlanes/4])* s1)))); + + vx_store(&outx[nlanes], + v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[nlanes])* s2), + v_round(scale * vg_load_f32(&inx[5*nlanes/4])* s3)), + v_pack(v_round(scale * vg_load_f32(&inx[3*nlanes/2])* s1), + v_round(scale * vg_load_f32(&inx[7*nlanes/4])* s2)))); + + vx_store(&outx[2 * nlanes], + v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[2*nlanes])* s3), + v_round(scale * vg_load_f32(&inx[9*nlanes/4])* s1)), + v_pack(v_round(scale * vg_load_f32(&inx[5*nlanes/2])* s2), + v_round(scale * vg_load_f32(&inx[11*nlanes/4])* s3)))); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* in, float* out, + const v_float32& s1, const v_float32& s2, + const v_float32& s3, const v_float32& scale, const int nlanes) +{ + v_float32 a1 = vg_load_f32(in); + v_float32 a2 = vg_load_f32(&in[nlanes]); + v_float32 a3 = vg_load_f32(&in[2*nlanes]); + + vx_store(out, scale * a1* s1); + vx_store(&out[nlanes], scale * a2* s2); + vx_store(&out[2*nlanes], scale * a3* s3); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[], + const float scalar[], DST out[], + const int length, const float _scale) +{ + constexpr int chan = 3; + constexpr int nlanes = vector_type_of_t::nlanes; + constexpr int lanes = chan * nlanes; + + if (length < lanes) + return 0; + + v_float32 scale = vx_setall_f32(_scale); + + v_float32 s1 = vx_load(scalar); +#if CV_SIMD_WIDTH == 32 + v_float32 s2 = vx_load(&scalar[2]); + v_float32 s3 = vx_load(&scalar[1]); +#else + v_float32 s2 = vx_load(&scalar[1]); + v_float32 s3 = vx_load(&scalar[2]); +#endif + + int x = 0; + for (;;) + { + for (; x <= length - lanes; x += lanes) + { + mulc_scale_simd_c3_impl(&in[x], &out[x], s1, s2, s3, scale, nlanes); + } + + if (x < length) + { + x = length - lanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE +typename std::enable_if<(std::is_same::value || + std::is_same::value), void>::type +mulc_scale_simd_common_impl(const SRC* inx, DST* outx, + const v_float32& sc, const v_float32& scale, + const int nlanes) +{ + v_float32 a1 = vg_load_f32(inx); + v_float32 a2 = vg_load_f32(&inx[nlanes/2]); + + v_store_i16(outx, v_round(scale * a1* sc), v_round(scale * a2* sc)); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx, + uchar* outx, const v_float32& sc, + const v_float32& scale, const int nlanes) +{ + v_float32 a1 = vg_load_f32(inx); + v_float32 a2 = vg_load_f32(&inx[nlanes/4]); + v_float32 a3 = vg_load_f32(&inx[nlanes/2]); + v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]); + + vx_store(outx, v_pack_u(v_pack(v_round(scale * a1* sc), + v_round(scale * a2* sc)), + v_pack(v_round(scale * a3* sc), + v_round(scale * a4* sc)))); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx, + float* outx, const v_float32& sc, + const v_float32& scale, const int) +{ + v_float32 a1 = vg_load_f32(inx); + vx_store(outx, scale * a1* sc); +} + +//------------------------------------------------------------------------------------------------- + +template +CV_ALWAYS_INLINE int mulc_scale_simd_common(const SRC in[], + const float scalar[], DST out[], + const int length, const float _scale) +{ + constexpr int nlanes = vector_type_of_t::nlanes; + + if (length < nlanes) + return 0; + + v_float32 _scalar = vx_load(scalar); + v_float32 scale = vx_setall_f32(_scale); + + int x = 0; + for (;;) + { + for (; x <= length - nlanes; x += nlanes) + { + mulc_scale_simd_common_impl(&in[x], &out[x], _scalar, scale, nlanes); + } + + if (x < length) + { + x = length - nlanes; + continue; // process unaligned tail + } + break; + } + return x; +} + +#define MULC_SIMD(SRC, DST) \ +int mulc_simd(const SRC in[], const float scalar[], DST out[], \ + const int length, const int chan, const float scale) \ +{ \ + mul_tag op_t; \ + switch (chan) \ + { \ + case 1: \ + case 2: \ + case 4: \ + { \ + if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \ + { \ + return arithmOpScalar_simd_common(op_t, in, scalar, \ + out, length); \ + } \ + else \ + { \ + return mulc_scale_simd_common(in, scalar, out, length, scale); \ + } \ + } \ + case 3: \ + { \ + if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \ + { \ + return arithmOpScalar_simd_c3(op_t, in, scalar, \ + out, length); \ + } \ + else \ + { \ + return mulc_scale_simd_c3(in, scalar, out, length, scale); \ + } \ + } \ + default: \ + GAPI_Assert(chan <= 4); \ + break; \ + } \ + return 0; \ +} + +MULC_SIMD(uchar, uchar) +MULC_SIMD(ushort, uchar) +MULC_SIMD(short, uchar) +MULC_SIMD(float, uchar) +MULC_SIMD(short, short) +MULC_SIMD(ushort, short) +MULC_SIMD(uchar, short) +MULC_SIMD(float, short) +MULC_SIMD(ushort, ushort) +MULC_SIMD(uchar, ushort) +MULC_SIMD(short, ushort) +MULC_SIMD(float, ushort) +MULC_SIMD(uchar, float) +MULC_SIMD(ushort, float) +MULC_SIMD(short, float) +MULC_SIMD(float, float) + +#undef MULC_SIMD + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END