* GAPI Fluid: SIMD optimization for the MulC kernel.
* Related changes for the MulDouble kernel.
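Both perf-test fixtures now take a `compare_f` functor as the first tuple element, so each instantiation below can pick its own comparison tolerance instead of a hard-coded exact check. For reference, a minimal sketch of that functor contract, assuming the `compare_f` alias and the `AbsExact()`/`Tolerance_FloatRel_IntAbs()` helpers of the existing G-API test framework:

#include <functional>
#include <opencv2/core.hpp>

// Sketch only: a stand-in for what AbsExact().to_compare_f() is assumed
// to produce, i.e. a callable that passes only on a bit-exact match.
using compare_f = std::function<bool(const cv::Mat&, const cv::Mat&)>;

inline compare_f abs_exact_cmp()
{
    return [](const cv::Mat& a, const cv::Mat& b) {
        return cv::norm(a, b, cv::NORM_INF) == 0.0;
    };
}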
class SubCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class MulPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
- class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
- class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+ class MulDoublePerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
+ class MulCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
class DivCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
class DivRCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
PERF_TEST_P_(MulDoublePerfTest, TestPerformance)
{
- Size sz = get<0>(GetParam());
- MatType type = get<1>(GetParam());
- int dtype = get<2>(GetParam());
- cv::GCompileArgs compile_args = get<3>(GetParam());
+ compare_f cmpF;
+ cv::Size sz;
+ MatType type = -1;
+ int dtype = -1;
+ double scale = 1.0;
+ cv::GCompileArgs compile_args;
+
+ std::tie(cmpF, sz, type, dtype, compile_args) = GetParam();
auto& rng = cv::theRNG();
double d = rng.uniform(0.0, 10.0);
initMatrixRandU(type, sz, dtype, false);
// OpenCV code ///////////////////////////////////////////////////////////
- cv::multiply(in_mat1, d, out_mat_ocv, 1, dtype);
+ cv::multiply(in_mat1, d, out_mat_ocv, scale, dtype);
// G-API code ////////////////////////////////////////////////////////////
cv::GMat in1, out;
}
// Comparison ////////////////////////////////////////////////////////////
- // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
- EXPECT_EQ(out_mat_gapi.size(), sz);
+ {
+ EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+ }
SANITY_CHECK_NOTHING();
}
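Both test bodies elide the graph-construction and timing section between the "G-API code" marker and the comparison. Under the usual G-API perf-test conventions it looks roughly like the sketch below (names such as `in_mat1`, `out_mat_gapi`, and `compile_args` come from the fixture; the exact elided lines are not reproduced here):

cv::GMat in1;
auto out = cv::gapi::mulC(in1, d, dtype);  // double-scalar overload for MulDouble

cv::GComputation c(in1, out);
auto cc = c.compile(cv::descr_of(cv::gin(in_mat1)), std::move(compile_args));
cc(cv::gin(in_mat1), cv::gout(out_mat_gapi));  // warm-up run

TEST_CYCLE()
{
    cc(cv::gin(in_mat1), cv::gout(out_mat_gapi));
}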
PERF_TEST_P_(MulCPerfTest, TestPerformance)
{
- Size sz = get<0>(GetParam());
- MatType type = get<1>(GetParam());
- int dtype = get<2>(GetParam());
- cv::GCompileArgs compile_args = get<3>(GetParam());
+ compare_f cmpF;
+ cv::Size sz;
+ MatType type = -1;
+ int dtype = -1;
+ double scale = 1.0;
+ cv::GCompileArgs compile_args;
+
+ std::tie(cmpF, sz, type, dtype, compile_args) = GetParam();
initMatsRandU(type, sz, dtype, false);
// OpenCV code ///////////////////////////////////////////////////////////
- cv::multiply(in_mat1, sc, out_mat_ocv, 1, dtype);
+ cv::multiply(in_mat1, sc, out_mat_ocv, scale, dtype);
// G-API code ////////////////////////////////////////////////////////////
cv::GMat in1, out;
}
// Comparison ////////////////////////////////////////////////////////////
- // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
- EXPECT_EQ(out_mat_gapi.size(), sz);
+ {
+ EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+ }
SANITY_CHECK_NOTHING();
}
Values(cv::compile_args(CORE_CPU))));
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestCPU, MulDoublePerfTest,
- Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
- Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
- Values(-1, CV_8U, CV_16U, CV_32F),
- Values(cv::compile_args(CORE_CPU))));
+ Combine(Values(AbsExact().to_compare_f()),
+ Values(szSmall128, szVGA, sz720p, sz1080p),
+ Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+ Values(-1, CV_8U, CV_16U, CV_32F),
+ Values(cv::compile_args(CORE_CPU))));
INSTANTIATE_TEST_CASE_P(MulCPerfTestCPU, MulCPerfTest,
- Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+ Combine(Values(AbsExact().to_compare_f()),
+ Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(-1, CV_8U, CV_16U, CV_32F),
Values(cv::compile_args(CORE_CPU))));
Values(2.0),
Values(cv::compile_args(CORE_FLUID))));
-// INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest,
-// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-// Values(-1, CV_8U, CV_16U, CV_32F),
-// Values(cv::compile_args(CORE_FLUID))));
+ INSTANTIATE_TEST_CASE_P(MulDoublePerfTestFluid, MulDoublePerfTest,
+ Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
+ Values(szSmall128, szVGA, sz720p, sz1080p),
+ Values(CV_8UC1, CV_8UC3, CV_16SC1, CV_32FC1),
+ Values(-1, CV_8U, CV_32F),
+ Values(cv::compile_args(CORE_FLUID))));
-// INSTANTIATE_TEST_CASE_P(MulCPerfTestFluid, MulCPerfTest,
-// Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-// Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-// Values(-1, CV_8U, CV_16U, CV_32F),
-// Values(cv::compile_args(CORE_FLUID))));
+ INSTANTIATE_TEST_CASE_P(MulCPerfTestFluid, MulCPerfTest,
+ Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
+ Values(szSmall128, szVGA, sz720p, sz1080p),
+ Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+ Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
+ Values(cv::compile_args(CORE_FLUID))));
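The Fluid suites pass Tolerance_FloatRel_IntAbs(1e-6, 1) rather than AbsExact() because the Fluid kernels compute through float and round once, while the OpenCV reference may round from a different intermediate; integer outputs that land near a .5 boundary can then legitimately differ by 1. A self-contained illustration of the two rounding paths (the concrete numbers are only an example):

#include <cmath>
#include <cstdio>

int main()
{
    const unsigned char in = 86;
    const double s = 2.333333;
    // Reference-style path: multiply in double, then round.
    const int ref = static_cast<int>(std::lround(in * s));
    // Fluid-style path: narrow the scalar to float, multiply in float, round.
    const int fluid = static_cast<int>(std::lround(static_cast<float>(in) *
                                                   static_cast<float>(s)));
    std::printf("ref=%d fluid=%d\n", ref, fluid); // equal here; near a .5
                                                  // boundary they can differ by 1
    return 0;
}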
INSTANTIATE_TEST_CASE_P(DivPerfTestFluid, DivPerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(MulDoublePerfTestGPU, MulDoublePerfTest,
- Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+ Combine(Values(AbsExact().to_compare_f()),
+ Values( szSmall128, szVGA, sz720p, sz1080p ),
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
Values( -1, CV_8U, CV_16U, CV_32F ),
Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(MulCPerfTestGPU, MulCPerfTest,
- Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+ Combine(Values(AbsExact().to_compare_f()),
+ Values( szSmall128, szVGA, sz720p, sz1080p ),
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
Values( -1, CV_8U, CV_16U, CV_32F ),
Values(cv::compile_args(CORE_GPU))));
{
case ARITHM_ADD:
{
- int w = 0;
+ int w = 0;
#if CV_SIMD
- w = addc_simd(in, scalar, out, length, chan);
+ w = addc_simd(in, scalar, out, length, chan);
#endif
- for (; w < length; ++w)
- out[w] = add<DST>(in[w], scalar[w % chan]);
+ for (; w < length; ++w)
+ out[w] = add<DST>(in[w], scalar[w % chan]);
break;
}
out[w] = sub<DST>(in[w], scalar[w % chan]);
break;
}
- // TODO: optimize miltiplication and division
case ARITHM_MULTIPLY:
- for (int w=0; w < width; w++)
- for (int c=0; c < chan; c++)
- out[chan*w + c] = mul<DST>(in[chan*w + c], scalar[c], scale);
+ {
+ int w = 0;
+#if CV_SIMD
+ w = mulc_simd(in, scalar, out, length, chan, scale);
+#endif
+ // mulc_simd() returns the number of elements already processed, so the
+ // scalar tail continues over elements (length == width * chan), not pixels.
+ for (; w < length; ++w)
+ out[w] = mul<DST>(in[w], scalar[w % chan], scale);
break;
+ }
case ARITHM_DIVIDE:
for (int w=0; w < width; w++)
for (int c=0; c < chan; c++)
}
};
-GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, false)
+GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, true)
{
static const int Window = 1;
- static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
+ static void run(const View& src, const cv::Scalar& _scalar, int /*dtype*/,
+ Buffer& dst, Buffer& scratch)
{
- const float scalar[4] = {
- static_cast<float>(_scalar[0]),
- static_cast<float>(_scalar[1]),
- static_cast<float>(_scalar[2]),
- static_cast<float>(_scalar[3])
- };
- const float scale = 1.f;
+ GAPI_Assert(src.meta().chan <= 4);
+
+ if (dst.y() == 0)
+ {
+ const int chan = src.meta().chan;
+ float* sc = scratch.OutLine<float>();
+
+ for (int i = 0; i < scratch.length(); ++i)
+ sc[i] = static_cast<float>(_scalar[i % chan]);
+ }
+ const float* scalar = scratch.OutLine<float>();
+ const float scale = 1.f;
// DST SRC OP __VA_ARGS__
- UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
- UNARY_(uchar , short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
- UNARY_(uchar , float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
- UNARY_( short, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
- UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
- UNARY_( float, short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
- UNARY_( float, float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(uchar , ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(uchar , short , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(uchar , float , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(ushort, short , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(ushort, uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(ushort, float , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(short , short , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(short , ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(short , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(short , float , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(float , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(float , ushort, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(float , short , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+ UNARY_(float , float , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
+
+ static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
+ {
+ initScratchBuffer(scratch);
+ }
+
+ static void resetScratch(Buffer& /*scratch*/)
+ {
+ }
};
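Both MulC kernels delegate scratch allocation to the initScratchBuffer() helper referenced in initScratch() above. A minimal sketch of what that helper is assumed to do, sized from how the scratch is consumed: one CV_32F line wide enough to hold the scalar replicated across the widest supported vector, plus two extra floats so the 3-channel code can take rotated loads from &scalar[1] and &scalar[2]:

static void initScratchBuffer(Buffer& scratch)
{
#if CV_SIMD
    constexpr int maxNlanes = 16; // float lanes in a 512-bit vector, the widest case
    constexpr int offset    = 2;  // slack for the rotated 3-channel loads
    constexpr int buflen    = maxNlanes + offset;
#else
    constexpr int buflen = 4;     // one float per possible channel
#endif
    cv::GMatDesc bufdesc = { CV_32F, 1, cv::Size(buflen, 1) };
    Buffer buffer(bufdesc);
    scratch = std::move(buffer);
}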
-GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, false)
+GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, true)
{
static const int Window = 1;
- static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst)
+ static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst, Buffer& scratch)
{
- const float scalar[4] = {
- static_cast<float>(_scalar),
- static_cast<float>(_scalar),
- static_cast<float>(_scalar),
- static_cast<float>(_scalar)
- };
+ GAPI_Assert(src.meta().chan <= 4);
+
+ if (dst.y() == 0)
+ {
+ float* sc = scratch.OutLine<float>();
+
+ for (int i = 0; i < scratch.length(); ++i)
+ sc[i] = static_cast<float>(_scalar);
+ }
+ const float* scalar = scratch.OutLine<float>();
const float scale = 1.f;
// DST SRC OP __VA_ARGS__
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
+
+ static void initScratch(const GMatDesc&, double, int, Buffer& scratch)
+ {
+ initScratchBuffer(scratch);
+ }
+
+ static void resetScratch(Buffer& /*scratch*/)
+ {
+ }
};
GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, false)
#undef SUBC_SIMD
+#define MULC_SIMD(SRC, DST) \
+int mulc_simd(const SRC in[], const float scalar[], DST out[], \
+ const int length, const int chan, const float scale) \
+{ \
+ CV_CPU_DISPATCH(mulc_simd, (in, scalar, out, length, chan, scale), \
+ CV_CPU_DISPATCH_MODES_ALL); \
+}
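+// Note (assumption about the build layout): CV_CPU_DISPATCH picks, at
+// runtime, the widest per-ISA variant compiled from gfluidcore_func.simd.hpp,
+// so the callers in gfluidcore.cpp stay ISA-agnostic.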
+
+MULC_SIMD(uchar, uchar)
+MULC_SIMD(ushort, uchar)
+MULC_SIMD(short, uchar)
+MULC_SIMD(float, uchar)
+MULC_SIMD(short, short)
+MULC_SIMD(ushort, short)
+MULC_SIMD(uchar, short)
+MULC_SIMD(float, short)
+MULC_SIMD(ushort, ushort)
+MULC_SIMD(uchar, ushort)
+MULC_SIMD(short, ushort)
+MULC_SIMD(float, ushort)
+MULC_SIMD(uchar, float)
+MULC_SIMD(ushort, float)
+MULC_SIMD(short, float)
+MULC_SIMD(float, float)
+
+#undef MULC_SIMD
+
} // namespace fluid
} // namespace gapi
} // namespace cv
#undef SUBC_SIMD
+#define MULC_SIMD(SRC, DST) \
+int mulc_simd(const SRC in[], const float scalar[], DST out[], \
+ const int length, const int chan, const float scale);
+
+MULC_SIMD(uchar, uchar)
+MULC_SIMD(ushort, uchar)
+MULC_SIMD(short, uchar)
+MULC_SIMD(float, uchar)
+MULC_SIMD(short, short)
+MULC_SIMD(ushort, short)
+MULC_SIMD(uchar, short)
+MULC_SIMD(float, short)
+MULC_SIMD(ushort, ushort)
+MULC_SIMD(uchar, ushort)
+MULC_SIMD(short, ushort)
+MULC_SIMD(float, ushort)
+MULC_SIMD(uchar, float)
+MULC_SIMD(ushort, float)
+MULC_SIMD(short, float)
+MULC_SIMD(float, float)
+
+#undef MULC_SIMD
+
} // namespace fluid
} // namespace gapi
} // namespace cv
#undef SUBC_SIMD
+
+#define MULC_SIMD(SRC, DST) \
+int mulc_simd(const SRC in[], const float scalar[], DST out[], \
+ const int length, const int chan, const float scale);
+
+MULC_SIMD(uchar, uchar)
+MULC_SIMD(ushort, uchar)
+MULC_SIMD(short, uchar)
+MULC_SIMD(float, uchar)
+MULC_SIMD(short, short)
+MULC_SIMD(ushort, short)
+MULC_SIMD(uchar, short)
+MULC_SIMD(float, short)
+MULC_SIMD(ushort, ushort)
+MULC_SIMD(uchar, ushort)
+MULC_SIMD(short, ushort)
+MULC_SIMD(float, ushort)
+MULC_SIMD(uchar, float)
+MULC_SIMD(ushort, float)
+MULC_SIMD(short, float)
+MULC_SIMD(float, float)
+
+#undef MULC_SIMD
+
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
struct scale_tag {};
//-------------------------
//
-// Fluid kernels: AddC
+// Fluid kernels: AddC, SubC
//
//-------------------------
struct add_tag {};
struct sub_tag {};
+struct mul_tag {};
CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1,
const v_int32& c2, const v_int32& c3,
return a - sc;
}
+CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc)
+{
+ return a * sc;
+}
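+// With this mul_tag overload in place, the pre-existing generic
+// arithmOpScalar_simd_common() / arithmOpScalar_simd_c3() templates cover
+// MulC for scale == 1; the mulc_scale_* overloads below handle scale != 1.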
+//-------------------------------------------------------------------------------------------------
+
template<typename oper_tag, typename SRC, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<(std::is_same<DST, ushort>::value ||
typename std::enable_if<std::is_same<DST, short>::value ||
std::is_same<DST, ushort>::value, void>::type
arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
- const v_float32& s3, const int nlanes)
+ const v_float32& s3, const int nlanes)
{
v_float32 a1 = vg_load_f32(inx);
v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);
return x;
}
-
+//-------------------------------------------------------------------------------------------------
#define ADDC_SIMD(SRC, DST) \
int addc_simd(const SRC in[], const float scalar[], DST out[], \
#undef ADDC_SIMD
+//-------------------------------------------------------------------------------------------------
+
#define SUBC_SIMD(SRC, DST) \
int subc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan) \
#undef SUBC_SIMD
+//-------------------------
+//
+// Fluid kernels: MulC
+//
+//-------------------------
+
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+ std::is_same<DST, ushort>::value, void>::type
+mulc_scale_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& scale, const int nlanes)
+{
+ v_float32 a1 = vg_load_f32(inx);
+ v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);
+ v_float32 a3 = vg_load_f32(&inx[nlanes]);
+ v_float32 a4 = vg_load_f32(&inx[3 * nlanes / 2]);
+ v_float32 a5 = vg_load_f32(&inx[2 * nlanes]);
+ v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]);
+
+ arithmOpScalar_pack_store_c3(outx, v_round(scale*a1*s1),
+ v_round(scale*a2*s2),
+ v_round(scale*a3*s3),
+ v_round(scale*a4*s1),
+ v_round(scale*a5*s2),
+ v_round(scale*a6*s3));
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename SRC>
+CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* inx, uchar* outx,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& scale, const int nlanes)
+{
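+ // Narrow with saturation in two stages: v_round yields int32, v_pack
+ // packs int32 -> int16, v_pack_u packs int16 -> uchar. The s1/s2/s3
+ // rotation keeps the 3-channel scalar pattern aligned across vectors.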
+ vx_store(outx,
+ v_pack_u(v_pack(v_round(scale * vg_load_f32(inx)* s1),
+ v_round(scale * vg_load_f32(&inx[nlanes/4])* s2)),
+ v_pack(v_round(scale * vg_load_f32(&inx[nlanes/2])* s3),
+ v_round(scale * vg_load_f32(&inx[3*nlanes/4])* s1))));
+
+ vx_store(&outx[nlanes],
+ v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[nlanes])* s2),
+ v_round(scale * vg_load_f32(&inx[5*nlanes/4])* s3)),
+ v_pack(v_round(scale * vg_load_f32(&inx[3*nlanes/2])* s1),
+ v_round(scale * vg_load_f32(&inx[7*nlanes/4])* s2))));
+
+ vx_store(&outx[2 * nlanes],
+ v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[2*nlanes])* s3),
+ v_round(scale * vg_load_f32(&inx[9*nlanes/4])* s1)),
+ v_pack(v_round(scale * vg_load_f32(&inx[5*nlanes/2])* s2),
+ v_round(scale * vg_load_f32(&inx[11*nlanes/4])* s3))));
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename SRC>
+CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* in, float* out,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& scale, const int nlanes)
+{
+ v_float32 a1 = vg_load_f32(in);
+ v_float32 a2 = vg_load_f32(&in[nlanes]);
+ v_float32 a3 = vg_load_f32(&in[2*nlanes]);
+
+ vx_store(out, scale * a1* s1);
+ vx_store(&out[nlanes], scale * a2* s2);
+ vx_store(&out[2*nlanes], scale * a3* s3);
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[],
+ const float scalar[], DST out[],
+ const int length, const float _scale)
+{
+ constexpr int chan = 3;
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+ constexpr int lanes = chan * nlanes;
+
+ if (length < lanes)
+ return 0;
+
+ v_float32 scale = vx_setall_f32(_scale);
+
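+ // The scratch 'scalar' line repeats {s0, s1, s2, s0, s1, s2, ...}, and each
+ // vector load must continue that 3-periodic pattern. Vector k starts at
+ // element k*nlanes, i.e. offset (k*nlanes) mod 3: with 8 float lanes (AVX2)
+ // that gives offsets 0, 2, 1; with 4 lanes (SSE/NEON) it gives 0, 1, 2.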
+ v_float32 s1 = vx_load(scalar);
+#if CV_SIMD_WIDTH == 32
+ v_float32 s2 = vx_load(&scalar[2]);
+ v_float32 s3 = vx_load(&scalar[1]);
+#else
+ v_float32 s2 = vx_load(&scalar[1]);
+ v_float32 s3 = vx_load(&scalar[2]);
+#endif
+
+ int x = 0;
+ for (;;)
+ {
+ for (; x <= length - lanes; x += lanes)
+ {
+ mulc_scale_simd_c3_impl(&in[x], &out[x], s1, s2, s3, scale, nlanes);
+ }
+
+ if (x < length)
+ {
+ x = length - lanes;
+ continue; // process unaligned tail
+ }
+ break;
+ }
+ return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<DST, ushort>::value ||
+ std::is_same<DST, short>::value), void>::type
+mulc_scale_simd_common_impl(const SRC* inx, DST* outx,
+ const v_float32& sc, const v_float32& scale,
+ const int nlanes)
+{
+ v_float32 a1 = vg_load_f32(inx);
+ v_float32 a2 = vg_load_f32(&inx[nlanes/2]);
+
+ v_store_i16(outx, v_round(scale * a1* sc), v_round(scale * a2* sc));
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename SRC>
+CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx,
+ uchar* outx, const v_float32& sc,
+ const v_float32& scale, const int nlanes)
+{
+ v_float32 a1 = vg_load_f32(inx);
+ v_float32 a2 = vg_load_f32(&inx[nlanes/4]);
+ v_float32 a3 = vg_load_f32(&inx[nlanes/2]);
+ v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]);
+
+ vx_store(outx, v_pack_u(v_pack(v_round(scale * a1* sc),
+ v_round(scale * a2* sc)),
+ v_pack(v_round(scale * a3* sc),
+ v_round(scale * a4* sc))));
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename SRC>
+CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx,
+ float* outx, const v_float32& sc,
+ const v_float32& scale, const int)
+{
+ v_float32 a1 = vg_load_f32(inx);
+ vx_store(outx, scale * a1* sc);
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE int mulc_scale_simd_common(const SRC in[],
+ const float scalar[], DST out[],
+ const int length, const float _scale)
+{
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+ if (length < nlanes)
+ return 0;
+
+ v_float32 _scalar = vx_load(scalar);
+ v_float32 scale = vx_setall_f32(_scale);
+
+ int x = 0;
+ for (;;)
+ {
+ for (; x <= length - nlanes; x += nlanes)
+ {
+ mulc_scale_simd_common_impl(&in[x], &out[x], _scalar, scale, nlanes);
+ }
+
+ if (x < length)
+ {
+ x = length - nlanes;
+ continue; // process unaligned tail
+ }
+ break;
+ }
+ return x;
+}
+
+#define MULC_SIMD(SRC, DST) \
+int mulc_simd(const SRC in[], const float scalar[], DST out[], \
+ const int length, const int chan, const float scale) \
+{ \
+ mul_tag op_t; \
+ switch (chan) \
+ { \
+ case 1: \
+ case 2: \
+ case 4: \
+ { \
+ if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \
+ { \
+ return arithmOpScalar_simd_common(op_t, in, scalar, \
+ out, length); \
+ } \
+ else \
+ { \
+ return mulc_scale_simd_common(in, scalar, out, length, scale); \
+ } \
+ } \
+ case 3: \
+ { \
+ if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \
+ { \
+ return arithmOpScalar_simd_c3(op_t, in, scalar, \
+ out, length); \
+ } \
+ else \
+ { \
+ return mulc_scale_simd_c3(in, scalar, out, length, scale); \
+ } \
+ } \
+ default: \
+ GAPI_Assert(chan <= 4); \
+ break; \
+ } \
+ return 0; \
+}
+
+MULC_SIMD(uchar, uchar)
+MULC_SIMD(ushort, uchar)
+MULC_SIMD(short, uchar)
+MULC_SIMD(float, uchar)
+MULC_SIMD(short, short)
+MULC_SIMD(ushort, short)
+MULC_SIMD(uchar, short)
+MULC_SIMD(float, short)
+MULC_SIMD(ushort, ushort)
+MULC_SIMD(uchar, ushort)
+MULC_SIMD(short, ushort)
+MULC_SIMD(float, ushort)
+MULC_SIMD(uchar, float)
+MULC_SIMD(ushort, float)
+MULC_SIMD(short, float)
+MULC_SIMD(float, float)
+
+#undef MULC_SIMD
+
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END