#undef SUBRC_SIMD
-#define MULC_SIMD(SRC, DST) \
-int mulc_simd(const SRC in[], const float scalar[], DST out[], \
+#define MULC_SIMD(SRC, DST) \
+int mulc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan, const float scale);
MULC_SIMD(uchar, uchar)
#undef MULC_SIMD
-#define DIVC_SIMD(SRC, DST) \
-int divc_simd(const SRC in[], const float scalar[], DST out[], \
- const int length, const int chan, const float scale, \
+#define DIVC_SIMD(SRC, DST) \
+int divc_simd(const SRC in[], const float scalar[], DST out[], \
+ const int length, const int chan, const float scale, \
const int set_mask_flag);
DIVC_SIMD(uchar, uchar)
#undef ABSDIFFC_SIMD
+#define DIVRC_SIMD(SRC, DST) \
+int divrc_simd(const float scalar[], const SRC in[], DST out[], \
+ const int length, const int chan, const float scale);
+// Declare divrc_simd for each (SRC, DST) element-type pair listed below; note that scalar[] comes before in[].
+DIVRC_SIMD(uchar, uchar)
+DIVRC_SIMD(ushort, uchar)
+DIVRC_SIMD(short, uchar)
+DIVRC_SIMD(float, uchar)
+DIVRC_SIMD(short, short)
+DIVRC_SIMD(ushort, short)
+DIVRC_SIMD(uchar, short)
+DIVRC_SIMD(float, short)
+DIVRC_SIMD(ushort, ushort)
+DIVRC_SIMD(uchar, ushort)
+DIVRC_SIMD(short, ushort)
+DIVRC_SIMD(float, ushort)
+DIVRC_SIMD(uchar, float)
+DIVRC_SIMD(ushort, float)
+DIVRC_SIMD(short, float)
+DIVRC_SIMD(float, float)
+// The helper macro is only needed for the declarations above.
+#undef DIVRC_SIMD
+
int split3_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], const int width);
template<> struct vector_type_of<short> { using type = v_int16; };
template<> struct vector_type_of<float> { using type = v_float32; };
+template<typename scalar_t>  // Maps a source element type to the vector type of the zero constant compared against divisors.
+struct zero_vec_type_of;  // primary template is declared only; the specializations below provide ::type
+// Shorthand for zero_vec_type_of<scalar_t>::type.
+template<typename scalar_t>
+using zero_vec_type_of_t = typename zero_vec_type_of<scalar_t>::type;
+// uchar, ushort and short sources share a v_int16 zero vector; float sources use v_float32.
+template<> struct zero_vec_type_of<uchar> { using type = v_int16; };
+template<> struct zero_vec_type_of<ushort> { using type = v_int16; };
+template<> struct zero_vec_type_of<short> { using type = v_int16; };
+template<> struct zero_vec_type_of<float> { using type = v_float32; };
+// Variant of the zero-vector type map in which uchar keeps its native v_uint8 vector; used by the uchar-output 3-channel divrc loop.
+template<typename scalar_t>
+struct univ_zero_vec_type_of;
+// Shorthand for univ_zero_vec_type_of<scalar_t>::type.
+template<typename scalar_t>
+using univ_zero_vec_type_of_t = typename univ_zero_vec_type_of<scalar_t>::type;
+// Only the uchar entry differs from zero_vec_type_of (v_uint8 instead of v_int16).
+template<> struct univ_zero_vec_type_of<uchar> { using type = v_uint8; };
+template<> struct univ_zero_vec_type_of<ushort> { using type = v_int16; };
+template<> struct univ_zero_vec_type_of<short> { using type = v_int16; };
+template<> struct univ_zero_vec_type_of<float> { using type = v_float32; };
+
CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in)
{
return vx_load(in);
CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero,
const v_int32& res1, const v_int32& res2)
{
- v_uint16 sel = v_reinterpret_as_u16(v_select(div == v_zero, v_zero, v_pack(res1, res2)));
- vx_store(dst, sel);
+ vx_store(dst, v_select(v_reinterpret_as_u16(div == v_zero),  // mask and zero are reinterpreted to u16 so the select runs on the unsigned result
+ v_reinterpret_as_u16(v_zero), v_pack_u(res1, res2)));  // lanes with a zero divisor store 0; others store the v_pack_u of res1/res2
}
-//=================================================================================================
+//=============================================================================
-template<typename scale_tag_t, typename SRC, typename DST>
+template<typename scale_tag_t>  // Per-vector step, uchar divisors -> uchar output: applies div_op to a1..a4 and the values at in2x, stores to outx.
CV_ALWAYS_INLINE
-typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
- (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
- (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), int>::type
-div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
+void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+ const v_float32& a3, const v_float32& a4, const uchar* in2x,
+ uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
{
- constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-
- if (length < nlanes)
- return 0;
-
- v_int16 v_zero = vx_setall_s16(0);
- v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-
- int x = 0;
- for (;;)
- {
- for (; x <= length - nlanes; x += nlanes)
- {
- v_float32 a1 = vg_load_f32(&in1[x]);
- v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
+ constexpr int nlanes = v_uint8::nlanes;
- v_int16 div = v_reinterpret_as_s16(vx_load(&in2[x]));
+ v_int16 div1 = v_reinterpret_as_s16(vx_load_expand(in2x));  // divisors are widened to 16-bit lanes on load, first half
+ v_int16 div2 = v_reinterpret_as_s16(vx_load_expand(&in2x[nlanes/2]));  // second half starts nlanes/2 elements later
- v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
- v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
+ v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+ v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+ v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+ v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
- v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale));
- v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale));
+ v_int32 sum1 = v_round(div_op(s_tag, a1, fdiv1, v_scale)),
+ sum2 = v_round(div_op(s_tag, a2, fdiv2, v_scale)),
+ sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
+ sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
- v_store_select(&out[x], div, v_zero, r1, r2);
- }
+ v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));  // a zero divisor forces the result lane to 0
+ v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
- if (x < length)
- {
- x = length - nlanes;
- continue; // process one more time (unaligned tail)
- }
- break;
- }
- return x;
+ vx_store(outx, v_pack_u(res1, res2));  // both 16-bit halves are narrowed into a single uchar vector
}
-//-------------------------------------------------------------------------------------------------
-
template<typename scale_tag_t, typename SRC>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<SRC, short>::value ||
- std::is_same<SRC, ushort>::value, int>::type
-div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
+ std::is_same<SRC, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,  // Per-vector step, short/ushort divisors -> uchar output.
+ const v_float32& a3, const v_float32& a4, const SRC* in2x,
+ uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
{
constexpr int nlanes = v_uint8::nlanes;
- if (length < nlanes)
- return 0;
+ v_int16 div1 = v_reinterpret_as_s16(vx_load(in2x));  // two 16-bit vectors are loaded per uchar output vector
+ v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2x[nlanes/2]));  // NOTE(review): ushort divisors are reinterpreted as signed 16-bit - confirm values above 32767 behave as intended
- v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
- v_int16 v_zero = vx_setall_s16(0);
+ v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+ v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+ v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+ v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
- int x = 0;
- for (;;)
- {
- for (; x <= length - nlanes; x += nlanes)
- {
- v_float32 a1 = vg_load_f32(&in1[x]);
- v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
- v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
- v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
-
- v_int16 div1 = v_reinterpret_as_s16(vx_load(&in2[x]));
- v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2[x + nlanes/2]));
-
- v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
- v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
- v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
- v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+ v_int32 sum1 = v_round(div_op(s_tag, a1, fdiv1, v_scale)),
+ sum2 = v_round(div_op(s_tag, a2, fdiv2, v_scale)),
+ sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
+ sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
- v_int32 sum1 = v_round(div_op(t, a1, fdiv1, scale)),
- sum2 = v_round(div_op(t, a2, fdiv2, scale)),
- sum3 = v_round(div_op(t, a3, fdiv3, scale)),
- sum4 = v_round(div_op(t, a4, fdiv4, scale));
+ v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));  // a zero divisor forces the result lane to 0
+ v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
- v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
- v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
-
- vx_store(&out[x], v_pack_u(res1, res2));
- }
-
- if (x < length)
- {
- x = length - nlanes;
- continue; // process one more time (unaligned tail)
- }
- break;
- }
- return x;
+ vx_store(outx, v_pack_u(res1, res2));  // both 16-bit halves are narrowed into a single uchar vector
}
-//-------------------------------------------------------------------------------------------------
-
template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
- const int length, double _scale)
+CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1,  // Per-vector step, float divisors -> uchar output.
+ const v_float32& a2, const v_float32& a3,
+ const v_float32& a4, const float* in2x, uchar* outx,
+ const v_float32& v_scale, const v_float32& v_zero)
{
constexpr int nlanes = v_uint8::nlanes;
- if (length < nlanes)
- return 0;
+ v_float32 div1 = vg_load_f32(in2x);  // four float vectors of divisors feed one uchar output vector
+ v_float32 div2 = vg_load_f32(&in2x[nlanes / 4]);
+ v_float32 div3 = vg_load_f32(&in2x[nlanes / 2]);
+ v_float32 div4 = vg_load_f32(&in2x[3 * nlanes / 4]);
- v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
- v_float32 v_zero = vx_setall_f32(0);
- int x = 0;
- for (;;)
- {
- for (; x <= length - nlanes; x += nlanes)
- {
- v_float32 a1 = vg_load_f32(&in1[x]);
- v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
- v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
- v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
+ v_float32 r1 = div_op(s_tag, a1, div1, v_scale);
+ v_float32 r2 = div_op(s_tag, a2, div2, v_scale);
+ v_float32 r3 = div_op(s_tag, a3, div3, v_scale);
+ v_float32 r4 = div_op(s_tag, a4, div4, v_scale);
- v_float32 div1 = vg_load_f32(&in2[x]);
- v_float32 div2 = vg_load_f32(&in2[x + nlanes / 4]);
- v_float32 div3 = vg_load_f32(&in2[x + nlanes / 2]);
- v_float32 div4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);
+ v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1);  // zero divisor -> 0, chosen in float before rounding
+ v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2);
+ v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3);
+ v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4);
- v_float32 r1 = div_op(t, a1, div1, scale);
- v_float32 r2 = div_op(t, a2, div2, scale);
- v_float32 r3 = div_op(t, a3, div3, scale);
- v_float32 r4 = div_op(t, a4, div4, scale);
+ v_int32 res1 = v_round(sel1);
+ v_int32 res2 = v_round(sel2);
+ v_int32 res3 = v_round(sel3);
+ v_int32 res4 = v_round(sel4);
- v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1);
- v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2);
- v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3);
- v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4);
+ vx_store(outx, v_pack_u(v_pack(res1, res2), v_pack(res3, res4)));  // 32 -> 16 -> 8 bit narrowing, then a single store
+}
- v_int32 res1 = v_round(sel1);
- v_int32 res2 = v_round(sel2);
- v_int32 res3 = v_round(sel3);
- v_int32 res4 = v_round(sel4);
+template<typename scale_tag_t, typename SRC, typename Vtype>  // uchar-output front end: loads the dividends and forwards to the matching div_simd_impl overload.
+CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, uchar* outx,
+ const v_float32& v_scale, const Vtype& v_zero)
+{
+ constexpr int nlanes = v_uint8::nlanes;
- vx_store(&out[x], v_pack_u(v_pack(res1, res2), v_pack(res3, res4)));
- }
+ v_float32 a1 = vg_load_f32(in1x);  // four float vectors of dividends cover one uchar output vector
+ v_float32 a2 = vg_load_f32(&in1x[nlanes / 4]);
+ v_float32 a3 = vg_load_f32(&in1x[nlanes / 2]);
+ v_float32 a4 = vg_load_f32(&in1x[3 * nlanes / 4]);
- if (x < length)
- {
- x = length - nlanes;
- continue; // process one more time (unaligned tail)
- }
- break;
- }
- return x;
+ div_simd_impl(s_tag, a1, a2, a3, a4, in2x, outx, v_scale, v_zero);  // overload is selected by the types of in2x and v_zero
}
//-------------------------------------------------------------------------------------------------
template<typename scale_tag_t, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<DST, short>::value ||
- std::is_same<DST, ushort>::value, int>::type
-div_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
+ std::is_same<DST, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,  // Per-vector step, uchar divisors -> short/ushort output.
+ const uchar* in2x, DST* outx, const v_float32& v_scale,
+ const v_int16& v_zero)
{
- constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-
- if (length < nlanes)
- return 0;
+ v_int16 div = v_reinterpret_as_s16(vx_load_expand(in2x));  // one widening load yields a full 16-bit vector of divisors
- v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
- v_int16 v_zero = vx_setall_s16(0);
+ v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
+ v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
- int x = 0;
- for (;;)
- {
- for (; x <= length - nlanes; x += nlanes)
- {
- v_float32 a1 = vg_load_f32(&in1[x]);
- v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
+ v_int32 r1 = v_round(div_op(s_tag, a1, fdiv1, v_scale));
+ v_int32 r2 = v_round(div_op(s_tag, a2, fdiv2, v_scale));
- v_int16 div = v_reinterpret_as_s16(vx_load_expand(&in2[x]));
+ v_store_select(outx, div, v_zero, r1, r2);  // the store helper receives div and v_zero to mask zero-divisor lanes
+}
- v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
- v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
+template<typename scale_tag_t, typename SRC, typename DST>  // Per-vector step, short/ushort divisors -> short/ushort output (all four combinations).
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
+ (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
+ (std::is_same<SRC, short>::value && std::is_same<DST, short>::value) ||
+ (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+ const SRC* in2x, DST* outx, const v_float32& v_scale, const v_int16& v_zero)
+{
+ v_int16 div = v_reinterpret_as_s16(vx_load(in2x));  // NOTE(review): ushort divisors are reinterpreted as signed 16-bit - confirm values above 32767 behave as intended
- v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale));
- v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale));
+ v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
+ v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
- v_store_select(&out[x], div, v_zero, r1, r2);
- }
+ v_int32 r1 = v_round(div_op(s_tag, a1, fdiv1, v_scale));
+ v_int32 r2 = v_round(div_op(s_tag, a2, fdiv2, v_scale));
- if (x < length)
- {
- x = length - nlanes;
- continue; // process one more time (unaligned tail)
- }
- break;
- }
- return x;
+ v_store_select(outx, div, v_zero, r1, r2);  // the store helper receives div and v_zero to mask zero-divisor lanes
}
-//-------------------------------------------------------------------------------------------------
-
template<typename scale_tag_t, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<DST, short>::value ||
- std::is_same<DST, ushort>::value, int>::type
-div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
+ std::is_same<DST, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,  // Per-vector step, float divisors -> short/ushort output.
+ const float* in2x, DST* outx, const v_float32& v_scale,
+ const v_float32& v_zero)
{
constexpr int nlanes = vector_type_of_t<DST>::nlanes;
- if (length < nlanes)
- return 0;
+ v_float32 fdiv1 = vg_load_f32(in2x);  // two float vectors of divisors cover one 16-bit output vector
+ v_float32 fdiv2 = vg_load_f32(&in2x[nlanes / 2]);
- v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
- v_float32 v_zero = vx_setall_f32(0);
- int x = 0;
- for (;;)
- {
- for (; x <= length - nlanes; x += nlanes)
- {
- v_float32 a1 = vg_load_f32(&in1[x]);
- v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
+ v_float32 r1 = div_op(s_tag, a1, fdiv1, v_scale);
+ v_float32 r2 = div_op(s_tag, a2, fdiv2, v_scale);
- v_float32 fdiv1 = vg_load_f32(&in2[x]);
- v_float32 fdiv2 = vg_load_f32(&in2[x + nlanes / 2]);
+ v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));  // zero divisor -> 0, chosen in float before rounding
+ v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));
- v_float32 r1 = div_op(t, a1, fdiv1, scale);
- v_float32 r2 = div_op(t, a2, fdiv2, scale);
+ v_store_i16(outx, res1, res2);
+}
- v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));
- v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));
+template<typename scale_tag_t, typename SRC, typename DST, typename Vtype>  // 16-bit-output front end: loads the dividends and forwards to the matching div_simd_impl overload.
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+ std::is_same<DST, ushort>::value, void>::type
+div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, DST* outx,
+ const v_float32& v_scale, const Vtype& v_zero)
+{
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
- v_store_i16(&out[x], res1, res2);
- }
+ v_float32 a1 = vg_load_f32(in1x);  // two float vectors of dividends cover one 16-bit output vector
+ v_float32 a2 = vg_load_f32(&in1x[nlanes / 2]);
- if (x < length)
- {
- x = length - nlanes;
- continue; // process one more time (unaligned tail)
- }
- break;
- }
- return x;
+ div_simd_impl(s_tag, a1, a2, in2x, outx, v_scale, v_zero);  // overload is selected by the types of in2x, outx and v_zero
}
//-------------------------------------------------------------------------------------------------
template<typename scale_tag_t, typename SRC>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
- const int length, double _scale)
+CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const SRC* in2x,  // Per-vector step for float output.
+ float* outx, const v_float32& v_scale)
{
- constexpr int nlanes = v_float32::nlanes;
+ v_float32 b1 = vg_load_f32(in2x);
+ vx_store(outx, div_op(s_tag, a1, b1, v_scale));  // the div_op result is stored directly: no zero-divisor masking and no rounding on this path
+}
+// Float-output front end: loads one dividend vector from in1x and forwards to the float-output div_simd_impl.
+template<typename scale_tag_t, typename SRC, typename Tvec>
+CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, float* outx,
+ const v_float32& v_scale, const Tvec&)  // the last (zero-vector) argument is accepted but unused on this path
+{
+ v_float32 a1 = vg_load_f32(in1x);
+ div_simd_impl(s_tag, a1, in2x, outx, v_scale);
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int div_simd_common(scale_tag_t s_tag, const SRC in1[], const SRC in2[],
+ DST out[], const int length, float scale)
+{
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
if (length < nlanes)
return 0;
- v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+ const zero_vec_type_of_t<SRC> v_zero = vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+ v_float32 v_scale = vx_setall_f32(scale);
int x = 0;
for (;;)
{
for (; x <= length - nlanes; x += nlanes)
{
- v_float32 a1 = vg_load_f32(&in1[x]);
- v_float32 b1 = vg_load_f32(&in2[x]);
-
- vx_store(&out[x], div_op(t, a1, b1, scale));
+ div_hal(s_tag, &in1[x], &in2[x], &out[x], v_scale, v_zero);
}
if (x < length)
return x;
}
-//-------------------------------------------------------------------------------------------------
-
-template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[],
- const int length, double scale)
-{
- hal::div8u(in1, static_cast<size_t>(length), in2, static_cast<size_t>(length),
- out, static_cast<size_t>(length), length, 1, &scale);
- return length;
-}
-
-template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t, const short in1[], const short in2[], short out[],
- const int length, double scale)
-{
- hal::div16s(in1, static_cast<size_t>(length), in2, static_cast<size_t>(length),
- out, static_cast<size_t>(length), length, 1, &scale);
- return length;
-}
-
-//-------------------------------------------------------------------------------------------------
-
#define DIV_SIMD(SRC, DST) \
int div_simd(const SRC in1[], const SRC in2[], DST out[], \
const int length, double _scale) \
float fscale = static_cast<float>(_scale); \
if (std::fabs(fscale - 1.0f) <= FLT_EPSILON) \
{ \
- not_scale_tag t; \
- x = div_hal(t, in1, in2, out, length, _scale); \
+ x = div_simd_common(not_scale_tag{}, in1, in2, out, length, fscale); \
} \
else \
{ \
- scale_tag t; \
- x = div_hal(t, in1, in2, out, length, _scale); \
+ x = div_simd_common(scale_tag{}, in1, in2, out, length, fscale); \
} \
return x; \
}
else \
{ \
return arithmOpScalarScaled_simd_common(op_t, in, scalar, out, \
- length, scale); \
+ length, scale); \
} \
} \
case 3: \
v_float32 a6 = vg_load_f32(&in[x + 5 * nlanes / 2]);
arithmOpScalar_pack_store_c3(&out[x], v_round(v_select(v_mask1, v_zero, div_op(s_tag, a1, s1, v_scale))),
- v_round(v_select(v_mask2, v_zero, div_op(s_tag, a2, s2, v_scale))),
- v_round(v_select(v_mask3, v_zero, div_op(s_tag, a3, s3, v_scale))),
- v_round(v_select(v_mask1, v_zero, div_op(s_tag, a4, s1, v_scale))),
- v_round(v_select(v_mask2, v_zero, div_op(s_tag, a5, s2, v_scale))),
- v_round(v_select(v_mask3, v_zero, div_op(s_tag, a6, s3, v_scale))));
+ v_round(v_select(v_mask2, v_zero, div_op(s_tag, a2, s2, v_scale))),
+ v_round(v_select(v_mask3, v_zero, div_op(s_tag, a3, s3, v_scale))),
+ v_round(v_select(v_mask1, v_zero, div_op(s_tag, a4, s1, v_scale))),
+ v_round(v_select(v_mask2, v_zero, div_op(s_tag, a5, s2, v_scale))),
+ v_round(v_select(v_mask3, v_zero, div_op(s_tag, a6, s3, v_scale))));
}
if (x < length)
#undef ABSDIFFC_SIMD
+//-------------------------------------------------------------------------------------------------
+// short/ushort output: forwards to div_simd_impl with the scalar vector in both dividend slots (a1, a2) and inx as the per-element operand.
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+ std::is_same<DST, ushort>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+ const v_float32& v_scalar, DST* outx,
+ const v_float32& v_scale, const Tvec& v_zero)
+{
+ div_simd_impl(s_tag, v_scalar, v_scalar, inx, outx, v_scale, v_zero);
+}
+// uchar output: forwards to div_simd_impl with the scalar vector in all four dividend slots (a1..a4) and inx as the per-element operand.
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, uchar>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+ const v_float32& v_scalar, DST* outx,
+ const v_float32& v_scale, const Tvec& v_zero)
+{
+ div_simd_impl(s_tag, v_scalar, v_scalar, v_scalar, v_scalar, inx, outx, v_scale, v_zero);
+}
+// float output: forwards to the float-output div_simd_impl; the zero vector argument is accepted but unused.
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, float>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+ const v_float32& v_scalar, DST* outx,
+ const v_float32& v_scale, const Tvec&)
+{
+ div_simd_impl(s_tag, v_scalar, inx, outx, v_scale);
+}
+// Row loop used for chan 1, 2 and 4: runs divrc_simd_common_impl one output vector at a time. Returns 0 if length < nlanes, otherwise length.
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[],
+ const float scalar[], DST out[],
+ const int length, const float scale)
+{
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+// Fewer elements than one output vector: nothing is processed here.
+ if (length < nlanes)
+ return 0;
+// assumes scalar[] holds at least one full v_float32 of values - TODO confirm against the caller
+ v_float32 v_scalar = vx_load(scalar);
+ v_float32 v_scale = vx_setall_f32(scale);
+ zero_vec_type_of_t<SRC> v_zero =
+ vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+
+ int x = 0;
+ for (;;)
+ {
+ for (; x <= length - nlanes; x += nlanes)
+ {
+ divrc_simd_common_impl(s_tag, &in[x], v_scalar, &out[x], v_scale, v_zero);
+ }
+// Tail: step back so the final vector ends exactly at length; some elements are computed twice.
+ if (x < length)
+ {
+ x = length - nlanes;
+ continue; // process unaligned tail
+ }
+ break;
+ }
+ return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+// 3-channel step, uchar -> uchar: handles one v_uint8 vector; its four float quarters use s1, s2, s3, s1 in that order.
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, uchar* outx,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& v_scale,
+ const v_uint8& v_zero)
+{
+ v_uint8 div = vx_load(inx);
+ v_uint8 v_mask = (div == v_zero);
+// Widen 8 -> 16 -> 32 bits so each quarter can be converted to float.
+ v_uint16 div1 = v_expand_low(div);
+ v_uint16 div2 = v_expand_high(div);
+
+ v_float32 fdiv1 = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(div1)));
+ v_float32 fdiv2 = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(div1)));
+ v_float32 fdiv3 = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(div2)));
+ v_float32 fdiv4 = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(div2)));
+// The zero-input mask is applied to the packed 8-bit result, so masked lanes store 0.
+ vx_store(outx,
+ v_select(v_mask, v_zero, v_pack_u(v_pack(v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+ v_round(div_op(s_tag, s2, fdiv2, v_scale))),
+ v_pack(v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+ v_round(div_op(s_tag, s1, fdiv4, v_scale))))));
+}
+// 3-channel step, short/ushort -> uchar: two 16-bit input vectors produce one uchar vector; the float quarters use s1, s2, s3, s1.
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<SRC, short>::value ||
+ std::is_same<SRC, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, uchar* outx,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& v_scale,
+ const v_int16& v_zero)
+{
+ constexpr int nlanes = v_uint8::nlanes;
+// NOTE(review): ushort inputs are reinterpreted as signed 16-bit below - confirm values above 32767 behave as intended.
+ v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
+ v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
+
+ v_int16 v_mask1 = (div1 == v_zero);
+ v_int16 v_mask2 = (div2 == v_zero);
+
+ v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+ v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+ v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+ v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+// Zero-input lanes are cleared on the 16-bit halves before the final narrowing to uchar.
+ vx_store(outx,
+ v_pack_u(v_select(v_mask1, v_zero,
+ v_pack(v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+ v_round(div_op(s_tag, s2, fdiv2, v_scale)))),
+ v_select(v_mask2, v_zero,
+ v_pack(v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+ v_round(div_op(s_tag, s1, fdiv4, v_scale))))));
+}
+// 3-channel step, float -> uchar: four float input vectors produce one uchar vector and use s1, s2, s3, s1 in that order.
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, uchar* outx,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& v_scale,
+ const v_float32& v_zero)
+{
+ constexpr int nlanes = v_uint8::nlanes;
+
+ v_float32 fdiv1 = vg_load_f32(inx);
+ v_float32 fdiv2 = vg_load_f32(&inx[nlanes / 4]);
+ v_float32 fdiv3 = vg_load_f32(&inx[nlanes / 2]);
+ v_float32 fdiv4 = vg_load_f32(&inx[3 * nlanes / 4]);
+
+ v_float32 v_mask1 = (fdiv1 == v_zero);
+ v_float32 v_mask2 = (fdiv2 == v_zero);
+ v_float32 v_mask3 = (fdiv3 == v_zero);
+ v_float32 v_mask4 = (fdiv4 == v_zero);
+// Zero-input lanes are replaced by 0 in float, before rounding and narrowing.
+ vx_store(outx,
+ v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
+ v_round(v_select(v_mask2, v_zero, div_op(s_tag, s2, fdiv2, v_scale)))),
+ v_pack(v_round(v_select(v_mask3, v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
+ v_round(v_select(v_mask4, v_zero, div_op(s_tag, s1, fdiv4, v_scale))))));
+
+}
+// 3-channel row loop for uchar output: each iteration covers `lanes` elements through three divrc_simd_c3_calc calls. Returns the final x.
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], uchar out[],
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& v_scale,
+ const int length, const int nlanes, const int lanes)
+{
+ univ_zero_vec_type_of_t<SRC> v_zero =
+ vx_setall<typename univ_zero_vec_type_of_t<SRC>::lane_type>(0);
+
+ int x = 0;
+ for (;;)
+ {
+ for (; x <= length - lanes; x += lanes)
+ {
+ divrc_simd_c3_calc(s_tag, &in[x], &out[x], s1, s2, s3, v_scale, v_zero);
+ divrc_simd_c3_calc(s_tag, &in[x + nlanes], &out[x + nlanes], s2, s3, s1, v_scale, v_zero);  // scalar vectors are rotated by one position for the second vector
+ divrc_simd_c3_calc(s_tag, &in[x + 2 * nlanes], &out[x + 2 * nlanes], s3, s1, s2, v_scale, v_zero);  // and rotated once more for the third
+ }
+// Tail: step back so the final group ends exactly at length; some elements are computed twice.
+ if (x < length)
+ {
+ x = length - lanes;
+ continue; // process unaligned tail
+ }
+ break;
+ }
+ return x;
+}
+
+//---------------------------------------------------------------------------------------
+// 3-channel step, uchar -> short/ushort: reads 3 * nlanes uchar inputs and writes three 16-bit output vectors.
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+ std::is_same<DST, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, DST* outx,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& v_scale,
+ const v_int16& v_zero)
+{
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+ v_uint8 div = vx_load(inx);
+// The full 8-bit load supplies the first two 16-bit vectors; the third comes from a widening load at offset 2 * nlanes.
+ v_int16 div1 = v_reinterpret_as_s16(v_expand_low(div));
+ v_int16 div2 = v_reinterpret_as_s16(v_expand_high(div));
+ v_int16 div3 = v_reinterpret_as_s16(vx_load_expand(&inx[2 * nlanes]));
+
+ v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+ v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+ v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+ v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+ v_float32 fdiv5 = v_cvt_f32(v_expand_low(div3));
+ v_float32 fdiv6 = v_cvt_f32(v_expand_high(div3));
+// Scalar vectors are consumed in the order s1, s2 | s3, s1 | s2, s3 across the six float halves.
+ v_store_select(outx, div1, v_zero, v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+ v_round(div_op(s_tag, s2, fdiv2, v_scale)));
+ v_store_select(&outx[nlanes], div2, v_zero, v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+ v_round(div_op(s_tag, s1, fdiv4, v_scale)));
+ v_store_select(&outx[2*nlanes], div3, v_zero, v_round(div_op(s_tag, s2, fdiv5, v_scale)),
+ v_round(div_op(s_tag, s3, fdiv6, v_scale)));
+}
+// 3-channel step, short/ushort -> short/ushort: reads three 16-bit input vectors and writes three 16-bit output vectors.
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) ||
+ (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value) ||
+ (std::is_same<SRC, short>::value && std::is_same<DST, short>::value) ||
+ (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, DST* outx,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& v_scale,
+ const v_int16& v_zero)
+{
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+// NOTE(review): ushort inputs are reinterpreted as signed 16-bit below - confirm values above 32767 behave as intended.
+ v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
+ v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes]));
+ v_int16 div3 = v_reinterpret_as_s16(vx_load(&inx[2*nlanes]));
+
+ v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+ v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+ v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+ v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+ v_float32 fdiv5 = v_cvt_f32(v_expand_low(div3));
+ v_float32 fdiv6 = v_cvt_f32(v_expand_high(div3));
+// Scalar vectors are consumed in the order s1, s2 | s3, s1 | s2, s3 across the six float halves.
+ v_store_select(outx, div1, v_zero, v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+ v_round(div_op(s_tag, s2, fdiv2, v_scale)));
+ v_store_select(&outx[nlanes], div2, v_zero, v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+ v_round(div_op(s_tag, s1, fdiv4, v_scale)));
+ v_store_select(&outx[2*nlanes], div3, v_zero, v_round(div_op(s_tag, s2, fdiv5, v_scale)),
+ v_round(div_op(s_tag, s3, fdiv6, v_scale)));
+}
+// 3-channel step, float -> short/ushort: reads six float input vectors and writes three 16-bit output vectors.
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+ std::is_same<DST, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& v_scale,
+ const v_float32& v_zero)
+{
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+// Float vectors hold nlanes/2 elements each, hence the half-step offsets.
+ v_float32 fdiv1 = vg_load_f32(inx);
+ v_float32 fdiv2 = vg_load_f32(&inx[nlanes/2]);
+ v_float32 fdiv3 = vg_load_f32(&inx[nlanes]);
+ v_float32 fdiv4 = vg_load_f32(&inx[3*nlanes/2]);
+ v_float32 fdiv5 = vg_load_f32(&inx[2*nlanes]);
+ v_float32 fdiv6 = vg_load_f32(&inx[5*nlanes/2]);
+// Zero-input lanes are replaced by 0 in float before rounding; scalar order is s1, s2 | s3, s1 | s2, s3.
+ v_store_i16(outx, v_round(v_select(fdiv1 == v_zero, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
+ v_round(v_select(fdiv2 == v_zero, v_zero, div_op(s_tag, s2, fdiv2, v_scale))));
+ v_store_i16(&outx[nlanes], v_round(v_select(fdiv3 == v_zero, v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
+ v_round(v_select(fdiv4 == v_zero, v_zero, div_op(s_tag, s1, fdiv4, v_scale))));
+ v_store_i16(&outx[2*nlanes], v_round(v_select(fdiv5 == v_zero, v_zero, div_op(s_tag, s2, fdiv5, v_scale))),
+ v_round(v_select(fdiv6 == v_zero, v_zero, div_op(s_tag, s3, fdiv6, v_scale))));
+}
+// 3-channel row loop for short/ushort output: one divrc_simd_c3_calc call covers a whole group of `lanes` elements. Returns the final x.
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+ std::is_same<DST, ushort>::value, int>::type
+divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32& s1,
+ const v_float32& s2, const v_float32& s3,
+ const v_float32& v_scale, const int length,
+ const int, const int lanes)  // the unnamed int (nlanes at the call site) is not needed by this overload
+{
+ zero_vec_type_of_t<SRC> v_zero =
+ vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+
+ int x = 0;
+ for (;;)
+ {
+ for (; x <= length - lanes; x += lanes)
+ {
+ divrc_simd_c3_calc(s_tag, &in[x], &out[x], s1, s2, s3, v_scale, v_zero);
+ }
+// Tail: step back so the final group ends exactly at length; some elements are computed twice.
+ if (x < length)
+ {
+ x = length - lanes;
+ continue; // process unaligned tail
+ }
+ break;
+ }
+ return x;
+}
+
+//---------------------------------------------------------------------------------------
+// 3-channel row loop for float output: three vectors per iteration, paired with s1, s2, s3; no zero-input masking on this path.
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, float* out,
+ const v_float32& s1, const v_float32& s2,
+ const v_float32& s3, const v_float32& v_scale,
+ const int length, const int nlanes, const int lanes)
+{
+ int x = 0;
+ for (;;)
+ {
+ for (; x <= length - lanes; x += lanes)
+ {
+ v_float32 div1 = vg_load_f32(&in[x]);
+ v_float32 div2 = vg_load_f32(&in[x + nlanes]);
+ v_float32 div3 = vg_load_f32(&in[x + 2*nlanes]);
+// The div_op results are stored as-is.
+ vx_store(&out[x], div_op(s_tag, s1, div1, v_scale));
+ vx_store(&out[x + nlanes], div_op(s_tag, s2, div2, v_scale));
+ vx_store(&out[x + 2*nlanes], div_op(s_tag, s3, div3, v_scale));
+ }
+// Tail: step back so the final group ends exactly at length; some elements are computed twice.
+ if (x < length)
+ {
+ x = length - lanes;
+ continue; // process unaligned tail
+ }
+ break;
+ }
+ return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+// 3-channel entry: builds the three scalar vectors and dispatches to the divrc_simd_c3_impl overload matching DST. Returns 0 if length < 3 * nlanes.
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int divrc_simd_c3(scale_tag_t s_tag, const SRC in[],
+ const float scalar[], DST out[],
+ const int length, const float scale)
+{
+ constexpr int chan = 3;
+ constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+ constexpr int lanes = chan * nlanes;
+// One group is three output vectors; shorter rows are not processed here.
+ if (length < lanes)
+ return 0;
+
+ v_float32 v_scale = vx_setall_f32(scale);
+// s1..s3 are overlapping loads from scalar[]; with 32-byte vectors the offsets of s2 and s3 are swapped (presumably because 8 float lanes shift the 3-channel phase by 2, not 1 - confirm how the caller fills scalar[]).
+ v_float32 s1 = vx_load(scalar);
+#if CV_SIMD_WIDTH == 32
+ v_float32 s2 = vx_load(&scalar[2]);
+ v_float32 s3 = vx_load(&scalar[1]);
+#else
+ v_float32 s2 = vx_load(&scalar[1]);
+ v_float32 s3 = vx_load(&scalar[2]);
+#endif
+ return divrc_simd_c3_impl(s_tag, in, out, s1, s2, s3, v_scale, length, nlanes, lanes);
+}
+// Defines divrc_simd for one (SRC, DST) pair: chan 1/2/4 use divrc_simd_common, chan 3 uses divrc_simd_c3; not_scale_tag is chosen when scale is within FLT_EPSILON of 1.
+#define DIVRC_SIMD(SRC, DST) \
+int divrc_simd(const float scalar[], const SRC in[], DST out[], \
+ const int length, const int chan, const float scale) \
+{ \
+ switch (chan) \
+ { \
+ case 1: \
+ case 2: \
+ case 4: \
+ { \
+ if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \
+ { \
+ return divrc_simd_common(not_scale_tag{}, in, scalar, \
+ out, length, scale); \
+ } \
+ else \
+ { \
+ return divrc_simd_common(scale_tag{}, in, scalar, out, \
+ length, scale); \
+ } \
+ } \
+ case 3: \
+ { \
+ if (std::fabs(scale - 1.0f) <= FLT_EPSILON) \
+ { \
+ return divrc_simd_c3(not_scale_tag{}, in, scalar, \
+ out, length, scale); \
+ } \
+ else \
+ { \
+ return divrc_simd_c3(scale_tag{}, in, scalar, out, \
+ length, scale); \
+ } \
+ } \
+ default: \
+ GAPI_Assert(chan <= 4); \
+ break; \
+ } \
+ return 0; \
+}
+// Instantiate divrc_simd for each (SRC, DST) pair listed below.
+DIVRC_SIMD(uchar, uchar)
+DIVRC_SIMD(ushort, uchar)
+DIVRC_SIMD(short, uchar)
+DIVRC_SIMD(float, uchar)
+DIVRC_SIMD(short, short)
+DIVRC_SIMD(ushort, short)
+DIVRC_SIMD(uchar, short)
+DIVRC_SIMD(float, short)
+DIVRC_SIMD(ushort, ushort)
+DIVRC_SIMD(uchar, ushort)
+DIVRC_SIMD(short, ushort)
+DIVRC_SIMD(float, ushort)
+DIVRC_SIMD(uchar, float)
+DIVRC_SIMD(ushort, float)
+DIVRC_SIMD(short, float)
+DIVRC_SIMD(float, float)
+
+#undef DIVRC_SIMD
+
//-------------------------
//
// Fluid kernels: Split3
//
//-------------------------
-int split3_simd(const uchar in[], uchar out1[], uchar out2[],
- uchar out3[], const int width)
+int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
+ const int width)
{
constexpr int nlanes = v_uint8::nlanes;
if (width < nlanes)