int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
+ for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#endif
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
- for( ; i < len - inc_i; i += inc_i, j += inc_j)
+ for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#endif
{
template<typename T, typename DT, typename WT>
-struct cvtScaleAbs_SSE2
+struct cvtScaleAbs_SIMD
{
int operator () (const T *, DT *, int, WT, WT) const
{
#if CV_SSE2
template <>
-struct cvtScaleAbs_SSE2<uchar, uchar, float>
+struct cvtScaleAbs_SIMD<uchar, uchar, float>
{
int operator () (const uchar * src, uchar * dst, int width,
float scale, float shift) const
};
template <>
-struct cvtScaleAbs_SSE2<ushort, uchar, float>
+struct cvtScaleAbs_SIMD<ushort, uchar, float>
{
int operator () (const ushort * src, uchar * dst, int width,
float scale, float shift) const
};
template <>
-struct cvtScaleAbs_SSE2<short, uchar, float>
+struct cvtScaleAbs_SIMD<short, uchar, float>
{
int operator () (const short * src, uchar * dst, int width,
float scale, float shift) const
};
template <>
-struct cvtScaleAbs_SSE2<int, uchar, float>
+struct cvtScaleAbs_SIMD<int, uchar, float>
{
int operator () (const int * src, uchar * dst, int width,
float scale, float shift) const
};
template <>
-struct cvtScaleAbs_SSE2<float, uchar, float>
+struct cvtScaleAbs_SIMD<float, uchar, float>
{
int operator () (const float * src, uchar * dst, int width,
float scale, float shift) const
}
};
+#elif CV_NEON
+
+template <>
+struct cvtScaleAbs_SIMD<float, uchar, float>
+{
+ int operator () (const float * src, uchar * dst, int width,
+ float scale, float shift) const
+ {
+ int x = 0;
+ float32x4_t v_shift = vdupq_n_f32(shift);
+
+ for ( ; x <= width - 8; x += 8)
+ {
+ float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale);
+ v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
+ uint16x4_t v_dsti_0 = vqmovun_s32(vcvtq_s32_f32(v_dst_0));
+
+ float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale);
+ v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
+ uint16x4_t v_dsti_1 = vqmovun_s32(vcvtq_s32_f32(v_dst_1));
+
+ uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
+ vst1_u8(dst + x, vqmovn_u16(v_dst));
+ }
+
+ return x;
+ }
+};
+
#endif
template<typename T, typename DT, typename WT> static void
{
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
- cvtScaleAbs_SSE2<T, DT, WT> vop;
+ cvtScaleAbs_SIMD<T, DT, WT> vop;
for( ; size.height--; src += sstep, dst += dstep )
{