for ( ; j <= width - 8; j += 8)
{
int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
- vst1q_s32(_norm + j, vaddq_s32(vmovl_s16(vget_low_s16(v_dx)), vmovl_s16(vget_low_s16(v_dy))));
- vst1q_s32(_norm + j + 4, vaddq_s32(vmovl_s16(vget_high_s16(v_dx)), vmovl_s16(vget_high_s16(v_dy))));
+ vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
+ vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
+ vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
+ vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
}
#endif
for ( ; j < width; ++j)
for ( ; j <= width - 8; j += 8)
{
int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
- int32x4_t v_dxp = vmovl_s16(vget_low_s16(v_dx)), v_dyp = vmovl_s16(vget_low_s16(v_dy));
- int32x4_t v_dst = vaddq_s32(vmulq_s32(v_dxp, v_dxp), vmulq_s32(v_dyp, v_dyp));
+ int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
+ int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
vst1q_s32(_norm + j, v_dst);
- v_dxp = vmovl_s16(vget_high_s16(v_dx)), v_dyp = vmovl_s16(vget_high_s16(v_dy));
- v_dst = vaddq_s32(vmulq_s32(v_dxp, v_dxp), vmulq_s32(v_dyp, v_dyp));
- vst1q_s32(_norm + j, v_dst);
+ v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
+ v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
+ vst1q_s32(_norm + j + 4, v_dst);
}
#endif
for ( ; j < width; ++j)
}
}
#elif CV_NEON
- float32x4_t v_k = vdupq_n_f32((float)k));
+ float32x4_t v_k = vdupq_n_f32((float)k);
for( ; j <= size.width - 4; j += 4 )
{
float32x4x3_t v_src = vld3q_f32(cov + j + 3);
float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2];
- float32x4_t v_ac_bb = vsubq_f32(vmulq_f32(v_a, v_c), vmulq_f32(v_b, v_b));
+ float32x4_t v_ac_bb = vmlsq_f32(vmulq_f32(v_a, v_c), v_b, v_b);
float32x4_t v_ac = vaddq_f32(v_a, v_c);
- float32x4_t v_prod = vmulq_f32(v_k, vmulq_f32(v_ac, v_ac));
- vst1q_f32(dst + j, vsubq_f32(v_ac_bb, v_prod));
+ vst1q_f32(dst + j, vmlsq_f32(v_ac_bb, v_k, vmulq_f32(v_ac, v_ac)));
}
#endif
if( src.depth() == CV_8U )
factor *= 255;
factor = 1./(factor * factor * factor);
+ float factor_f = (float)factor;
#if CV_SSE2
volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
- __m128 v_factor = _mm_set1_ps((float)factor), v_m2 = _mm_set1_ps(-2.0f);
+ __m128 v_factor = _mm_set1_ps(factor_f), v_m2 = _mm_set1_ps(-2.0f);
#endif
Size size = src.size();
for( ; j <= size.width - 4; j += 4 )
{
float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j);
- float32x4_t v_s1 = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j)));
- float32x4_t v_s2 = vmulq_f32(v_dy, vmulq_f32(v_dy, vld1q_f32(d2xdata + j)));
- float32x4_t v_s3 = vmulq_f32(v_dx, vmulq_f32(v_dy, vld1q_f32(dxydata + j)));
- vst1q_f32(dstdata + j, vaddq_f32(vaddq_f32(v_s1, v_s2), vmulq_n_f32(v_s3, -2.0f)));
+ float32x4_t v_s = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j)));
+ v_s = vmlaq_f32(v_s, vld1q_f32(d2xdata + j), vmulq_f32(v_dy, v_dy));
+ v_s = vmlaq_f32(v_s, vld1q_f32(dxydata + j), vmulq_n_f32(vmulq_f32(v_dy, v_dx), -2));
+ vst1q_f32(dstdata + j, vmulq_n_f32(v_s, factor_f));
}
#endif
{ return 0; }
};
-#if CV_SSE2
+#if CV_NEON
+
+// NEON kernel averaging 2x2 blocks of 8-bit samples taken from two source rows.
+// Only 1-channel and 4-channel layouts are vectorised; for any other channel
+// count operator() does nothing and reports 0 processed elements.
+class ResizeAreaFastVec_SIMD_8u
+{
+public:
+    // _cn   - number of interleaved channels
+    // _step - offset, in bytes, from the first source row to the second one
+    ResizeAreaFastVec_SIMD_8u(int _cn, int _step)
+        : cn(_cn), step(_step)
+    {
+    }
+
+    // Writes rounded 2x2 averages of the rows at S and S + step into D.
+    // w is the number of destination elements requested; the return value is
+    // how many of them were actually written (a multiple of the vector width),
+    // so the caller presumably finishes the remaining tail itself.
+    int operator() (const uchar* S, uchar* D, int w) const
+    {
+        const uchar* top = S;
+        const uchar* bottom = S + step;
+        // (a + b + c + d + 2) >> 2 rounds to nearest; the sum of four bytes
+        // plus 2 always fits into 16 bits.
+        const uint16x8_t v_round = vdupq_n_u16(2);
+        int dx = 0;
+
+        if (cn == 1)
+        {
+            // 32 source bytes per row -> 16 destination bytes per iteration.
+            for ( ; dx <= w - 16; dx += 16)
+            {
+                // vld2q_u8 de-interleaves: val[0] gets the even bytes, val[1]
+                // the odd ones, i.e. the two horizontal neighbours of a block.
+                const uint8x16x2_t v_top = vld2q_u8(top + 2 * dx);
+                const uint8x16x2_t v_bottom = vld2q_u8(bottom + 2 * dx);
+
+                const uint16x8_t v_sum_lo = vaddq_u16(
+                    vaddl_u8(vget_low_u8(v_top.val[0]), vget_low_u8(v_top.val[1])),
+                    vaddl_u8(vget_low_u8(v_bottom.val[0]), vget_low_u8(v_bottom.val[1])));
+                const uint16x8_t v_sum_hi = vaddq_u16(
+                    vaddl_u8(vget_high_u8(v_top.val[0]), vget_high_u8(v_top.val[1])),
+                    vaddl_u8(vget_high_u8(v_bottom.val[0]), vget_high_u8(v_bottom.val[1])));
+
+                const uint8x8_t v_avg_lo = vmovn_u16(vshrq_n_u16(vaddq_u16(v_sum_lo, v_round), 2));
+                const uint8x8_t v_avg_hi = vmovn_u16(vshrq_n_u16(vaddq_u16(v_sum_hi, v_round), 2));
+
+                vst1q_u8(D + dx, vcombine_u8(v_avg_lo, v_avg_hi));
+            }
+        }
+        else if (cn == 4)
+        {
+            // 16 source bytes per row (four 4-channel pixels) -> 8 destination
+            // bytes (two pixels) per iteration.
+            for ( ; dx <= w - 8; dx += 8)
+            {
+                const uint8x16_t v_top = vld1q_u8(top + 2 * dx);
+                const uint8x16_t v_bottom = vld1q_u8(bottom + 2 * dx);
+
+                // Widen to 16 bits; each 8-lane vector holds two adjacent pixels.
+                const uint16x8_t v_top_lo = vmovl_u8(vget_low_u8(v_top));
+                const uint16x8_t v_top_hi = vmovl_u8(vget_high_u8(v_top));
+                const uint16x8_t v_bottom_lo = vmovl_u8(vget_low_u8(v_bottom));
+                const uint16x8_t v_bottom_hi = vmovl_u8(vget_high_u8(v_bottom));
+
+                // Adding the low and high halves sums the two neighbouring
+                // pixels channel by channel.
+                const uint16x4_t v_out0 = vadd_u16(
+                    vadd_u16(vget_low_u16(v_top_lo), vget_high_u16(v_top_lo)),
+                    vadd_u16(vget_low_u16(v_bottom_lo), vget_high_u16(v_bottom_lo)));
+                const uint16x4_t v_out1 = vadd_u16(
+                    vadd_u16(vget_low_u16(v_top_hi), vget_high_u16(v_top_hi)),
+                    vadd_u16(vget_low_u16(v_bottom_hi), vget_high_u16(v_bottom_hi)));
+
+                const uint16x8_t v_avg = vshrq_n_u16(vaddq_u16(vcombine_u16(v_out0, v_out1), v_round), 2);
+
+                vst1_u8(D + dx, vmovn_u16(v_avg));
+            }
+        }
+
+        return dx;
+    }
+
+private:
+    int cn, step;
+};
+// No NEON kernel for 16-bit unsigned data in this branch: the SIMD name is just an alias of ResizeAreaFastNoVec<ushort, ushort>.
+typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
+
+#elif CV_SSE2
+
class ResizeAreaFastVec_SIMD_8u
{
public:
TEST(Imgproc_GetRectSubPix, accuracy) { CV_GetRectSubPixTest test; test.safe_run(); }
TEST(Imgproc_GetQuadSubPix, accuracy) { CV_GetQuadSubPixTest test; test.safe_run(); }
+//////////////////////////////////////////////////////////////////////////
+
+// Reference 2x box downscale: every destination sample is the rounded mean,
+// (a + b + c + d + 2) >> 2, of the matching 2x2 block of source samples.
+// T is the element type stored in the matrices, WT the wider type used for
+// accumulation. Source rows 2*y and 2*y + 1 are read for destination row y.
+template <typename T, typename WT>
+void resizeArea(const cv::Mat & src, cv::Mat & dst)
+{
+    const int channels = src.channels();
+    const int rowElems = dst.cols * channels;
+
+    for (int dy = 0; dy < dst.rows; ++dy)
+    {
+        const int sy = dy * 2;
+        const T * upper = src.ptr<T>(sy);
+        const T * lower = src.ptr<T>(sy + 1);
+        T * out = dst.ptr<T>(dy);
+
+        for (int dx = 0; dx < rowElems; dx += channels)
+        {
+            // First element of the 2x2 block in each of the two source rows.
+            const T * upperPx = upper + dx * 2;
+            const T * lowerPx = lower + dx * 2;
+
+            for (int c = 0; c < channels; ++c)
+            {
+                // Horizontal neighbours are one pixel (channels elements) apart.
+                WT acc = WT(upperPx[c]) + WT(upperPx[c + channels]);
+                acc += WT(lowerPx[c]) + WT(lowerPx[c + channels]) + (WT)(2);
+
+                out[dx + c] = cv::saturate_cast<T>(acc >> 2);
+            }
+        }
+    }
+}
+
+// Checks that cv::resize with INTER_AREA, when halving both dimensions, gives
+// exactly the same result as the resizeArea reference for 1- and 4-channel
+// 8-bit images.
+TEST(Resize, Area_half)
+{
+    int types[] = { CV_8UC1, CV_8UC4 };
+
+    // Fixed seed: the input must be well-defined and identical on every run.
+    cv::RNG rng(17);
+
+    for (int i = 0, size = sizeof(types) / sizeof(types[0]); i < size; ++i)
+    {
+        int type = types[i];
+        cv::Mat src(100, 100, type), dst_actual(50, 50, type), dst_reference(50, 50, type);
+
+        // A freshly constructed Mat holds uninitialized memory; fill it so both
+        // implementations are compared on real, reproducible data.
+        rng.fill(src, cv::RNG::UNIFORM, 0, 256);
+
+        if (CV_MAT_DEPTH(type) == CV_8U)
+            resizeArea<uchar, ushort>(src, dst_reference);
+        else
+            CV_Assert(0);
+
+        cv::resize(src, dst_actual, dst_actual.size(), 0, 0, cv::INTER_AREA);
+
+        ASSERT_EQ(0, cvtest::norm(dst_reference, dst_actual, cv::NORM_INF));
+    }
+}
+
/* End of file. */