From c0b702a994f8debe0e30cf782f82f80cc2bfbc99 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 25 Sep 2014 00:17:23 +0400 Subject: [PATCH] cv::resize area 2x --- modules/imgproc/src/canny.cpp | 16 +++++---- modules/imgproc/src/corner.cpp | 18 +++++----- modules/imgproc/src/imgwarp.cpp | 67 ++++++++++++++++++++++++++++++++++- modules/imgproc/test/test_imgwarp.cpp | 48 +++++++++++++++++++++++++ 4 files changed, 132 insertions(+), 17 deletions(-) diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 11365a3..e9e64c5 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -365,8 +365,10 @@ void cv::Canny( InputArray _src, OutputArray _dst, for ( ; j <= width - 8; j += 8) { int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); - vst1q_s32(_norm + j, vaddq_s32(vmovl_s16(vget_low_s16(v_dx)), vmovl_s16(vget_low_s16(v_dy)))); - vst1q_s32(_norm + j + 4, vaddq_s32(vmovl_s16(vget_high_s16(v_dx)), vmovl_s16(vget_high_s16(v_dy)))); + vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))), + vabsq_s32(vmovl_s16(vget_low_s16(v_dy))))); + vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))), + vabsq_s32(vmovl_s16(vget_high_s16(v_dy))))); } #endif for ( ; j < width; ++j) @@ -397,13 +399,13 @@ void cv::Canny( InputArray _src, OutputArray _dst, for ( ; j <= width - 8; j += 8) { int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); - int32x4_t v_dxp = vmovl_s16(vget_low_s16(v_dx)), v_dyp = vmovl_s16(vget_low_s16(v_dy)); - int32x4_t v_dst = vaddq_s32(vmulq_s32(v_dxp, v_dxp), vmulq_s32(v_dyp, v_dyp)); + int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy); + int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp); vst1q_s32(_norm + j, v_dst); - v_dxp = vmovl_s16(vget_high_s16(v_dx)), v_dyp = vmovl_s16(vget_high_s16(v_dy)); - v_dst = vaddq_s32(vmulq_s32(v_dxp, v_dxp), vmulq_s32(v_dyp, v_dyp)); - vst1q_s32(_norm + j, v_dst); + v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy); + v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp); + vst1q_s32(_norm + j + 4, v_dst); } #endif for ( ; j < width; ++j) diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 01265a5..60d14cf 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -147,16 +147,15 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) } } #elif CV_NEON - float32x4_t v_k = vdupq_n_f32((float)k)); + float32x4_t v_k = vdupq_n_f32((float)k); for( ; j <= size.width - 4; j += 4 ) { float32x4x3_t v_src = vld3q_f32(cov + j + 3); float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2]; - float32x4_t v_ac_bb = vsubq_f32(vmulq_f32(v_a, v_c), vmulq_f32(v_b, v_b)); + float32x4_t v_ac_bb = vmlsq_f32(vmulq_f32(v_a, v_c), v_b, v_b); float32x4_t v_ac = vaddq_f32(v_a, v_c); - float32x4_t v_prod = vmulq_f32(v_k, vmulq_f32(v_ac, v_ac)); - vst1q_f32(dst + j, vsubq_f32(v_ac_bb, v_prod)); + vst1q_f32(dst + j, vmlsq_f32(v_ac_bb, v_k, vmulq_f32(v_ac, v_ac))); } #endif @@ -619,10 +618,11 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord if( src.depth() == CV_8U ) factor *= 255; factor = 1./(factor * factor * factor); + float factor_f = (float)factor; #if CV_SSE2 volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); - __m128 v_factor = _mm_set1_ps((float)factor), v_m2 = _mm_set1_ps(-2.0f); + __m128 v_factor = _mm_set1_ps(factor_f), v_m2 = _mm_set1_ps(-2.0f); #endif Size size = src.size(); @@ -657,10 +657,10 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord for( ; j <= size.width - 4; j += 4 ) { float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j); - float32x4_t v_s1 = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j))); - float32x4_t v_s2 = vmulq_f32(v_dy, vmulq_f32(v_dy, vld1q_f32(d2xdata + j))); - float32x4_t v_s3 = vmulq_f32(v_dx, vmulq_f32(v_dy, vld1q_f32(dxydata + j))); - vst1q_f32(dstdata + j, vaddq_f32(vaddq_f32(v_s1, v_s2), vmulq_n_f32(v_s3, -2.0f))); + float32x4_t v_s = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j))); + v_s = vmlaq_f32(v_s, vld1q_f32(d2xdata + j), vmulq_f32(v_dy, v_dy)); + v_s = vmlaq_f32(v_s, vld1q_f32(dxydata + j), vmulq_n_f32(vmulq_f32(v_dy, v_dx), -2)); + vst1q_f32(dstdata + j, vmulq_n_f32(v_s, factor_f)); } #endif diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index fd5f191..01fc592 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1322,7 +1322,72 @@ struct ResizeAreaFastNoVec { return 0; } }; -#if CV_SSE2 +#if CV_NEON + +class ResizeAreaFastVec_SIMD_8u +{ +public: + ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const uchar* S, uchar* D, int w) const + { + int dx = 0; + const uchar* S0 = S, * S1 = S0 + step; + + uint16x8_t v_2 = vdupq_n_u16(2); + + if (cn == 1) + { + for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) + { + uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); + + uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); + v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); + v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); + + uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); + v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); + v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); + + vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); + + uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); + uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); + uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); + uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); + + uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), + vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); + uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), + vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); + uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); + + vst1_u8(D, vmovn_u16(v_dst)); + } + } + + return dx; + } + +private: + int cn, step; +}; + +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16u; + +#elif CV_SSE2 + class ResizeAreaFastVec_SIMD_8u { public: diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp index 3146ff7..91470cf 100644 --- a/modules/imgproc/test/test_imgwarp.cpp +++ b/modules/imgproc/test/test_imgwarp.cpp @@ -1545,4 +1545,52 @@ TEST(Imgproc_InitUndistortMap, accuracy) { CV_UndistortMapTest test; test.safe_r TEST(Imgproc_GetRectSubPix, accuracy) { CV_GetRectSubPixTest test; test.safe_run(); } TEST(Imgproc_GetQuadSubPix, accuracy) { CV_GetQuadSubPixTest test; test.safe_run(); } +////////////////////////////////////////////////////////////////////////// + +template +void resizeArea(const cv::Mat & src, cv::Mat & dst) +{ + int cn = src.channels(); + + for (int y = 0; y < dst.rows; ++y) + { + const T * sptr0 = src.ptr(y << 1); + const T * sptr1 = src.ptr((y << 1) + 1); + T * dptr = dst.ptr(y); + + for (int x = 0; x < dst.cols * cn; x += cn) + { + int x1 = x << 1; + + for (int c = 0; c < cn; ++c) + { + WT sum = WT(sptr0[x1 + c]) + WT(sptr0[x1 + c + cn]); + sum += WT(sptr1[x1 + c]) + WT(sptr1[x1 + c + cn]) + (WT)(2); + + dptr[x + c] = cv::saturate_cast(sum >> 2); + } + } + } +} + +TEST(Resize, Area_half) +{ + int types[] = { CV_8UC1, CV_8UC4 }; + + for (int i = 0, size = sizeof(types) / sizeof(types[0]); i < size; ++i) + { + int type = types[i]; + cv::Mat src(100, 100, type), dst_actual(50, 50, type), dst_reference(50, 50, type); + + if (CV_MAT_DEPTH(type) == CV_8U) + resizeArea(src, dst_reference); + else + CV_Assert(0); + + cv::resize(src, dst_actual, dst_actual.size(), 0, 0, cv::INTER_AREA); + + ASSERT_EQ(0, cvtest::norm(dst_reference, dst_actual, cv::NORM_INF)); + } +} + /* End of file. */ -- 2.7.4