From d090fcf2fe2c8127453c2e1b93bc62c3f37151a5 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 26 Sep 2014 14:39:04 +0000 Subject: [PATCH] cv::moments (CV_8UC1) --- modules/imgproc/src/moments.cpp | 91 +++++++++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 13 deletions(-) diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index d68248c..b292d99 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -203,7 +203,7 @@ static Moments contourMoments( const Mat& contour ) \****************************************************************************************/ template -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { int operator() (const T *, int, WT &, WT &, WT &, MT &) { @@ -214,9 +214,9 @@ struct MomentsInTile_SSE #if CV_SSE2 template <> -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { - MomentsInTile_SSE() + MomentsInTile_SIMD() { useSIMD = checkHardwareSupport(CV_CPU_SSE2); } @@ -234,17 +234,16 @@ struct MomentsInTile_SSE for( ; x <= len - 8; x += 8 ) { __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); - qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); - __m128i px = _mm_mullo_epi16(p, qx); __m128i sx = _mm_mullo_epi16(qx, qx); + + qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); - qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx)); + qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx)); qx = _mm_add_epi16(qx, dx); } - int CV_DECL_ALIGNED(16) buf[4]; _mm_store_si128((__m128i*)buf, qx0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx1); @@ -258,17 +257,84 @@ struct MomentsInTile_SSE return x; } + int CV_DECL_ALIGNED(16) buf[4]; bool useSIMD; }; +#elif CV_NEON + +template <> +struct MomentsInTile_SIMD +{ + MomentsInTile_SIMD() + { + ushort CV_DECL_ALIGNED(8) init[4] = { 0, 1, 2, 3 }; + qx_init = vld1_u16(init); + v_step = vdup_n_u16(4); + } + + int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3) + { + int x = 0; + + uint32x4_t v_z = vdupq_n_u32(0), v_x0 = v_z, v_x1 = v_z, + v_x2 = v_z, v_x3 = v_z; + uint16x4_t qx = qx_init; + + for( ; x <= len - 8; x += 8 ) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(ptr + x)); + + // first part + uint32x4_t v_qx = vmovl_u16(qx); + uint16x4_t v_p = vget_low_u16(v_src); + uint32x4_t v_px = vmull_u16(qx, v_p); + + v_x0 = vaddw_u16(v_x0, v_p); + v_x1 = vaddq_u32(v_x1, v_px); + v_px = vmulq_u32(v_px, v_qx); + v_x2 = vaddq_u32(v_x2, v_px); + v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx)); + qx = vadd_u16(qx, v_step); + + // second part + v_qx = vmovl_u16(qx); + v_p = vget_high_u16(v_src); + v_px = vmull_u16(qx, v_p); + + v_x0 = vaddw_u16(v_x0, v_p); + v_x1 = vaddq_u32(v_x1, v_px); + v_px = vmulq_u32(v_px, v_qx); + v_x2 = vaddq_u32(v_x2, v_px); + v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx)); + + qx = vadd_u16(qx, v_step); + } + + vst1q_u32(buf, v_x0); + x0 = buf[0] + buf[1] + buf[2] + buf[3]; + vst1q_u32(buf, v_x1); + x1 = buf[0] + buf[1] + buf[2] + buf[3]; + vst1q_u32(buf, v_x2); + x2 = buf[0] + buf[1] + buf[2] + buf[3]; + vst1q_u32(buf, v_x3); + x3 = buf[0] + buf[1] + buf[2] + buf[3]; + + return x; + } + + uint CV_DECL_ALIGNED(16) buf[4]; + uint16x4_t qx_init, v_step; +}; + #endif #if CV_SSE4_1 template <> -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { - MomentsInTile_SSE() + MomentsInTile_SIMD() { useSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } @@ -302,9 +368,6 @@ struct MomentsInTile_SSE v_ix1 = _mm_add_epi32(v_ix1, v_delta); } - int CV_DECL_ALIGNED(16) buf[4]; - int64 CV_DECL_ALIGNED(16) buf64[2]; - _mm_store_si128((__m128i*)buf, v_x0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, v_x1); @@ -319,6 +382,8 @@ struct MomentsInTile_SSE return x; } + int CV_DECL_ALIGNED(16) buf[4]; + int64 CV_DECL_ALIGNED(16) buf64[2]; bool useSIMD; }; @@ -334,7 +399,7 @@ static void momentsInTile( const Mat& img, double* moments ) Size size = img.size(); int x, y; MT mom[10] = {0,0,0,0,0,0,0,0,0,0}; - MomentsInTile_SSE vop; + MomentsInTile_SIMD vop; for( y = 0; y < size.height; y++ ) { -- 2.7.4