From a84bbc62b1188fc39b18296a0c8cbe6de19e5bfe Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Fri, 18 Jan 2019 00:11:52 +0300 Subject: [PATCH] boundingRect() reworked to use wide universal intrinsics --- modules/imgproc/src/shapedescr.cpp | 210 +++++++++++++++++++++++-------------- 1 file changed, 132 insertions(+), 78 deletions(-) diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp index d505fde..436c74e 100644 --- a/modules/imgproc/src/shapedescr.cpp +++ b/modules/imgproc/src/shapedescr.cpp @@ -39,6 +39,8 @@ // //M*/ #include "precomp.hpp" +#include "opencv2/core/hal/intrin.hpp" + namespace cv { @@ -746,109 +748,161 @@ static Rect pointSetBoundingRect( const Mat& points ) if( npoints == 0 ) return Rect(); - const Point* pts = points.ptr(); - Point pt = pts[0]; +#if CV_SIMD + const int64_t* pts = points.ptr(); -#if CV_SSE4_2 - if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) + if( !is_float ) { - if( !is_float ) + v_int32 minval, maxval; + minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y + for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 ) { - __m128i minval, maxval; - minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y - - for( i = 1; i < npoints; i++ ) - { - __m128i ptXY = _mm_loadl_epi64((const __m128i*)&pts[i]); - minval = _mm_min_epi32(ptXY, minval); - maxval = _mm_max_epi32(ptXY, maxval); - } - xmin = _mm_cvtsi128_si32(minval); - ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); - xmax = _mm_cvtsi128_si32(maxval); - ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); + v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i)); + minval = v_min(ptXY2, minval); + maxval = v_max(ptXY2, maxval); } - else + minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval)))); + if( i <= npoints - v_int32::nlanes/4 ) { - __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); - minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); - - for( i = 1; i < npoints; i++ ) + v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i)))); + minval = v_min(ptXY, minval); + maxval = v_max(ptXY, maxval); + i += v_int64::nlanes/2; + } + for(int j = 16; j < CV_SIMD_WIDTH; j*=2) + { + minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval)))); + } + xmin = minval.get0(); + xmax = maxval.get0(); + ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0(); + ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0(); +#if CV_SIMD_WIDTH > 16 + if( i < npoints ) + { + v_int32x4 minval2, maxval2; + minval2 = maxval2 = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + for( i++; i < npoints; i++ ) { - ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]); - - minvalf = _mm_min_ps(minvalf, ptXY); - maxvalf = _mm_max_ps(maxvalf, ptXY); + v_int32x4 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + minval2 = v_min(ptXY, minval2); + maxval2 = v_max(ptXY, maxval2); } - - float xyminf[2], xymaxf[2]; - _mm_storel_pi((__m64*)xyminf, minvalf); - _mm_storel_pi((__m64*)xymaxf, maxvalf); - xmin = cvFloor(xyminf[0]); - ymin = cvFloor(xyminf[1]); - xmax = cvFloor(xymaxf[0]); - ymax = cvFloor(xymaxf[1]); + xmin = min(xmin, minval2.get0()); + xmax = max(xmax, maxval2.get0()); + ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()); + ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()); } +#endif } else -#endif { - if( !is_float ) + v_float32 minval, maxval; + minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y + for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 ) { - xmin = xmax = pt.x; - ymin = ymax = pt.y; - - for( i = 1; i < npoints; i++ ) + v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i)); + minval = v_min(ptXY2, minval); + maxval = v_max(ptXY2, maxval); + } + minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))); + if( i <= npoints - v_float32::nlanes/4 ) + { + v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i)))); + minval = v_min(ptXY, minval); + maxval = v_max(ptXY, maxval); + i += v_float32::nlanes/4; + } + for(int j = 16; j < CV_SIMD_WIDTH; j*=2) + { + minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))); + maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))); + } + xmin = cvFloor(minval.get0()); + xmax = cvFloor(maxval.get0()); + ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0()); + ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0()); +#if CV_SIMD_WIDTH > 16 + if( i < npoints ) + { + v_float32x4 minval2, maxval2; + minval2 = maxval2 = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + for( i++; i < npoints; i++ ) { - pt = pts[i]; + v_float32x4 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i)))); + minval2 = v_min(ptXY, minval2); + maxval2 = v_max(ptXY, maxval2); + } + xmin = min(xmin, cvFloor(minval2.get0())); + xmax = max(xmax, cvFloor(maxval2.get0())); + ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0())); + ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0())); + } +#endif + } +#else + const Point* pts = points.ptr(); + Point pt = pts[0]; - if( xmin > pt.x ) - xmin = pt.x; + if( !is_float ) + { + xmin = xmax = pt.x; + ymin = ymax = pt.y; - if( xmax < pt.x ) - xmax = pt.x; + for( i = 1; i < npoints; i++ ) + { + pt = pts[i]; - if( ymin > pt.y ) - ymin = pt.y; + if( xmin > pt.x ) + xmin = pt.x; - if( ymax < pt.y ) - ymax = pt.y; - } - } - else - { - Cv32suf v; - // init values - xmin = xmax = CV_TOGGLE_FLT(pt.x); - ymin = ymax = CV_TOGGLE_FLT(pt.y); + if( xmax < pt.x ) + xmax = pt.x; - for( i = 1; i < npoints; i++ ) - { - pt = pts[i]; - pt.x = CV_TOGGLE_FLT(pt.x); - pt.y = CV_TOGGLE_FLT(pt.y); + if( ymin > pt.y ) + ymin = pt.y; - if( xmin > pt.x ) - xmin = pt.x; + if( ymax < pt.y ) + ymax = pt.y; + } + } + else + { + Cv32suf v; + // init values + xmin = xmax = CV_TOGGLE_FLT(pt.x); + ymin = ymax = CV_TOGGLE_FLT(pt.y); - if( xmax < pt.x ) - xmax = pt.x; + for( i = 1; i < npoints; i++ ) + { + pt = pts[i]; + pt.x = CV_TOGGLE_FLT(pt.x); + pt.y = CV_TOGGLE_FLT(pt.y); - if( ymin > pt.y ) - ymin = pt.y; + if( xmin > pt.x ) + xmin = pt.x; - if( ymax < pt.y ) - ymax = pt.y; - } + if( xmax < pt.x ) + xmax = pt.x; + + if( ymin > pt.y ) + ymin = pt.y; - v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); - // because right and bottom sides of the bounding rectangle are not inclusive - // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil - v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); + if( ymax < pt.y ) + ymax = pt.y; } + + v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); + // because right and bottom sides of the bounding rectangle are not inclusive + // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil + v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); } +#endif return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1); } -- 2.7.4