From: Tomoaki Teshima
Date: Fri, 20 Jan 2017 10:22:44 +0000 (+0900)
Subject: use universal intrinsic in corner detection series
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1257^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=062d2179eb8e755b58e07cd761fdfe3faf64d521;p=platform%2Fupstream%2Fopencv.git

use universal intrinsic in corner detection series
---

diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp
index 7f3bad5..dc84897 100644
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -43,6 +43,7 @@

 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"

 namespace cv
 {
@@ -51,8 +52,8 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
 {
     int i, j;
     Size size = _cov.size();
-#if CV_SSE
-    volatile bool simd = checkHardwareSupport(CV_CPU_SSE);
+#if CV_SIMD128
+    bool simd = hasSIMD128();
 #endif

     if( _cov.isContinuous() && _dst.isContinuous() )
@@ -66,44 +67,21 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
         const float* cov = _cov.ptr<float>(i);
         float* dst = _dst.ptr<float>(i);
         j = 0;
-    #if CV_SSE
+    #if CV_SIMD128
         if( simd )
         {
-            __m128 half = _mm_set1_ps(0.5f);
-            for( ; j <= size.width - 4; j += 4 )
+            v_float32x4 half = v_setall_f32(0.5f);
+            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
             {
-                __m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x
-                __m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x
-                __m128 t2 = _mm_loadu_ps(cov + j*3 + 6); // a2 b2 c2 x
-                __m128 t3 = _mm_loadu_ps(cov + j*3 + 9); // a3 b3 c3 x
-                __m128 a, b, c, t;
-                t = _mm_unpacklo_ps(t0, t1); // a0 a1 b0 b1
-                c = _mm_unpackhi_ps(t0, t1); // c0 c1 x x
-                b = _mm_unpacklo_ps(t2, t3); // a2 a3 b2 b3
-                c = _mm_movelh_ps(c, _mm_unpackhi_ps(t2, t3)); // c0 c1 c2 c3
-                a = _mm_movelh_ps(t, b);
-                b = _mm_movehl_ps(b, t);
-                a = _mm_mul_ps(a, half);
-                c = _mm_mul_ps(c, half);
-                t = _mm_sub_ps(a, c);
-                t = _mm_add_ps(_mm_mul_ps(t, t), _mm_mul_ps(b,b));
-                a = _mm_sub_ps(_mm_add_ps(a, c), _mm_sqrt_ps(t));
-                _mm_storeu_ps(dst + j, a);
+                v_float32x4 v_a, v_b, v_c, v_t;
+                v_load_deinterleave(cov + j*3, v_a, v_b, v_c);
+                v_a *= half;
+                v_c *= half;
+                v_t = v_a - v_c;
+                v_t = v_muladd(v_b, v_b, (v_t * v_t));
+                v_store(dst + j, (v_a + v_c) - v_sqrt(v_t));
             }
         }
-    #elif CV_NEON
-        float32x4_t v_half = vdupq_n_f32(0.5f);
-        for( ; j <= size.width - 4; j += 4 )
-        {
-            float32x4x3_t v_src = vld3q_f32(cov + j * 3);
-            float32x4_t v_a = vmulq_f32(v_src.val[0], v_half);
-            float32x4_t v_b = v_src.val[1];
-            float32x4_t v_c = vmulq_f32(v_src.val[2], v_half);
-
-            float32x4_t v_t = vsubq_f32(v_a, v_c);
-            v_t = vmlaq_f32(vmulq_f32(v_t, v_t), v_b, v_b);
-            vst1q_f32(dst + j, vsubq_f32(vaddq_f32(v_a, v_c), cv_vsqrtq_f32(v_t)));
-        }
     #endif
         for( ; j < size.width; j++ )
         {
@@ -120,8 +98,8 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
 {
     int i, j;
     Size size = _cov.size();
-#if CV_SSE
-    volatile bool simd = checkHardwareSupport(CV_CPU_SSE);
+#if CV_SIMD128
+    bool simd = hasSIMD128();
 #endif

     if( _cov.isContinuous() && _dst.isContinuous() )
@@ -136,40 +114,21 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
         float* dst = _dst.ptr<float>(i);
         j = 0;

-    #if CV_SSE
+    #if CV_SIMD128
         if( simd )
         {
-            __m128 k4 = _mm_set1_ps((float)k);
-            for( ; j <= size.width - 4; j += 4 )
+            v_float32x4 v_k = v_setall_f32((float)k);
+
+            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
             {
-                __m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x
-                __m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x
-                __m128 t2 = _mm_loadu_ps(cov + j*3 + 6); // a2 b2 c2 x
-                __m128 t3 = _mm_loadu_ps(cov + j*3 + 9); // a3 b3 c3 x
-                __m128 a, b, c, t;
-                t = _mm_unpacklo_ps(t0, t1); // a0 a1 b0 b1
-                c = _mm_unpackhi_ps(t0, t1); // c0 c1 x x
-                b = _mm_unpacklo_ps(t2, t3); // a2 a3 b2 b3
-                c = _mm_movelh_ps(c, _mm_unpackhi_ps(t2, t3)); // c0 c1 c2 c3
-                a = _mm_movelh_ps(t, b);
-                b = _mm_movehl_ps(b, t);
-                t = _mm_add_ps(a, c);
-                a = _mm_sub_ps(_mm_mul_ps(a, c), _mm_mul_ps(b, b));
-                t = _mm_mul_ps(_mm_mul_ps(k4, t), t);
-                a = _mm_sub_ps(a, t);
-                _mm_storeu_ps(dst + j, a);
-            }
-        }
-    #elif CV_NEON
-        float32x4_t v_k = vdupq_n_f32((float)k);
+                v_float32x4 v_a, v_b, v_c;
+                v_load_deinterleave(cov + j * 3, v_a, v_b, v_c);

-        for( ; j <= size.width - 4; j += 4 )
-        {
-            float32x4x3_t v_src = vld3q_f32(cov + j * 3);
-            float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2];
-            float32x4_t v_ac_bb = vmlsq_f32(vmulq_f32(v_a, v_c), v_b, v_b);
-            float32x4_t v_ac = vaddq_f32(v_a, v_c);
-            vst1q_f32(dst + j, vmlsq_f32(v_ac_bb, v_k, vmulq_f32(v_ac, v_ac)));
+                v_float32x4 v_ac_bb = v_a * v_c - v_b * v_b;
+                v_float32x4 v_ac = v_a + v_c;
+                v_float32x4 v_dst = v_ac_bb - v_k * v_ac * v_ac;
+                v_store(dst + j, v_dst);
+            }
         }
     #endif

@@ -272,8 +231,8 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
     if (tegra::useTegra() && tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType))
         return;
 #endif
-#if CV_SSE2
-    bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#if CV_SIMD128
+    bool haveSimd = hasSIMD128();
 #endif

     int depth = src.depth();
@@ -309,44 +268,20 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
         const float* dydata = Dy.ptr<float>(i);
         j = 0;

-    #if CV_NEON
-        for( ; j <= size.width - 4; j += 4 )
+    #if CV_SIMD128
+        if (haveSimd)
         {
-            float32x4_t v_dx = vld1q_f32(dxdata + j);
-            float32x4_t v_dy = vld1q_f32(dydata + j);
+            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
+            {
+                v_float32x4 v_dx = v_load(dxdata + j);
+                v_float32x4 v_dy = v_load(dydata + j);

-            float32x4x3_t v_dst;
-            v_dst.val[0] = vmulq_f32(v_dx, v_dx);
-            v_dst.val[1] = vmulq_f32(v_dx, v_dy);
-            v_dst.val[2] = vmulq_f32(v_dy, v_dy);
+                v_float32x4 v_dst0, v_dst1, v_dst2;
+                v_dst0 = v_dx * v_dx;
+                v_dst1 = v_dx * v_dy;
+                v_dst2 = v_dy * v_dy;

-            vst3q_f32(cov_data + j * 3, v_dst);
-        }
-    #elif CV_SSE2
-        if (haveSSE2)
-        {
-            for( ; j <= size.width - 8; j += 8 )
-            {
-                __m128 v_dx_0 = _mm_loadu_ps(dxdata + j);
-                __m128 v_dx_1 = _mm_loadu_ps(dxdata + j + 4);
-                __m128 v_dy_0 = _mm_loadu_ps(dydata + j);
-                __m128 v_dy_1 = _mm_loadu_ps(dydata + j + 4);
-
-                __m128 v_dx2_0 = _mm_mul_ps(v_dx_0, v_dx_0);
-                __m128 v_dxy_0 = _mm_mul_ps(v_dx_0, v_dy_0);
-                __m128 v_dy2_0 = _mm_mul_ps(v_dy_0, v_dy_0);
-                __m128 v_dx2_1 = _mm_mul_ps(v_dx_1, v_dx_1);
-                __m128 v_dxy_1 = _mm_mul_ps(v_dx_1, v_dy_1);
-                __m128 v_dy2_1 = _mm_mul_ps(v_dy_1, v_dy_1);
-
-                _mm_interleave_ps(v_dx2_0, v_dx2_1, v_dxy_0, v_dxy_1, v_dy2_0, v_dy2_1);
-
-                _mm_storeu_ps(cov_data + j * 3, v_dx2_0);
-                _mm_storeu_ps(cov_data + j * 3 + 4, v_dx2_1);
-                _mm_storeu_ps(cov_data + j * 3 + 8, v_dxy_0);
-                _mm_storeu_ps(cov_data + j * 3 + 12, v_dxy_1);
-                _mm_storeu_ps(cov_data + j * 3 + 16, v_dy2_0);
-                _mm_storeu_ps(cov_data + j * 3 + 20, v_dy2_1);
+                v_store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2);
             }
         }
     #endif
@@ -751,13 +686,10 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
     if( src.depth() == CV_8U )
         factor *= 255;
     factor = 1./(factor * factor * factor);
-#if CV_NEON || CV_SSE2
+#if CV_SIMD128
     float factor_f = (float)factor;
-#endif
-
-#if CV_SSE2
-    volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
-    __m128 v_factor = _mm_set1_ps(factor_f), v_m2 = _mm_set1_ps(-2.0f);
+    bool haveSimd = hasSIMD128();
+    v_float32x4 v_factor = v_setall_f32(factor_f), v_m2 = v_setall_f32(-2.0f);
 #endif

     Size size = src.size();
@@ -773,30 +705,21 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord

         j = 0;

-#if CV_SSE2
-        if (haveSSE2)
+#if CV_SIMD128
+        if (haveSimd)
         {
-            for( ; j <= size.width - 4; j += 4 )
+            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
             {
-                __m128 v_dx = _mm_loadu_ps((const float *)(dxdata + j));
-                __m128 v_dy = _mm_loadu_ps((const float *)(dydata + j));
-
-                __m128 v_s1 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dx), _mm_loadu_ps((const float *)(d2ydata + j)));
-                __m128 v_s2 = _mm_mul_ps(_mm_mul_ps(v_dy, v_dy), _mm_loadu_ps((const float *)(d2xdata + j)));
-                __m128 v_s3 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dy), _mm_loadu_ps((const float *)(dxydata + j)));
-                v_s1 = _mm_mul_ps(v_factor, _mm_add_ps(v_s1, _mm_add_ps(v_s2, _mm_mul_ps(v_s3, v_m2))));
-                _mm_storeu_ps(dstdata + j, v_s1);
+                v_float32x4 v_dx = v_load(dxdata + j);
+                v_float32x4 v_dy = v_load(dydata + j);
+
+                v_float32x4 v_s1 = (v_dx * v_dx) * v_load(d2ydata + j);
+                v_float32x4 v_s2 = v_muladd((v_dy * v_dy), v_load(d2xdata + j), v_s1);
+                v_float32x4 v_s3 = v_muladd((v_dy * v_dx) * v_load(dxydata + j), v_m2, v_s2);
+
+                v_store(dstdata + j, v_s3 * v_factor);
             }
         }
-#elif CV_NEON
-        for( ; j <= size.width - 4; j += 4 )
-        {
-            float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j);
-            float32x4_t v_s = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j)));
-            v_s = vmlaq_f32(v_s, vld1q_f32(d2xdata + j), vmulq_f32(v_dy, v_dy));
-            v_s = vmlaq_f32(v_s, vld1q_f32(dxydata + j), vmulq_n_f32(vmulq_f32(v_dy, v_dx), -2));
-            vst1q_f32(dstdata + j, vmulq_n_f32(v_s, factor_f));
-        }
 #endif
         for( ; j < size.width; j++ )
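
Note (not part of the patch): for readers less familiar with OpenCV's universal intrinsics, the pattern that replaces the hand-written SSE and NEON blocks above can be tried in isolation. The sketch below is hypothetical illustration only; the helper name calcMinEigenValRow and the small main() do not exist in corner.cpp, and it assumes an OpenCV build of this era with CV_SIMD128 enabled and opencv_core linked. It mirrors the calcMinEigenVal inner loop shown above, a deinterleaved load of the a/b/c covariation planes followed by v_muladd, v_sqrt and v_store, plus a scalar tail for the leftover columns.

    // Minimal sketch of the universal-intrinsic pattern used in calcMinEigenVal above.
    // Assumptions: OpenCV with CV_SIMD128 available, linked against opencv_core;
    // calcMinEigenValRow and main() are illustrative only, not part of the patch.
    #include <cmath>
    #include <cstdio>
    #include <vector>
    #include "opencv2/core/hal/intrin.hpp"

    // Minimum eigenvalue of the 2x2 covariation matrix [a b; b c], stored as
    // interleaved (a, b, c) float triplets, one triplet per pixel.
    static void calcMinEigenValRow(const float* cov, float* dst, int width)
    {
        int j = 0;
    #if CV_SIMD128
        if (cv::hasSIMD128())
        {
            cv::v_float32x4 half = cv::v_setall_f32(0.5f);
            for (; j <= width - cv::v_float32x4::nlanes; j += cv::v_float32x4::nlanes)
            {
                cv::v_float32x4 v_a, v_b, v_c, v_t;
                cv::v_load_deinterleave(cov + j * 3, v_a, v_b, v_c); // split a/b/c planes
                v_a *= half;
                v_c *= half;
                v_t = v_a - v_c;
                v_t = cv::v_muladd(v_b, v_b, v_t * v_t);             // b*b + ((a-c)/2)^2
                cv::v_store(dst + j, (v_a + v_c) - cv::v_sqrt(v_t)); // four results per store
            }
        }
    #endif
        for (; j < width; j++) // scalar tail for columns the vector loop did not cover
        {
            float a = cov[j * 3] * 0.5f, b = cov[j * 3 + 1], c = cov[j * 3 + 2] * 0.5f;
            dst[j] = (a + c) - std::sqrt((a - c) * (a - c) + b * b);
        }
    }

    int main()
    {
        // Five (a, b, c) triplets; width 5 exercises both the vector and scalar paths.
        std::vector<float> cov = { 4,0,1,  2,1,2,  3,0,3,  5,2,1,  1,0,1 };
        std::vector<float> dst(5);
        calcMinEigenValRow(cov.data(), dst.data(), 5);
        for (float v : dst)
            std::printf("%g\n", v);
        return 0;
    }

Because the same v_float32x4 source lowers to SSE on x86 and NEON on ARM, the separate #elif CV_NEON branches removed above are no longer needed, which is the point of the change.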