From: Vitaly Tuzov
Date: Thu, 13 Dec 2018 11:20:22 +0000 (+0300)
Subject: Merge pull request #13334 from terfendail:histogram_wintr
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~376^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3903174f7c9475b51c4bc81063f744eb405a7d90;p=platform%2Fupstream%2Fopencv.git

Merge pull request #13334 from terfendail:histogram_wintr

* Added performance test for compareHist
* Reworked compareHist to use wide universal intrinsics
* Disabled vectorization for CV_COMP_CORREL and CV_COMP_BHATTACHARYYA if f64 is unsupported
---

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index af4efa2..19de221 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1125,6 +1125,12 @@ inline float v_reduce_sum(const v_float32x8& a)
     return _mm_cvtss_f32(s1);
 }
 
+inline double v_reduce_sum(const v_float64x4& a)
+{
+    __m256d s0 = _mm256_hadd_pd(a.val, a.val);
+    return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(s0), _v256_extract_high(s0)));
+}
+
 inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
                                  const v_float32x8& c, const v_float32x8& d)
 {
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 1b35896..608dc97 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -984,6 +984,13 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
 
+#if CV_SIMD128_64F
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
+}
+#endif
+
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
 {
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 24a34a3..f7a67da 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1456,6 +1456,13 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
 
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    double CV_DECL_ALIGNED(32) idx[2];
+    v_store_aligned(idx, a);
+    return idx[0] + idx[1];
+}
+
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
 {
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index efea72c..9506adf 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -716,6 +716,11 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
 
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    return vec_extract(vec_add(a.val, vec_sld(a.val, a.val, 8)), 0);
+}
+
 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
 inline scalartype v_reduce_##suffix(const _Tpvec& a) \
 { \
diff --git a/modules/imgproc/perf/perf_histogram.cpp b/modules/imgproc/perf/perf_histogram.cpp
index eca97e3..4f54e94 100644
--- a/modules/imgproc/perf/perf_histogram.cpp
+++ b/modules/imgproc/perf/perf_histogram.cpp
@@ -116,6 +116,31 @@ PERF_TEST_P(MatSize, equalizeHist,
 }
 #undef MatSize
 
+typedef TestBaseWithParam< tuple<int, int> > Dim_Cmpmethod;
+PERF_TEST_P(Dim_Cmpmethod, compareHist,
+            testing::Combine(testing::Values(1, 3),
+                             testing::Values(HISTCMP_CORREL, HISTCMP_CHISQR, HISTCMP_INTERSECT, HISTCMP_BHATTACHARYYA, HISTCMP_CHISQR_ALT, HISTCMP_KL_DIV))
+            )
+{
+    int dims = get<0>(GetParam());
+    int method = get<1>(GetParam());
+    int histSize[] = { 2048, 128, 64 };
+
+    Mat hist1(dims, histSize, CV_32FC1);
+    Mat hist2(dims, histSize, CV_32FC1);
+    randu(hist1, 0, 256);
+    randu(hist2, 0, 256);
+
+    declare.in(hist1.reshape(1, 256), hist2.reshape(1, 256));
+
+    TEST_CYCLE()
+    {
+        compareHist(hist1, hist2, method);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
 typedef tuple<Size, double> Sz_ClipLimit_t;
 typedef TestBaseWithParam<Sz_ClipLimit_t> Sz_ClipLimit;
diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp
index 60cb363..a53a45e 100644
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -41,6 +41,7 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencv2/core/openvx/ovx_defs.hpp"
 
@@ -1938,10 +1939,6 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
 
     CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() );
 
-#if CV_SSE2
-    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
-#endif
-
     for( size_t i = 0; i < it.nplanes; i++, ++it )
     {
         const float* h1 = it.planes[0].ptr<float>();
@@ -1961,50 +1958,63 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_CORREL )
         {
-            #if CV_SSE2
-            if (haveSIMD)
+#if CV_SIMD_64F
+            v_float64 v_s1 = vx_setzero_f64();
+            v_float64 v_s2 = vx_setzero_f64();
+            v_float64 v_s11 = vx_setzero_f64();
+            v_float64 v_s12 = vx_setzero_f64();
+            v_float64 v_s22 = vx_setzero_f64();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
             {
-                __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1;
-                __m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1;
-
-                for ( ; j <= len - 4; j += 4)
-                {
-                    __m128 v_a = _mm_loadu_ps(h1 + j);
-                    __m128 v_b = _mm_loadu_ps(h2 + j);
-
-                    // 0-1
-                    __m128d v_ad = _mm_cvtps_pd(v_a);
-                    __m128d v_bd = _mm_cvtps_pd(v_b);
-                    v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
-                    v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
-                    v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
-                    v_s1 = _mm_add_pd(v_s1, v_ad);
-                    v_s2 = _mm_add_pd(v_s2, v_bd);
-
-                    // 2-3
-                    v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
-                    v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
-                    v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
-                    v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
-                    v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
-                    v_s1 = _mm_add_pd(v_s1, v_ad);
-                    v_s2 = _mm_add_pd(v_s2, v_bd);
-                }
-
-                double CV_DECL_ALIGNED(16) ar[10];
-                _mm_store_pd(ar, v_s12);
-                _mm_store_pd(ar + 2, v_s11);
-                _mm_store_pd(ar + 4, v_s22);
-                _mm_store_pd(ar + 6, v_s1);
-                _mm_store_pd(ar + 8, v_s2);
-
-                s12 += ar[0] + ar[1];
-                s11 += ar[2] + ar[3];
-                s22 += ar[4] + ar[5];
-                s1 += ar[6] + ar[7];
-                s2 += ar[8] + ar[9];
+                v_float32 v_a = vx_load(h1 + j);
+                v_float32 v_b = vx_load(h2 + j);
+
+                // 0-1
+                v_float64 v_ad = v_cvt_f64(v_a);
+                v_float64 v_bd = v_cvt_f64(v_b);
+                v_s12 = v_muladd(v_ad, v_bd, v_s12);
+                v_s11 = v_muladd(v_ad, v_ad, v_s11);
+                v_s22 = v_muladd(v_bd, v_bd, v_s22);
+                v_s1 += v_ad;
+                v_s2 += v_bd;
+
+                // 2-3
+                v_ad = v_cvt_f64_high(v_a);
+                v_bd = v_cvt_f64_high(v_b);
+                v_s12 = v_muladd(v_ad, v_bd, v_s12);
+                v_s11 = v_muladd(v_ad, v_ad, v_s11);
+                v_s22 = v_muladd(v_bd, v_bd, v_s22);
+                v_s1 += v_ad;
+                v_s2 += v_bd;
             }
-            #endif
+            s12 += v_reduce_sum(v_s12);
+            s11 += v_reduce_sum(v_s11);
+            s22 += v_reduce_sum(v_s22);
+            s1 += v_reduce_sum(v_s1);
+            s2 += v_reduce_sum(v_s2);
+#elif CV_SIMD && 0 //Disable vectorization for CV_COMP_CORREL if f64 is unsupported due to low precision
+            v_float32 v_s1 = vx_setzero_f32();
+            v_float32 v_s2 = vx_setzero_f32();
+            v_float32 v_s11 = vx_setzero_f32();
+            v_float32 v_s12 = vx_setzero_f32();
+            v_float32 v_s22 = vx_setzero_f32();
+            for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_float32 v_a = vx_load(h1 + j);
+                v_float32 v_b = vx_load(h2 + j);
+
+                v_s12 = v_muladd(v_a, v_b, v_s12);
+                v_s11 = v_muladd(v_a, v_a, v_s11);
+                v_s22 = v_muladd(v_b, v_b, v_s22);
+                v_s1 += v_a;
+                v_s2 += v_b;
+            }
+            s12 += v_reduce_sum(v_s12);
+            s11 += v_reduce_sum(v_s11);
+            s22 += v_reduce_sum(v_s22);
+            s1 += v_reduce_sum(v_s1);
+            s2 += v_reduce_sum(v_s2);
+#endif
             for( ; j < len; j++ )
             {
                 double a = h1[j];
@@ -2019,67 +2029,68 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_INTERSECT )
         {
-            #if CV_NEON
-            float32x4_t v_result = vdupq_n_f32(0.0f);
-            for( ; j <= len - 4; j += 4 )
-                v_result = vaddq_f32(v_result, vminq_f32(vld1q_f32(h1 + j), vld1q_f32(h2 + j)));
-            float CV_DECL_ALIGNED(16) ar[4];
-            vst1q_f32(ar, v_result);
-            result += ar[0] + ar[1] + ar[2] + ar[3];
-            #elif CV_SSE2
-            if (haveSIMD)
+#if CV_SIMD_64F
+            v_float64 v_result = vx_setzero_f64();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
             {
-                __m128d v_result = _mm_setzero_pd();
-                for ( ; j <= len - 4; j += 4)
-                {
-                    __m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j),
-                                              _mm_loadu_ps(h2 + j));
-                    v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
-                    v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
-                    v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
-                }
-
-                double CV_DECL_ALIGNED(16) ar[2];
-                _mm_store_pd(ar, v_result);
-                result += ar[0] + ar[1];
+                v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
+                v_result += v_cvt_f64(v_src) + v_cvt_f64_high(v_src);
+            }
+            result += v_reduce_sum(v_result);
+#elif CV_SIMD
+            v_float32 v_result = vx_setzero_f32();
+            for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
+                v_result += v_src;
             }
-            #endif
+            result += v_reduce_sum(v_result);
+#endif
             for( ; j < len; j++ )
                 result += std::min(h1[j], h2[j]);
         }
         else if( method == CV_COMP_BHATTACHARYYA )
         {
-            #if CV_SSE2
-            if (haveSIMD)
+#if CV_SIMD_64F
+            v_float64 v_s1 = vx_setzero_f64();
+            v_float64 v_s2 = vx_setzero_f64();
+            v_float64 v_result = vx_setzero_f64();
+            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
             {
-                __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1;
-                for ( ; j <= len - 4; j += 4)
-                {
-                    __m128 v_a = _mm_loadu_ps(h1 + j);
-                    __m128 v_b = _mm_loadu_ps(h2 + j);
-
-                    __m128d v_ad = _mm_cvtps_pd(v_a);
-                    __m128d v_bd = _mm_cvtps_pd(v_b);
-                    v_s1 = _mm_add_pd(v_s1, v_ad);
-                    v_s2 = _mm_add_pd(v_s2, v_bd);
-                    v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
-
-                    v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
-                    v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
-                    v_s1 = _mm_add_pd(v_s1, v_ad);
-                    v_s2 = _mm_add_pd(v_s2, v_bd);
-                    v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
-                }
-
-                double CV_DECL_ALIGNED(16) ar[6];
-                _mm_store_pd(ar, v_s1);
-                _mm_store_pd(ar + 2, v_s2);
-                _mm_store_pd(ar + 4, v_result);
-                s1 += ar[0] + ar[1];
-                s2 += ar[2] + ar[3];
-                result += ar[4] + ar[5];
+                v_float32 v_a = vx_load(h1 + j);
+                v_float32 v_b = vx_load(h2 + j);
+
+                v_float64 v_ad = v_cvt_f64(v_a);
+                v_float64 v_bd = v_cvt_f64(v_b);
+                v_s1 += v_ad;
+                v_s2 += v_bd;
+                v_result += v_sqrt(v_ad * v_bd);
+
+                v_ad = v_cvt_f64_high(v_a);
+                v_bd = v_cvt_f64_high(v_b);
+                v_s1 += v_ad;
+                v_s2 += v_bd;
+                v_result += v_sqrt(v_ad * v_bd);
             }
-            #endif
+            s1 += v_reduce_sum(v_s1);
+            s2 += v_reduce_sum(v_s2);
+            result += v_reduce_sum(v_result);
+#elif CV_SIMD && 0 //Disable vectorization for CV_COMP_BHATTACHARYYA if f64 is unsupported due to low precision
+            v_float32 v_s1 = vx_setzero_f32();
+            v_float32 v_s2 = vx_setzero_f32();
+            v_float32 v_result = vx_setzero_f32();
+            for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            {
+                v_float32 v_a = vx_load(h1 + j);
+                v_float32 v_b = vx_load(h2 + j);
+                v_s1 += v_a;
+                v_s2 += v_b;
+                v_result += v_sqrt(v_a * v_b);
+            }
+            s1 += v_reduce_sum(v_s1);
+            s2 += v_reduce_sum(v_s2);
+            result += v_reduce_sum(v_result);
+#endif
             for( ; j < len; j++ )
             {
                 double a = h1[j];
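
For reference, the accumulate-then-reduce pattern that all three reworked branches share looks like this in isolation. This is a minimal sketch, not part of the patch: the function dot_f32_as_f64 is made up for illustration, and it assumes the 3.4/4.x-era universal-intrinsics API the patch targets (vx_load, v_muladd, v_float32::nlanes, and the v_reduce_sum(v_float64) overload added above).

#include "opencv2/core/hal/intrin.hpp"

// Double-precision dot product of two float arrays, written the same way as
// the CV_COMP_CORREL branch: widen each half of a v_float32 register to
// v_float64, multiply-accumulate in vector registers, do one horizontal
// reduction after the loop, and let a scalar tail handle leftover elements.
static double dot_f32_as_f64(const float* a, const float* b, int len)
{
    using namespace cv;
    int j = 0;
    double s = 0.;
#if CV_SIMD_64F
    v_float64 v_s = vx_setzero_f64();
    for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
    {
        v_float32 v_a = vx_load(a + j);
        v_float32 v_b = vx_load(b + j);
        v_s = v_muladd(v_cvt_f64(v_a), v_cvt_f64(v_b), v_s);           // lower lanes
        v_s = v_muladd(v_cvt_f64_high(v_a), v_cvt_f64_high(v_b), v_s); // upper lanes
    }
    s += v_reduce_sum(v_s); // v_float64 overload added by this patch
#endif
    for (; j < len; j++)    // scalar tail for the remainder
        s += (double)a[j] * b[j];
    return s;
}

The same shape explains the "#elif CV_SIMD && 0" branches: a single-precision fallback exists in the tree but is deliberately compiled out, because accumulating the CORREL and BHATTACHARYYA sums in f32 loses too much precision, so platforms without f64 SIMD fall through to the scalar loop instead.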
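The new compareHist benchmark runs through OpenCV's usual gtest-based perf framework; assuming a standard build layout, a filter such as ./bin/opencv_perf_imgproc --gtest_filter=*compareHist* selects it (the binary name and path depend on the build).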