From 00c9ab8c23f1c6a0d48bbf97d588de990ee9ed36 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Thu, 29 Nov 2018 19:34:14 +0300 Subject: [PATCH] Merge pull request #13317 from terfendail:norm_wintr * Added performance tests for hal::norm functions * Added sum of absolute differences intrinsic * norm implementation updated to use wide universal intrinsics * improve and fix v_reduce_sad on VSX --- .../core/include/opencv2/core/hal/intrin_avx.hpp | 35 ++++++ .../core/include/opencv2/core/hal/intrin_cpp.hpp | 15 +++ .../core/include/opencv2/core/hal/intrin_neon.hpp | 43 ++++++++ .../core/include/opencv2/core/hal/intrin_sse.hpp | 35 ++++++ .../core/include/opencv2/core/hal/intrin_vsx.hpp | 44 ++++++++ modules/core/perf/perf_norm.cpp | 49 +++++++++ modules/core/src/norm.cpp | 118 +++------------------ modules/ts/include/opencv2/ts.hpp | 2 + 8 files changed, 237 insertions(+), 104 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 3037704..af4efa2 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1133,6 +1133,41 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, return v_float32x8(_mm256_hadd_ps(ab, cd)); } +inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b) +{ + return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val)); +} +inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) +{ + __m256i half = _mm256_set1_epi8(0x7f); + return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half))); +} +inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_add_wrap(a - b, b - a), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) +{ + return v_reduce_sum(v_max(a, b) - v_min(a, b)); +} +inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 m = a < b; + return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); +} +inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) +{ + return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); +} + /** Popcount **/ #define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \ inline v_uint32x8 v_popcount(const _Tpvec& a) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index d1f00a2..65a01f3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1063,6 +1063,21 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, return r; } +/** @brief Sum absolute differences of values + +Scheme: +@code +{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...} +@endcode +For all types except 64-bit types.*/ +template inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]); + for (int i = 1; i < n; i++) + c += _absdiff(a.s[i], b.s[i]); + return c; +} + /** @brief Get negative values mask Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes. diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 2de4e45..1b35896 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -999,6 +999,49 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, return v_float32x4(vaddq_f32(v0, v1)); } +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val))); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val)))); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val)); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val))); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + uint32x4_t t0 = vabdq_u32(a.val, b.val); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val)); + uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); + return vget_lane_u32(vpadd_u32(t1, t1), 0); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + float32x4_t t0 = vabdq_f32(a.val, b.val); + float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0)); + return vget_lane_f32(vpadd_f32(t1, t1), 0); +} + #define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \ inline v_uint32x4 v_popcount(const _Tpvec& a) \ { \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 283c515..f9ac01d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1477,6 +1477,41 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val)); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + __m128i half = _mm_set1_epi8(0x7f); + return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half), + _mm_add_epi8(b.val, half))); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 l, h; + v_expand(v_absdiff(a, b), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + v_uint32x4 l, h; + v_expand(v_absdiff(a, b), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} + #define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \ inline v_uint32x4 v_popcount(const _Tpvec& a) \ { \ diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index fe4a5db..efea72c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -739,6 +739,50 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, return v_float32x4(vec_mergeh(ac, bd)); } +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + const vec_uint4 zero4 = vec_uint4_z; + vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4); + return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + const vec_int4 zero4 = vec_int4_z; + vec_char16 ad = vec_abss(vec_subs(a.val, b.val)); + vec_int4 sum4 = vec_sum4s(ad, zero4); + return (unsigned)vec_extract(vec_sums(sum4, zero4), 3); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + vec_ushort8 ad = vec_absd(a.val, b.val); + VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)), vec_int4_c(vec_unpacklu(ad))); + return (unsigned)vec_extract(sum, 3); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + const vec_int4 zero4 = vec_int4_z; + vec_short8 ad = vec_abss(vec_subs(a.val, b.val)); + vec_int4 sum4 = vec_sum4s(ad, zero4); + return (unsigned)vec_extract(vec_sums(sum4, zero4), 3); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + const vec_uint4 ad = vec_absd(a.val, b.val); + const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8)); + return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + vec_int4 ad = vec_abss(vec_sub(a.val, b.val)); + return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + const vec_float4 ad = vec_abs(vec_sub(a.val, b.val)); + const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8)); + return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0); +} + /** Popcount **/ template inline v_uint32x4 v_popcount(const _Tpvec& a) diff --git a/modules/core/perf/perf_norm.cpp b/modules/core/perf/perf_norm.cpp index 4e06736..07f989f 100644 --- a/modules/core/perf/perf_norm.cpp +++ b/modules/core/perf/perf_norm.cpp @@ -253,4 +253,53 @@ PERF_TEST_P( Size_MatType, normalize_minmax, TYPICAL_MATS ) SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); } +typedef TestBaseWithParam< int > test_len; +PERF_TEST_P(test_len, hal_normL1_u8, + testing::Values(300000, 2000000) + ) +{ + int len = GetParam(); + + Mat src1(1, len, CV_8UC1); + Mat src2(1, len, CV_8UC1); + + declare.in(src1, src2, WARMUP_RNG); + double n; + TEST_CYCLE() n = hal::normL1_(src1.ptr(0), src2.ptr(0), len); + CV_UNUSED(n); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(test_len, hal_normL1_f32, + testing::Values(300000, 2000000) + ) +{ + int len = GetParam(); + + Mat src1(1, len, CV_32FC1); + Mat src2(1, len, CV_32FC1); + + declare.in(src1, src2, WARMUP_RNG); + double n; + TEST_CYCLE() n = hal::normL1_(src1.ptr(0), src2.ptr(0), len); + CV_UNUSED(n); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(test_len, hal_normL2Sqr, + testing::Values(300000, 2000000) + ) +{ + int len = GetParam(); + + Mat src1(1, len, CV_32FC1); + Mat src2(1, len, CV_32FC1); + + declare.in(src1, src2, WARMUP_RNG); + double n; + TEST_CYCLE() n = hal::normL2Sqr_(src1.ptr(0), src2.ptr(0), len); + CV_UNUSED(n); + SANITY_CHECK_NOTHING(); +} + } // namespace diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp index b2ea8d4..fd28f81 100644 --- a/modules/core/src/norm.cpp +++ b/modules/core/src/norm.cpp @@ -98,43 +98,15 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize) float normL2Sqr_(const float* a, const float* b, int n) { int j = 0; float d = 0.f; -#if CV_AVX2 - float CV_DECL_ALIGNED(32) buf[8]; - __m256 d0 = _mm256_setzero_ps(); - - for( ; j <= n - 8; j += 8 ) +#if CV_SIMD + v_float32 v_d = vx_setzero_f32(); + for (; j <= n - v_float32::nlanes; j += v_float32::nlanes) { - __m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j)); -#if CV_FMA3 - d0 = _mm256_fmadd_ps(t0, t0, d0); -#else - d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0)); -#endif + v_float32 t = vx_load(a + j) - vx_load(b + j); + v_d = v_muladd(t, t, v_d); } - _mm256_store_ps(buf, d0); - d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]; -#elif CV_SSE - float CV_DECL_ALIGNED(16) buf[4]; - __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); - - for( ; j <= n - 8; j += 8 ) - { - __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); - __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); - d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0)); - d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1)); - } - _mm_store_ps(buf, _mm_add_ps(d0, d1)); - d = buf[0] + buf[1] + buf[2] + buf[3]; + d = v_reduce_sum(v_d); #endif - { - for( ; j <= n - 4; j += 4 ) - { - float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3]; - d += t0*t0 + t1*t1 + t2*t2 + t3*t3; - } - } - for( ; j < n; j++ ) { float t = a[j] - b[j]; @@ -147,38 +119,12 @@ float normL2Sqr_(const float* a, const float* b, int n) float normL1_(const float* a, const float* b, int n) { int j = 0; float d = 0.f; -#if CV_SSE - float CV_DECL_ALIGNED(16) buf[4]; - static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; - __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); - __m128 absmask = _mm_load_ps((const float*)absbuf); - - for( ; j <= n - 8; j += 8 ) - { - __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); - __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); - d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask)); - d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask)); - } - _mm_store_ps(buf, _mm_add_ps(d0, d1)); - d = buf[0] + buf[1] + buf[2] + buf[3]; -#elif CV_NEON - float32x4_t v_sum = vdupq_n_f32(0.0f); - for ( ; j <= n - 4; j += 4) - v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); - - float CV_DECL_ALIGNED(16) buf[4]; - vst1q_f32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; +#if CV_SIMD + v_float32 v_d = vx_setzero_f32(); + for (; j <= n - v_float32::nlanes; j += v_float32::nlanes) + v_d += v_absdiff(vx_load(a + j), vx_load(b + j)); + d = v_reduce_sum(v_d); #endif - { - for( ; j <= n - 4; j += 4 ) - { - d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + - std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); - } - } - for( ; j < n; j++ ) d += std::abs(a[j] - b[j]); return d; @@ -187,46 +133,10 @@ float normL1_(const float* a, const float* b, int n) int normL1_(const uchar* a, const uchar* b, int n) { int j = 0, d = 0; -#if CV_SSE - __m128i d0 = _mm_setzero_si128(); - - for( ; j <= n - 16; j += 16 ) - { - __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j)); - __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j)); - - d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); - } - - for( ; j <= n - 4; j += 4 ) - { - __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); - __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j)); - - d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); - } - d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); -#elif CV_NEON - uint32x4_t v_sum = vdupq_n_u32(0.0f); - for ( ; j <= n - 16; j += 16) - { - uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); - uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); - v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); - v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); - } - - uint CV_DECL_ALIGNED(16) buf[4]; - vst1q_u32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; +#if CV_SIMD + for (; j <= n - v_uint8::nlanes; j += v_uint8::nlanes) + d += v_reduce_sad(vx_load(a + j), vx_load(b + j)); #endif - { - for( ; j <= n - 4; j += 4 ) - { - d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + - std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); - } - } for( ; j < n; j++ ) d += std::abs(a[j] - b[j]); return d; diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp index da9cfca..3ccff0e 100644 --- a/modules/ts/include/opencv2/ts.hpp +++ b/modules/ts/include/opencv2/ts.hpp @@ -17,6 +17,8 @@ #include "opencv2/core/utils/trace.hpp" +#include "opencv2/core/hal/hal.hpp" + #include // for va_list #include "cvconfig.h" -- 2.7.4