From 6b849906209629f02de10bf7fc0d1fbe26f85b71 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Fri, 21 Sep 2018 16:21:40 +0300
Subject: [PATCH] integral() implementation updated to utilize wide universal intrinsics

---
 modules/imgproc/src/sumpixels.cpp | 133 +++++++++++++++++++++++++++-----------
 1 file changed, 97 insertions(+), 36 deletions(-)

diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp
index c09e085..3c49aaf 100755
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -43,6 +43,8 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
 
+#include "opencv2/core/hal/intrin.hpp"
+
 namespace cv
 {
 
@@ -60,15 +62,12 @@
     }
 };
 
-#if CV_SSE2
+#if CV_SIMD && CV_SIMD_WIDTH <= 64
 
 template <>
 struct Integral_SIMD<uchar, int, double>
 {
-    Integral_SIMD()
-    {
-        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-    }
+    Integral_SIMD() {}
 
     bool operator()(const uchar * src, size_t _srcstep,
                     int * sum, size_t _sumstep,
@@ -76,15 +75,12 @@
                     int * tilted, size_t,
                     int width, int height, int cn) const
     {
-        if (sqsum || tilted || cn != 1 || !haveSSE2)
+        if (sqsum || tilted || cn != 1)
            return false;
 
         // the first iteration
         memset(sum, 0, (width + 1) * sizeof(int));
 
-        __m128i v_zero = _mm_setzero_si128(), prev = v_zero;
-        int j = 0;
-
         // the others
         for (int i = 0; i < height; ++i)
         {
@@ -94,48 +90,113 @@
 
             sum_row[-1] = 0;
 
-            prev = v_zero;
-            j = 0;
-
-            for ( ; j + 7 < width; j += 8)
+            v_int32 prev = vx_setzero_s32();
+            int j = 0;
+            for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
             {
-                __m128i vsuml = _mm_loadu_si128((const __m128i *)(prev_sum_row + j));
-                __m128i vsumh = _mm_loadu_si128((const __m128i *)(prev_sum_row + j + 4));
+                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
+                v_int32 el4l, el4h;
+#if CV_AVX2
+                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
+                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
+                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
+                __m256i shmask = _mm256_set1_epi32(7);
+                el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val);
+                el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask));
+                prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask);
+#else
+                el8 += v_rotate_left<1>(el8);
+                el8 += v_rotate_left<2>(el8);
+#if CV_SIMD_WIDTH >= 32
+                el8 += v_rotate_left<4>(el8);
+#if CV_SIMD_WIDTH == 64
+                el8 += v_rotate_left<8>(el8);
+#endif
+#endif
+                v_expand(el8, el4l, el4h);
+                el4l += prev;
+                el4h += el4l;
+                prev = vx_setall_s32(v_rotate_right<v_int32::nlanes - 1>(el4h).get0());
+#endif
+                v_store(sum_row + j                  , el4l + vx_load(prev_sum_row + j                  ));
+                v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
+            }
 
-                __m128i el8shr0 = _mm_loadl_epi64((const __m128i *)(src_row + j));
-                __m128i el8shr1 = _mm_slli_si128(el8shr0, 1);
-                __m128i el8shr2 = _mm_slli_si128(el8shr0, 2);
-                __m128i el8shr3 = _mm_slli_si128(el8shr0, 3);
+            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
+                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
+        }
+        vx_cleanup();
+
+        return true;
+    }
+};
 
-                vsuml = _mm_add_epi32(vsuml, prev);
-                vsumh = _mm_add_epi32(vsumh, prev);
+template <>
+struct Integral_SIMD<uchar, float, double>
+{
+    Integral_SIMD() {}
 
-                __m128i el8shr12 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr1, v_zero),
-                                                 _mm_unpacklo_epi8(el8shr2, v_zero));
-                __m128i el8shr03 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr0, v_zero),
-                                                 _mm_unpacklo_epi8(el8shr3, v_zero));
-                __m128i el8 = _mm_add_epi16(el8shr12, el8shr03);
+    bool operator()(const uchar * src, size_t _srcstep,
+                    float * sum, size_t _sumstep,
+                    double * sqsum, size_t,
+                    float * tilted, size_t,
+                    int width, int height, int cn) const
+    {
+        if (sqsum || tilted || cn != 1)
+            return false;
 
-                __m128i el4h = _mm_add_epi16(_mm_unpackhi_epi16(el8, v_zero),
-                                             _mm_unpacklo_epi16(el8, v_zero));
+        // the first iteration
+        memset(sum, 0, (width + 1) * sizeof(int));
 
-                vsuml = _mm_add_epi32(vsuml, _mm_unpacklo_epi16(el8, v_zero));
-                vsumh = _mm_add_epi32(vsumh, el4h);
+        // the others
+        for (int i = 0; i < height; ++i)
+        {
+            const uchar * src_row = src + _srcstep * i;
+            float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + 1;
+            float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + 1;
 
-                _mm_storeu_si128((__m128i *)(sum_row + j), vsuml);
-                _mm_storeu_si128((__m128i *)(sum_row + j + 4), vsumh);
+            sum_row[-1] = 0;
 
-                prev = _mm_add_epi32(prev, _mm_shuffle_epi32(el4h, _MM_SHUFFLE(3, 3, 3, 3)));
+            v_float32 prev = vx_setzero_f32();
+            int j = 0;
+            for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+            {
+                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
+                v_float32 el4l, el4h;
+#if CV_AVX2
+                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
+                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
+                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
+                __m256i shmask = _mm256_set1_epi32(7);
+                el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val);
+                el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask));
+                prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask);
+#else
+                el8 += v_rotate_left<1>(el8);
+                el8 += v_rotate_left<2>(el8);
+#if CV_SIMD_WIDTH >= 32
+                el8 += v_rotate_left<4>(el8);
+#if CV_SIMD_WIDTH == 64
+                el8 += v_rotate_left<8>(el8);
+#endif
+#endif
+                v_int32 el4li, el4hi;
+                v_expand(el8, el4li, el4hi);
+                el4l = v_cvt_f32(el4li) + prev;
+                el4h = v_cvt_f32(el4hi) + el4l;
+                prev = vx_setall_f32(v_rotate_right<v_float32::nlanes - 1>(el4h).get0());
+#endif
+                v_store(sum_row + j                    , el4l + vx_load(prev_sum_row + j                    ));
+                v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
             }
 
-            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
+            for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
                 sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
         }
+        vx_cleanup();
 
         return true;
     }
-
-    bool haveSSE2;
 };
 
 #endif
-- 
2.7.4
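
Note, not part of the patch: the non-AVX2 branch above relies on an in-register prefix-sum idiom. A few v_rotate_left shift-and-add steps turn the low half of the 16-bit vector into prefix sums and the high half into fixed-width window sums; after widening, adding the low half to the high half lane-wise completes the row prefix sum, and the last lane is broadcast as the carry into the next block. The standalone sketch below reproduces that idiom in isolation so it can be inspected outside of integral(). It assumes an OpenCV build of the same era as this patch, with 128-bit universal intrinsics (CV_SIMD_WIDTH == 16, so v_int16 has 8 lanes) and the operator-based intrinsic API; the file name and main() are illustrative only.

// prefix_sum_sketch.cpp (illustrative name) -- not part of the patch.
#include <cstdio>
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

int main()
{
#if CV_SIMD && CV_SIMD_WIDTH == 16
    using namespace cv;
    const int N = 16;                 // two blocks of v_uint16::nlanes == 8
    uchar src[N];
    int   dst[N];
    for (int i = 0; i < N; ++i) src[i] = (uchar)(i + 1);

    v_int32 prev = vx_setzero_s32();  // running total carried between blocks
    for (int j = 0; j < N; j += v_uint16::nlanes)
    {
        // 8 uchar pixels widened to 8 x int16
        v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src + j));
        // Two shift-and-add steps: lanes 0..3 become prefix sums of the low
        // half, lanes 4..7 become sliding sums of the last 4 elements.
        el8 += v_rotate_left<1>(el8);
        el8 += v_rotate_left<2>(el8);
        v_int32 el4l, el4h;
        v_expand(el8, el4l, el4h);    // widen halves to 2 x (4 x int32)
        el4l += prev;                 // low half: add carry from previous block
        el4h += el4l;                 // high half: window sums + low prefixes
                                      //            = full prefix sums
        // broadcast the last lane as the carry for the next block
        prev = vx_setall_s32(v_rotate_right<v_int32::nlanes - 1>(el4h).get0());
        v_store(dst + j, el4l);
        v_store(dst + j + v_int32::nlanes, el4h);
    }
    vx_cleanup();

    // cross-check against a plain scalar running sum
    int s = 0;
    for (int j = 0; j < N; ++j)
    {
        s += src[j];
        printf("%2d: simd=%4d scalar=%4d%s\n", j, dst[j], s, dst[j] == s ? "" : "  MISMATCH");
    }
#endif
    return 0;
}

The same reasoning explains why the patch adds v_rotate_left<4> only for 32-byte vectors and v_rotate_left<8> only for 64-byte ones: each width needs window sums spanning exactly half of the 16-bit vector before the widened halves are combined.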