From: Ilya Lavrenov
Date: Mon, 29 Dec 2014 12:38:49 +0000 (+0300)
Subject: SSE2 optimization of cv::integral
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~2757^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=9436103ed674851cfe842e90cb388ad72a12d573;p=platform%2Fupstream%2Fopencv.git

SSE2 optimization of cv::integral
---

diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp
index 0778bdc..cdef88f 100755
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -50,6 +50,99 @@ static IppStatus sts = ippInit();
 namespace cv
 {
 
+template <typename T, typename ST, typename QT>
+struct Integral_SIMD
+{
+    bool operator()(const T *, size_t,
+                    ST *, size_t,
+                    QT *, size_t,
+                    ST *, size_t,
+                    Size, int) const
+    {
+        return false;
+    }
+};
+
+#if CV_SSE2
+
+template <>
+struct Integral_SIMD<uchar, int, double>
+{
+    Integral_SIMD()
+    {
+        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    bool operator()(const uchar * src, size_t _srcstep,
+                    int * sum, size_t _sumstep,
+                    double * sqsum, size_t,
+                    int * tilted, size_t,
+                    Size size, int cn) const
+    {
+        if (sqsum || tilted || cn != 1 || !haveSSE2)
+            return false;
+
+        // the first iteration
+        memset(sum, 0, (size.width + 1) * sizeof(int));
+
+        __m128i v_zero = _mm_setzero_si128(), prev = v_zero;
+        int j = 0;
+
+        // the others
+        for (int i = 0; i < size.height; ++i)
+        {
+            const uchar * src_row = src + _srcstep * i;
+            int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1;
+            int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1;
+
+            sum_row[-1] = 0;
+
+            prev = v_zero;
+            j = 0;
+
+            for ( ; j + 7 < size.width; j += 8)
+            {
+                __m128i vsuml = _mm_loadu_si128((const __m128i *)(prev_sum_row + j));
+                __m128i vsumh = _mm_loadu_si128((const __m128i *)(prev_sum_row + j + 4));
+
+                __m128i el8shr0 = _mm_loadl_epi64((const __m128i *)(src_row + j));
+                __m128i el8shr1 = _mm_slli_si128(el8shr0, 1);
+                __m128i el8shr2 = _mm_slli_si128(el8shr0, 2);
+                __m128i el8shr3 = _mm_slli_si128(el8shr0, 3);
+
+                vsuml = _mm_add_epi32(vsuml, prev);
+                vsumh = _mm_add_epi32(vsumh, prev);
+
+                __m128i el8shr12 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr1, v_zero),
+                                                 _mm_unpacklo_epi8(el8shr2, v_zero));
+                __m128i el8shr03 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr0, v_zero),
+                                                 _mm_unpacklo_epi8(el8shr3, v_zero));
+                __m128i el8 = _mm_add_epi16(el8shr12, el8shr03);
+
+                __m128i el4h = _mm_add_epi16(_mm_unpackhi_epi16(el8, v_zero),
+                                             _mm_unpacklo_epi16(el8, v_zero));
+
+                vsuml = _mm_add_epi32(vsuml, _mm_unpacklo_epi16(el8, v_zero));
+                vsumh = _mm_add_epi32(vsumh, el4h);
+
+                _mm_storeu_si128((__m128i *)(sum_row + j), vsuml);
+                _mm_storeu_si128((__m128i *)(sum_row + j + 4), vsumh);
+
+                prev = _mm_add_epi32(prev, _mm_shuffle_epi32(el4h, _MM_SHUFFLE(3, 3, 3, 3)));
+            }
+
+            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < size.width; ++j)
+                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
+        }
+
+        return true;
+    }
+
+    bool haveSSE2;
+};
+
+#endif
+
 template<typename T, typename ST, typename QT>
 void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
                 QT* sqsum, size_t _sqsumstep, ST* tilted, size_t _tiltedstep,
@@ -57,6 +150,13 @@ void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
 {
     int x, y, k;
 
+    if (Integral_SIMD<T, ST, QT>()(src, _srcstep,
+                                   sum, _sumstep,
+                                   sqsum, _sqsumstep,
+                                   tilted, _tiltedstep,
+                                   size, cn))
+        return;
+
    int srcstep = (int)(_srcstep/sizeof(T));
    int sumstep = (int)(_sumstep/sizeof(ST));
    int tiltedstep = (int)(_tiltedstep/sizeof(ST));
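
Note: the vector loop above computes an in-register prefix sum of 8 pixels with the shift-and-add scheme: adding byte-shifted copies of the loaded vector produces 4-wide sliding sums (el8), and adding the low 16-bit half (which already holds the prefixes of the first four pixels) to the high half turns the remaining window sums into full prefixes (el4h). The following standalone sketch, which is not part of the commit, reproduces that idea and checks it against a scalar loop; it assumes an SSE2-capable x86 toolchain (e.g. g++ -msse2) and is an illustration, not OpenCV code.

// Sketch of the in-register prefix-sum trick used by the patch. Assumption:
// built with SSE2 enabled on x86. Illustration only.
#include <emmintrin.h>
#include <cstdio>

int main()
{
    const unsigned char x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    const __m128i v_zero = _mm_setzero_si128();

    // Load 8 pixels and make byte-shifted copies: s1[i] = x[i-1], s2[i] = x[i-2],
    // s3[i] = x[i-3] (zeros shift in at the low end).
    __m128i s0 = _mm_loadl_epi64((const __m128i *)x);
    __m128i s1 = _mm_slli_si128(s0, 1);
    __m128i s2 = _mm_slli_si128(s0, 2);
    __m128i s3 = _mm_slli_si128(s0, 3);

    // Widen to 16-bit and add the four copies:
    // el8[i] = x[i] + x[i-1] + x[i-2] + x[i-3], a 4-wide sliding sum;
    // for i < 4 it is already the prefix sum of x[0..i].
    __m128i el8 = _mm_add_epi16(
        _mm_add_epi16(_mm_unpacklo_epi8(s0, v_zero), _mm_unpacklo_epi8(s1, v_zero)),
        _mm_add_epi16(_mm_unpacklo_epi8(s2, v_zero), _mm_unpacklo_epi8(s3, v_zero)));

    // Widen to 32-bit. Adding the low half (prefixes of x[0..3]) to the high
    // half (windows x[i+1..i+4]) extends those windows into full prefixes.
    __m128i lo = _mm_unpacklo_epi16(el8, v_zero);                    // prefixes 0..3
    __m128i hi = _mm_add_epi32(_mm_unpackhi_epi16(el8, v_zero), lo); // prefixes 4..7

    int out[8];
    _mm_storeu_si128((__m128i *)out, lo);
    _mm_storeu_si128((__m128i *)(out + 4), hi);

    // Check against a scalar prefix sum.
    int ref = 0;
    for (int i = 0; i < 8; ++i)
    {
        ref += x[i];
        std::printf("prefix[%d]: simd=%d scalar=%d\n", i, out[i], ref);
    }
    return 0;
}

The sketch uses _mm_add_epi32 for the final combine; the patch uses _mm_add_epi16 there, which is equally safe because the sum of 8 bytes is at most 8 * 255 = 2040, so no carry can cross a 16-bit lane boundary once the values sit in zero-extended 32-bit lanes.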