SSE2 optimization of cv::integral
author     Ilya Lavrenov <ilya.lavrenov@itseez.com>
           Mon, 29 Dec 2014 12:38:49 +0000 (15:38 +0300)
committer  Ilya Lavrenov <ilya.lavrenov@itseez.com>
           Mon, 29 Dec 2014 12:51:55 +0000 (15:51 +0300)
modules/imgproc/src/sumpixels.cpp

index 0778bdc..cdef88f 100755
@@ -50,6 +50,99 @@ static IppStatus sts = ippInit();
 namespace cv
 {
 
+// generic fallback: returns false so integral_ takes the scalar path
+template <typename T, typename ST, typename QT>
+struct Integral_SIMD
+{
+    bool operator()(const T *, size_t,
+                    ST *, size_t,
+                    QT *, size_t,
+                    ST *, size_t,
+                    Size, int) const
+    {
+        return false;
+    }
+};
+
+#if CV_SSE2
+
+// uchar input, int sum: SSE2 row-wise prefix sums (sum image only)
+template <>
+struct Integral_SIMD<uchar, int, double>
+{
+    Integral_SIMD()
+    {
+        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    bool operator()(const uchar * src, size_t _srcstep,
+                    int * sum, size_t _sumstep,
+                    double * sqsum, size_t,
+                    int * tilted, size_t,
+                    Size size, int cn) const
+    {
+        if (sqsum || tilted || cn != 1 || !haveSSE2)
+            return false;
+
+        // the first row of the integral image is all zeros
+        memset(sum, 0, (size.width + 1) * sizeof(int));
+
+        // prev carries the running row sum across 8-pixel blocks
+        __m128i v_zero = _mm_setzero_si128(), prev = v_zero;
+        int j = 0;
+
+        // the remaining rows, each built from the row above
+        for (int i = 0; i < size.height; ++i)
+        {
+            const uchar * src_row = src + _srcstep * i;
+            int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1;
+            int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1;
+
+            sum_row[-1] = 0; // left border column of the integral image
+
+            prev = v_zero;
+            j = 0;
+
+            for ( ; j + 7 < size.width; j += 8)
+            {
+                // running sums of the row above (row i of the integral image)
+                __m128i vsuml = _mm_loadu_si128((const __m128i *)(prev_sum_row + j));
+                __m128i vsumh = _mm_loadu_si128((const __m128i *)(prev_sum_row + j + 4));
+
+                // load 8 pixels and build byte-shifted copies; summing the
+                // widened copies gives each lane a sum of up to 4 consecutive pixels
+                __m128i el8shr0 = _mm_loadl_epi64((const __m128i *)(src_row + j));
+                __m128i el8shr1 = _mm_slli_si128(el8shr0, 1);
+                __m128i el8shr2 = _mm_slli_si128(el8shr0, 2);
+                __m128i el8shr3 = _mm_slli_si128(el8shr0, 3);
+
+                // add the total of all previous 8-pixel blocks in this row
+                vsuml = _mm_add_epi32(vsuml, prev);
+                vsumh = _mm_add_epi32(vsumh, prev);
+
+                // widen to 16 bits and add the four shifted copies:
+                // el8 lane k = src_row[j+k-3] + ... + src_row[j+k]
+                // (terms before the block start are zero)
+                __m128i el8shr12 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr1, v_zero),
+                                                 _mm_unpacklo_epi8(el8shr2, v_zero));
+                __m128i el8shr03 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr0, v_zero),
+                                                 _mm_unpacklo_epi8(el8shr3, v_zero));
+                __m128i el8 = _mm_add_epi16(el8shr12, el8shr03);
+
+                // fold the halves: el4h lane m holds the running sum of
+                // pixels j .. j+m+4, i.e. the prefixes for lanes 4..7
+                __m128i el4h = _mm_add_epi16(_mm_unpackhi_epi16(el8, v_zero),
+                                             _mm_unpacklo_epi16(el8, v_zero));
+
+                // the low lanes of el8 already hold the prefixes of pixels 0..3
+                vsuml = _mm_add_epi32(vsuml, _mm_unpacklo_epi16(el8, v_zero));
+                vsumh = _mm_add_epi32(vsumh, el4h);
+
+                _mm_storeu_si128((__m128i *)(sum_row + j), vsuml);
+                _mm_storeu_si128((__m128i *)(sum_row + j + 4), vsumh);
+
+                // broadcast the 8-pixel block total (lane 3 of el4h) into the carry
+                prev = _mm_add_epi32(prev, _mm_shuffle_epi32(el4h, _MM_SHUFFLE(3, 3, 3, 3)));
+            }
+
+            // scalar tail: v recovers the running row sum at column j
+            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < size.width; ++j)
+                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
+        }
+
+        return true;
+    }
+
+    bool haveSSE2;
+};
+
+#endif
+
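The vector loop above is an in-register inclusive prefix sum over eight pixels: adding the three byte-shifted copies gives each 16-bit lane the sum of up to four consecutive pixels, the fold into el4h completes the running sums for lanes 4..7, and prev carries each block's total into the next iteration. For illustration only (this sketch is not part of the patch), the same scan can be written in scalar form with the classic doubling steps 1/2/4 instead of the 1/2/3 shifts plus a 4-lane fold used above:

    // Scalar sketch of an 8-element inclusive prefix sum via shift-and-add.
    // After the rounds with shift = 1, 2, 4, v[k] == src[0] + ... + src[k].
    static void prefix_sum_8(const unsigned char src[8], int v[8])
    {
        for (int k = 0; k < 8; ++k)
            v[k] = src[k];
        for (int shift = 1; shift < 8; shift <<= 1)
            for (int k = 7; k >= shift; --k) // descend so each round reads
                v[k] += v[k - shift];        // only pre-round values
    }

With the block's prefix sums in hand, each output lane only needs the carry prev and the row above added, matching the recurrence sum(i+1, j+1) = sum(i, j+1) + rowsum(i, 0..j).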
 template<typename T, typename ST, typename QT>
 void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
                 QT* sqsum, size_t _sqsumstep, ST* tilted, size_t _tiltedstep,
@@ -57,6 +150,13 @@ void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
 {
     int x, y, k;
 
+    if (Integral_SIMD<T, ST, QT>()(src, _srcstep,
+                                   sum, _sumstep,
+                                   sqsum, _sqsumstep,
+                                   tilted, _tiltedstep,
+                                   size, cn))
+        return;
+
     int srcstep = (int)(_srcstep/sizeof(T));
     int sumstep = (int)(_sumstep/sizeof(ST));
     int tiltedstep = (int)(_tiltedstep/sizeof(ST));
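For completeness, a minimal usage sketch (not part of the patch) of a call that should take the new branch: the specialization handles only 8-bit single-channel data, a 32-bit sum, and no sqsum or tilted outputs, i.e. the sum-only overload of cv::integral with sdepth = CV_32S (the double in Integral_SIMD<uchar, int, double> matches the default CV_64F square-sum depth for 8-bit input):

    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>

    int main()
    {
        // 8-bit single-channel input: the only layout the SSE2 branch accepts
        cv::Mat src(1080, 1920, CV_8UC1);
        cv::randu(src, cv::Scalar(0), cv::Scalar(256));

        cv::Mat sum;
        // only the sum image with CV_32S depth; requesting sqsum or a tilted
        // sum would make the functor return false and fall back to the
        // scalar integral_ loops
        cv::integral(src, sum, CV_32S);

        CV_Assert(sum.rows == src.rows + 1 && sum.cols == src.cols + 1);
        return 0;
    }

On hardware without SSE2, or for any other type combination, operator() returns false and the scalar code below runs unchanged.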