From 8fdbdb131d3b2bb6de6d0bda1808a06a5f5114b8 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Wed, 2 Jul 2014 01:32:45 +0400
Subject: [PATCH] SSE4.1 optimiation of cv::Moments CV_16U

---
 modules/imgproc/src/moments.cpp | 161 +++++++++++++++++++++++++---------------
 1 file changed, 103 insertions(+), 58 deletions(-)
diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp
index 61fff29..a61002a 100644
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@@ -203,69 +203,27 @@ static Moments contourMoments( const Mat& contour )
 \****************************************************************************************/
 
 template<typename T, typename WT, typename MT>
-#if defined __GNUC__ && __GNUC__ == 4 && __GNUC_MINOR__ >= 5 && __GNUC_MINOR__ < 9
-// Workaround for http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60196
-__attribute__((optimize("no-tree-vectorize")))
-#endif
-static void momentsInTile( const Mat& img, double* moments )
+struct MomentsInTile_SSE
 {
-    Size size = img.size();
-    int x, y;
-    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
-
-    for( y = 0; y < size.height; y++ )
+    int operator() (const T *, int, WT &, WT &, WT &, MT &)
     {
-        const T* ptr = (const T*)(img.data + y*img.step);
-        WT x0 = 0, x1 = 0, x2 = 0;
-        MT x3 = 0;
-
-        for( x = 0; x < size.width; x++ )
-        {
-            WT p = ptr[x];
-            WT xp = x * p, xxp;
-
-            x0 += p;
-            x1 += xp;
-            xxp = xp * x;
-            x2 += xxp;
-            x3 += xxp * x;
-        }
-
-        WT py = y * x0, sy = y*y;
-
-        mom[9] += ((MT)py) * sy;  // m03
-        mom[8] += ((MT)x1) * sy;  // m12
-        mom[7] += ((MT)x2) * y;  // m21
-        mom[6] += x3;             // m30
-        mom[5] += x0 * sy;        // m02
-        mom[4] += x1 * y;         // m11
-        mom[3] += x2;             // m20
-        mom[2] += py;             // m01
-        mom[1] += x1;             // m10
-        mom[0] += x0;             // m00
+        return 0;
     }
-
-    for( x = 0; x < 10; x++ )
-        moments[x] = (double)mom[x];
-}
-
+};
 
 #if CV_SSE2
 
-template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments )
+template <>
+struct MomentsInTile_SSE<uchar, int, int>
 {
-    typedef uchar T;
-    typedef int WT;
-    typedef int MT;
-    Size size = img.size();
-    int y;
-    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
-    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    MomentsInTile_SSE()
+    {
+        useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    }
 
-    for( y = 0; y < size.height; y++ )
+    int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
     {
-        const T* ptr = img.ptr<T>(y);
-        int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0;
+        int x = 0;
 
         if( useSIMD )
         {
@@ -273,7 +231,7 @@ template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* mome
             __m128i dx = _mm_set1_epi16(8);
             __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;
 
-            for( ; x <= size.width - 8; x += 8 )
+            for( ; x <= len - 8; x += 8 )
             {
                 __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                 qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
@@ -285,6 +243,7 @@ template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* mome
 
                 qx = _mm_add_epi16(qx, dx);
             }
+
             int CV_DECL_ALIGNED(16) buf[4];
             _mm_store_si128((__m128i*)buf, qx0);
             x0 = buf[0] + buf[1] + buf[2] + buf[3];
@@ -296,6 +255,94 @@ template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* mome
             x3 = buf[0] + buf[1] + buf[2] + buf[3];
         }
 
+        return x;
+    }
+
+    bool useSIMD;
+};
+
+#endif
+
+#if CV_SSE4_1
+
+template <>
+struct MomentsInTile_SSE<ushort, int, int64>
+{
+    MomentsInTile_SSE()
+    {
+        useSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator() (const ushort * ptr, int len, int & x0, int & x1, int & x2, int64 & x3)
+    {
+        int x = 0;
+
+        if (useSIMD)
+        {
+            __m128i vx_init0 = _mm_setr_epi32(0, 1, 2, 3), vx_init1 = _mm_setr_epi32(4, 5, 6, 7),
+                v_delta = _mm_set1_epi32(8), v_zero = _mm_setzero_si128(), v_x0 = v_zero,
+                v_x1 = v_zero, v_x2 = v_zero, v_x3 = v_zero, v_ix0 = vx_init0, v_ix1 = vx_init1;
+
+            for( ; x <= len - 8; x += 8 )
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i *)(ptr + x));
+                __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_zero), v_src1 = _mm_unpackhi_epi16(v_src, v_zero);
+
+                v_x0 = _mm_add_epi32(v_x0, _mm_add_epi32(v_src0, v_src1));
+                __m128i v_x1_0 = _mm_mullo_epi32(v_src0, v_ix0), v_x1_1 = _mm_mullo_epi32(v_src1, v_ix1);
+                v_x1 = _mm_add_epi32(v_x1, _mm_add_epi32(v_x1_0, v_x1_1));
+
+                __m128i v_2ix0 = _mm_mullo_epi32(v_ix0, v_ix0), v_2ix1 = _mm_mullo_epi32(v_ix1, v_ix1);
+                v_x2 = _mm_add_epi32(v_x2, _mm_add_epi32(_mm_mullo_epi32(v_2ix0, v_src0), _mm_mullo_epi32(v_2ix1, v_src1)));
+
+                __m128i t = _mm_add_epi32(_mm_mullo_epi32(v_2ix0, v_x1_0), _mm_mullo_epi32(v_2ix1, v_x1_1));
+                v_x3 = _mm_add_epi64(v_x3, _mm_add_epi64(_mm_unpacklo_epi32(t, v_zero), _mm_unpackhi_epi32(t, v_zero)));
+
+                v_ix0 = _mm_add_epi32(v_ix0, v_delta);
+                v_ix1 = _mm_add_epi32(v_ix1, v_delta);
+            }
+
+            int CV_DECL_ALIGNED(16) buf[4];
+            int64 CV_DECL_ALIGNED(16) buf64[2];
+
+            _mm_store_si128((__m128i*)buf, v_x0);
+            x0 = buf[0] + buf[1] + buf[2] + buf[3];
+            _mm_store_si128((__m128i*)buf, v_x1);
+            x1 = buf[0] + buf[1] + buf[2] + buf[3];
+            _mm_store_si128((__m128i*)buf, v_x2);
+            x2 = buf[0] + buf[1] + buf[2] + buf[3];
+
+            _mm_store_si128((__m128i*)buf64, v_x3);
+            x3 = buf64[0] + buf64[1];
+        }
+
+        return x;
+    }
+
+    bool useSIMD;
+};
+
+#endif
+
+template<typename T, typename WT, typename MT>
+#if defined __GNUC__ && __GNUC__ == 4 && __GNUC_MINOR__ >= 5 && __GNUC_MINOR__ < 9
+// Workaround for http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60196
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+static void momentsInTile( const Mat& img, double* moments )
+{
+    Size size = img.size();
+    int x, y;
+    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
+    MomentsInTile_SSE<T, WT, MT> vop;
+
+    for( y = 0; y < size.height; y++ )
+    {
+        const T* ptr = (const T*)(img.data + y*img.step);
+        WT x0 = 0, x1 = 0, x2 = 0;
+        MT x3 = 0;
+        x = vop(ptr, size.width, x0, x1, x2, x3);
+
         for( ; x < size.width; x++ )
         {
             WT p = ptr[x];
@@ -322,12 +369,10 @@ template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* mome
         mom[0] += x0;             // m00
     }
 
-    for(int x = 0; x < 10; x++ )
+    for( x = 0; x < 10; x++ )
         moments[x] = (double)mom[x];
 }
 
-#endif
-
 typedef void (*MomentsInTileFunc)(const Mat& img, double* moments);
 
 Moments::Moments()
-- 
2.7.4