\****************************************************************************************/
template<typename T, typename WT, typename MT>
-struct MomentsInTile_SSE
+struct MomentsInTile_SIMD
{
int operator() (const T *, int, WT &, WT &, WT &, MT &)
{
#if CV_SSE2
template <>
-struct MomentsInTile_SSE<uchar, int, int>
+struct MomentsInTile_SIMD<uchar, int, int>
{
- MomentsInTile_SSE()
+ MomentsInTile_SIMD()
{
useSIMD = checkHardwareSupport(CV_CPU_SSE2);
}
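+ // process 8 pixels per iteration; qx holds the 16-bit column indices
+ // and qx0..qx3 accumulate sum(p), sum(p*x), sum(p*x^2), sum(p*x^3)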
for( ; x <= len - 8; x += 8 )
{
__m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
- qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
- __m128i px = _mm_mullo_epi16(p, qx);
__m128i sx = _mm_mullo_epi16(qx, qx);
+
+ qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
- qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx));
+ qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(_mm_mullo_epi16(p, qx), sx));
qx = _mm_add_epi16(qx, dx);
}
- int CV_DECL_ALIGNED(16) buf[4];
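+ // collapse the four 32-bit lanes of each accumulator into a scalar row sum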
_mm_store_si128((__m128i*)buf, qx0);
x0 = buf[0] + buf[1] + buf[2] + buf[3];
_mm_store_si128((__m128i*)buf, qx1);
return x;
}
+ int CV_DECL_ALIGNED(16) buf[4];
bool useSIMD;
};
+#elif CV_NEON
+
+template <>
+struct MomentsInTile_SIMD<uchar, int, int>
+{
+ MomentsInTile_SIMD()
+ {
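+ // per-lane column offsets start at {0,1,2,3}; v_step advances them by 4 after each half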
+ ushort CV_DECL_ALIGNED(8) init[4] = { 0, 1, 2, 3 };
+ qx_init = vld1_u16(init);
+ v_step = vdup_n_u16(4);
+ }
+
+ int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
+ {
+ int x = 0;
+
+ uint32x4_t v_z = vdupq_n_u32(0), v_x0 = v_z, v_x1 = v_z,
+ v_x2 = v_z, v_x3 = v_z;
+ uint16x4_t qx = qx_init;
+
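+ // v_x0..v_x3 accumulate sum(p), sum(p*x), sum(p*x^2), sum(p*x^3);
+ // qx tracks the column indices of the current 4-pixel half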
+ for( ; x <= len - 8; x += 8 )
+ {
+ uint16x8_t v_src = vmovl_u8(vld1_u8(ptr + x));
+
+ // first part
+ uint32x4_t v_qx = vmovl_u16(qx);
+ uint16x4_t v_p = vget_low_u16(v_src);
+ uint32x4_t v_px = vmull_u16(qx, v_p);
+
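+ // v_px = p*x; multiplying by v_qx again yields p*x^2 and then p*x^3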
+ v_x0 = vaddw_u16(v_x0, v_p);
+ v_x1 = vaddq_u32(v_x1, v_px);
+ v_px = vmulq_u32(v_px, v_qx);
+ v_x2 = vaddq_u32(v_x2, v_px);
+ v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx));
+ qx = vadd_u16(qx, v_step);
+
+ // second part
+ v_qx = vmovl_u16(qx);
+ v_p = vget_high_u16(v_src);
+ v_px = vmull_u16(qx, v_p);
+
+ v_x0 = vaddw_u16(v_x0, v_p);
+ v_x1 = vaddq_u32(v_x1, v_px);
+ v_px = vmulq_u32(v_px, v_qx);
+ v_x2 = vaddq_u32(v_x2, v_px);
+ v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx));
+
+ qx = vadd_u16(qx, v_step);
+ }
+
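+ // reduce each 4-lane accumulator to a scalar sum for this row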
+ vst1q_u32(buf, v_x0);
+ x0 = buf[0] + buf[1] + buf[2] + buf[3];
+ vst1q_u32(buf, v_x1);
+ x1 = buf[0] + buf[1] + buf[2] + buf[3];
+ vst1q_u32(buf, v_x2);
+ x2 = buf[0] + buf[1] + buf[2] + buf[3];
+ vst1q_u32(buf, v_x3);
+ x3 = buf[0] + buf[1] + buf[2] + buf[3];
+
+ return x;
+ }
+
+ uint CV_DECL_ALIGNED(16) buf[4];
+ uint16x4_t qx_init, v_step;
+};
+
#endif
#if CV_SSE4_1
template <>
-struct MomentsInTile_SSE<ushort, int, int64>
+struct MomentsInTile_SIMD<ushort, int, int64>
{
- MomentsInTile_SSE()
+ MomentsInTile_SIMD()
{
useSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
}
v_ix1 = _mm_add_epi32(v_ix1, v_delta);
}
- int CV_DECL_ALIGNED(16) buf[4];
- int64 CV_DECL_ALIGNED(16) buf64[2];
-
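+ // collapse each vector accumulator into a scalar; the int64 result is reduced through buf64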
_mm_store_si128((__m128i*)buf, v_x0);
x0 = buf[0] + buf[1] + buf[2] + buf[3];
_mm_store_si128((__m128i*)buf, v_x1);
return x;
}
+ int CV_DECL_ALIGNED(16) buf[4];
+ int64 CV_DECL_ALIGNED(16) buf64[2];
bool useSIMD;
};
Size size = img.size();
int x, y;
MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
- MomentsInTile_SSE<T, WT, MT> vop;
+ MomentsInTile_SIMD<T, WT, MT> vop;
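+ // vectorized per-row accumulation; the primary template above is the generic (scalar) fallback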
for( y = 0; y < size.height; y++ )
{