NEON impl on cv::convertScaleAba CV_32f

author Ilya Lavrenov <ilya.lavrenov@itseez.com>

Sun, 31 Aug 2014 10:54:12 +0000 (10:54 +0000)

committer Ilya Lavrenov <ilya.lavrenov@itseez.com>

Mon, 1 Sep 2014 17:04:36 +0000 (17:04 +0000)
author Ilya Lavrenov <ilya.lavrenov@itseez.com>
Sun, 31 Aug 2014 10:54:12 +0000 (10:54 +0000)
committer Ilya Lavrenov <ilya.lavrenov@itseez.com>
Mon, 1 Sep 2014 17:04:36 +0000 (17:04 +0000)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index ef88820..96b3f8e 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,7 +130,7 @@ OCV_OPTION(WITH_GSTREAMER      "Include Gstreamer support"                   ON
  OCV_OPTION(WITH_GSTREAMER_0_10 "Enable Gstreamer 0.10 support (instead of 1.x)"                              OFF )
  OCV_OPTION(WITH_GTK            "Include GTK support"                         ON   IF (UNIX AND NOT APPLE AND NOT ANDROID) )
  OCV_OPTION(WITH_GTK_2_X        "Use GTK version 2"                           OFF  IF (UNIX AND NOT APPLE AND NOT ANDROID) )
-OCV_OPTION(WITH_IPP            "Include Intel IPP support"                   ON   IF (NOT IOS) )
+OCV_OPTION(WITH_IPP            "Include Intel IPP support"                   ON   IF (X86_64 OR X86) )
  OCV_OPTION(WITH_JASPER         "Include JPEG2K support"                      ON   IF (NOT IOS) )
  OCV_OPTION(WITH_JPEG           "Include JPEG support"                        ON)
  OCV_OPTION(WITH_WEBP           "Include WebP support"                        ON   IF (NOT IOS) )
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp

index 49bfb7d..f835e1b 100644 (file)
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -173,7 +173,7 @@ split_( const T* src, T** dst, int len, int cn )
              int inc_j = 3 * inc_i;
  
              VSplit3<T> vsplit;
-            for( ; i < len - inc_i; i += inc_i, j += inc_j)
+            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                  vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
          }
  #endif
@@ -196,7 +196,7 @@ split_( const T* src, T** dst, int len, int cn )
              int inc_j = 4 * inc_i;
  
              VSplit4<T> vsplit;
-            for( ; i < len - inc_i; i += inc_i, j += inc_j)
+            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                  vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
          }
  #endif
@@ -1076,7 +1076,7 @@ namespace cv
  {
  
  template<typename T, typename DT, typename WT>
-struct cvtScaleAbs_SSE2
+struct cvtScaleAbs_SIMD
  {
      int operator () (const T *, DT *, int, WT, WT) const
      {
@@ -1087,7 +1087,7 @@ struct cvtScaleAbs_SSE2
  #if CV_SSE2
  
  template <>
-struct cvtScaleAbs_SSE2<uchar, uchar, float>
+struct cvtScaleAbs_SIMD<uchar, uchar, float>
  {
      int operator () (const uchar * src, uchar * dst, int width,
                       float scale, float shift) const
@@ -1124,7 +1124,7 @@ struct cvtScaleAbs_SSE2<uchar, uchar, float>
  };
  
  template <>
-struct cvtScaleAbs_SSE2<ushort, uchar, float>
+struct cvtScaleAbs_SIMD<ushort, uchar, float>
  {
      int operator () (const ushort * src, uchar * dst, int width,
                       float scale, float shift) const
@@ -1155,7 +1155,7 @@ struct cvtScaleAbs_SSE2<ushort, uchar, float>
  };
  
  template <>
-struct cvtScaleAbs_SSE2<short, uchar, float>
+struct cvtScaleAbs_SIMD<short, uchar, float>
  {
      int operator () (const short * src, uchar * dst, int width,
                       float scale, float shift) const
@@ -1186,7 +1186,7 @@ struct cvtScaleAbs_SSE2<short, uchar, float>
  };
  
  template <>
-struct cvtScaleAbs_SSE2<int, uchar, float>
+struct cvtScaleAbs_SIMD<int, uchar, float>
  {
      int operator () (const int * src, uchar * dst, int width,
                       float scale, float shift) const
@@ -1215,7 +1215,7 @@ struct cvtScaleAbs_SSE2<int, uchar, float>
  };
  
  template <>
-struct cvtScaleAbs_SSE2<float, uchar, float>
+struct cvtScaleAbs_SIMD<float, uchar, float>
  {
      int operator () (const float * src, uchar * dst, int width,
                       float scale, float shift) const
@@ -1242,6 +1242,35 @@ struct cvtScaleAbs_SSE2<float, uchar, float>
      }
  };
  
+#elif CV_NEON
+
+template <>
+struct cvtScaleAbs_SIMD<float, uchar, float>
+{
+    int operator () (const float * src, uchar * dst, int width,
+                     float scale, float shift) const
+    {
+        int x = 0;
+        float32x4_t v_shift = vdupq_n_f32(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale);
+            v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
+            uint16x4_t v_dsti_0 = vqmovun_s32(vcvtq_s32_f32(v_dst_0));
+
+            float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale);
+            v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
+            uint16x4_t v_dsti_1 = vqmovun_s32(vcvtq_s32_f32(v_dst_1));
+
+            uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
+            vst1_u8(dst + x, vqmovn_u16(v_dst));
+        }
+
+        return x;
+    }
+};
+
  #endif
  
  template<typename T, typename DT, typename WT> static void
@@ -1251,7 +1280,7 @@ cvtScaleAbs_( const T* src, size_t sstep,
  {
      sstep /= sizeof(src[0]);
      dstep /= sizeof(dst[0]);
-    cvtScaleAbs_SSE2<T, DT, WT> vop;
+    cvtScaleAbs_SIMD<T, DT, WT> vop;
  
      for( ; size.height--; src += sstep, dst += dstep )
      {
author	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Sun, 31 Aug 2014 10:54:12 +0000 (10:54 +0000)
committer	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Mon, 1 Sep 2014 17:04:36 +0000 (17:04 +0000)
CMakeLists.txt		patch \| blob \| history
modules/core/src/convert.cpp		patch \| blob \| history