From 00f16e917846c031fff893576c12879b852a770e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 22 Sep 2014 23:39:34 +0400
Subject: [PATCH] neon

---
 modules/core/src/convert.cpp   | 36 ++++++++++++++++++++++++++++++++++++
 modules/core/src/copy.cpp      | 19 +++++++++++++++++++
 modules/core/src/mathfuncs.cpp | 29 ++++++++++++++++++++++++++---
 3 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 0aecb69..1eeb0ec 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1541,6 +1541,20 @@ cvtScale_<short, short, float>( const short* src, size_t sstep,
                     _mm_storeu_si128((__m128i*)(dst + x), r0);
                 }
             }
+        #elif CV_NEON
+        float32x4_t v_shift = vdupq_n_f32(shift);
+        for(; x <= size.width - 8; x += 8 )
+        {
+            int16x8_t v_src = vld1q_s16(src + x);
+            float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
+            float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
+
+            v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
+            v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
+
+            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)),
+                                            vqmovn_s32(cv_vrndq_s32_f32(v_tmp2))));
+        }
         #endif
 
         for(; x < size.width; x++ )
@@ -1580,6 +1594,20 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
                     _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
                 }
             }
+        #elif CV_NEON
+        float32x4_t v_shift = vdupq_n_f32(shift);
+        for(; x <= size.width - 8; x += 8 )
+        {
+            int16x8_t v_src = vld1q_s16(src + x);
+            float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
+            float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
+
+            v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
+            v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
+
+            vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1));
+            vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2));
+        }
         #endif
 
         //We will wait Haswell
@@ -2134,6 +2162,14 @@ cvt_<float, short>( const float* src, size_t sstep,
             _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
         }
     }
+#elif CV_NEON
+    for( ; x <= size.width - 8; x += 8 )
+    {
+        float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4);
+        int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)),
+                                       vqmovn_s32(cv_vrndq_s32_f32(v_src2)));
+        vst1q_s16(dst + x, v_dst);
+    }
 #endif
     for( ; x < size.width; x++ )
         dst[x] = saturate_cast<short>(src[x]);
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 484a39a..7ffb8c2 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -107,6 +107,14 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
                 _mm_storeu_si128((__m128i*)(dst + x), rDst);
             }
         }
+        #elif CV_NEON
+        uint8x16_t v_zero = vdupq_n_u8(0);
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            uint8x16_t v_mask = vcgtq_u8(vld1q_u8(mask + x), v_zero);
+            uint8x16_t v_dst = vld1q_u8(dst + x), v_src = vld1q_u8(src + x);
+            vst1q_u8(dst + x, vbslq_u8(v_mask, v_src, v_dst));
+        }
         #endif
         for( ; x < size.width; x++ )
             if( mask[x] )
@@ -143,6 +151,17 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
                 _mm_storeu_si128((__m128i*)(dst + x), rDst);
             }
         }
+        #elif CV_NEON
+        uint8x8_t v_zero = vdup_n_u8(0);
+        for( ; x <= size.width - 8; x += 8 )
+        {
+            uint8x8_t v_mask = vcgt_u8(vld1_u8(mask + x), v_zero);
+            uint8x8x2_t v_mask2 = vzip_u8(v_mask, v_mask);
+            uint16x8_t v_mask_res = vreinterpretq_u16_u8(vcombine_u8(v_mask2.val[0], v_mask2.val[1]));
+
+            uint16x8_t v_src = vld1q_u16(src + x), v_dst = vld1q_u16(dst + x);
+            vst1q_u16(dst + x, vbslq_u16(v_mask_res, v_src, v_dst));
+        }
         #endif
         for( ; x < size.width; x++ )
             if( mask[x] )
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 45161f9..e240e57 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -261,6 +261,19 @@ static void Magnitude_32f(const float* x, const float* y, float* mag, int len)
             _mm_storeu_ps(mag + i, x0); _mm_storeu_ps(mag + i + 4, x1);
         }
     }
+#elif CV_NEON
+    float CV_DECL_ALIGNED(16) m[4];
+
+    for( ; i <= len - 4; i += 4 )
+    {
+        float32x4_t v_x = vld1q_f32(x + i), v_y = vld1q_f32(y + i);
+        vst1q_f32(m, vaddq_f32(vmulq_f32(v_x, v_x), vmulq_f32(v_y, v_y)));
+
+        mag[i] = std::sqrt(m[0]);
+        mag[i+1] = std::sqrt(m[1]);
+        mag[i+2] = std::sqrt(m[2]);
+        mag[i+3] = std::sqrt(m[3]);
+    }
 #endif
 
     for( ; i < len; i++ )
@@ -2554,12 +2567,14 @@ void patchNaNs( InputOutputArray _a, double _val )
     NAryMatIterator it(arrays, (uchar**)ptrs);
     size_t len = it.size*a.channels();
     Cv32suf val;
-    float fval = (float)_val;
-    val.f = fval;
+    val.f = (float)_val;
 
 #if CV_SSE2
     __m128i v_mask1 = _mm_set1_epi32(0x7fffffff), v_mask2 = _mm_set1_epi32(0x7f800000);
     __m128i v_val = _mm_set1_epi32(val.i);
+#elif CV_NEON
+    int32x4_t v_mask1 = vdupq_n_s32(0x7fffffff), v_mask2 = vdupq_n_s32(0x7f800000),
+        v_val = vdupq_n_s32(val.i);
 #endif
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -2570,7 +2585,7 @@ void patchNaNs( InputOutputArray _a, double _val )
 #if CV_SSE2
         if (USE_SSE2)
         {
-            for ( ; j < len; j += 4)
+            for ( ; j + 4 <= len; j += 4)
             {
                 __m128i v_src = _mm_loadu_si128((__m128i const *)(tptr + j));
                 __m128i v_cmp_mask = _mm_cmplt_epi32(v_mask2, _mm_and_si128(v_src, v_mask1));
@@ -2578,6 +2593,14 @@ void patchNaNs( InputOutputArray _a, double _val )
                 _mm_storeu_si128((__m128i *)(tptr + j), v_res);
             }
         }
+#elif CV_NEON
+        for ( ; j + 4 <= len; j += 4)
+        {
+            int32x4_t v_src = vld1q_s32(tptr + j);
+            uint32x4_t v_cmp_mask = vcltq_s32(v_mask2, vandq_s32(v_src, v_mask1));
+            int32x4_t v_dst = vbslq_s32(v_cmp_mask, v_val, v_src);
+            vst1q_s32(tptr + j, v_dst);
+        }
 #endif
 
         for( ; j < len; j++ )
-- 
2.7.4