From fd0ac962fbccce6f886d8c14e1a47dbe5b37d2a2 Mon Sep 17 00:00:00 2001
From: Sayed Adel
Date: Sat, 23 Dec 2017 08:59:24 +0200
Subject: [PATCH] core: replace raw intrinsics with universal intrinsics in
 copy.cpp

- use universal intrinsics instead of raw SSE4.2/NEON intrinsics
- add performance tests for Mat::copyTo/setTo with mask
---
 modules/core/perf/perf_mat.cpp | 43 ++++++++++++++++++-
 modules/core/src/copy.cpp      | 93 +++++++++++++++++++++---------------------
 2 files changed, 89 insertions(+), 47 deletions(-)

diff --git a/modules/core/perf/perf_mat.cpp b/modules/core/perf/perf_mat.cpp
index 7066c5b..fa87709 100644
--- a/modules/core/perf/perf_mat.cpp
+++ b/modules/core/perf/perf_mat.cpp
@@ -57,7 +57,7 @@ PERF_TEST_P(Size_MatType, Mat_Clone,
     Size size = get<0>(GetParam());
     int type = get<1>(GetParam());
     Mat source(size.height, size.width, type);
-    Mat destination(size.height, size.width, type);;
+    Mat destination(size.height, size.width, type);
 
     declare.in(source, WARMUP_RNG).out(destination);
 
@@ -97,6 +97,47 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
     SANITY_CHECK(destination, 1);
 }
 
+PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
+                             testing::Values(CV_8UC1, CV_8UC2))
+           )
+{
+    const Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+
+    Mat src(size, type), dst(size, type), mask(size, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(dst);
+
+    TEST_CYCLE()
+    {
+        src.copyTo(dst, mask);
+    }
+
+    SANITY_CHECK(dst);
+}
+
+PERF_TEST_P(Size_MatType, Mat_SetToWithMask,
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
+                             testing::Values(CV_8UC1, CV_8UC2))
+           )
+{
+    const Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+    const Scalar sc = Scalar::all(27);
+
+    Mat src(size, type), mask(size, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(src);
+
+    TEST_CYCLE()
+    {
+        src.setTo(sc, mask);
+    }
+
+    SANITY_CHECK(src);
+}
+
 ///////////// Transform ////////////////////////
 
 PERF_TEST_P(Size_MatType, Mat_Transform,
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 49c9cb7..554256d 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -90,28 +90,27 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep,
         const uchar* src = (const uchar*)_src;
         uchar* dst = (uchar*)_dst;
         int x = 0;
-        #if CV_SSE4_2
-        if(USE_SSE4_2)//
-        {
-            __m128i zero = _mm_setzero_si128 ();
-
-            for( ; x <= size.width - 16; x += 16 )
-            {
-                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
-                __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
-                __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
-                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-                _mm_storeu_si128((__m128i*)(dst + x), rDst);
-            }
-        }
-        #elif CV_NEON
-        uint8x16_t v_one = vdupq_n_u8(1);
-        for( ; x <= size.width - 16; x += 16 )
-        {
-            uint8x16_t v_mask = vcgeq_u8(vld1q_u8(mask + x), v_one);
-            uint8x16_t v_dst = vld1q_u8(dst + x), v_src = vld1q_u8(src + x);
-            vst1q_u8(dst + x, vbslq_u8(v_mask, v_src, v_dst));
+        #if CV_SIMD128
+        if( hasSIMD128()
+        #if CV_SSE4_2
+            && USE_SSE4_2
+        #endif
+          ) {
+            v_uint8x16 v_zero = v_setzero_u8();
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                v_uint8x16 v_src = v_load(src + x),
+                           v_dst = v_load(dst + x),
+                           v_nmask = v_load(mask + x) == v_zero;
+
+            #if CV_SSE4_2
+                v_dst = v_uint8x16(_mm_blendv_epi8(v_src.val, v_dst.val, v_nmask.val));
+            #else
+                v_dst = v_select(v_nmask, v_dst, v_src);
+            #endif
+                v_store(dst + x, v_dst);
+            }
         }
         #endif
         for( ; x < size.width; x++ )
@@ -130,31 +129,33 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep,
         const ushort* src = (const ushort*)_src;
         ushort* dst = (ushort*)_dst;
         int x = 0;
-        #if CV_SSE4_2
-        if(USE_SSE4_2)//
-        {
-            __m128i zero = _mm_setzero_si128 ();
-            for( ; x <= size.width - 8; x += 8 )
+        #if CV_SIMD128
+        if( hasSIMD128()
+        #if CV_SSE4_2
+            && USE_SSE4_2
+        #endif
+          ) {
+            v_uint8x16 v_zero = v_setzero_u8();
+
+            for( ; x <= size.width - 16; x += 16 )
             {
-                const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x));
-                __m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
-                _mask = _mm_unpacklo_epi8(_mask, _mask);
-                __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
-                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-                _mm_storeu_si128((__m128i*)(dst + x), rDst);
-            }
-        }
-        #elif CV_NEON
-        uint8x8_t v_one = vdup_n_u8(1);
-        for( ; x <= size.width - 8; x += 8 )
-        {
-            uint8x8_t v_mask = vcge_u8(vld1_u8(mask + x), v_one);
-            uint8x8x2_t v_mask2 = vzip_u8(v_mask, v_mask);
-            uint16x8_t v_mask_res = vreinterpretq_u16_u8(vcombine_u8(v_mask2.val[0], v_mask2.val[1]));
-
-            uint16x8_t v_src = vld1q_u16(src + x), v_dst = vld1q_u16(dst + x);
-            vst1q_u16(dst + x, vbslq_u16(v_mask_res, v_src, v_dst));
+                v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
+                           v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);
+
+                v_uint8x16 v_nmask1, v_nmask2;
+                v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
+                v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
+
+            #if CV_SSE4_2
+                v_dst1 = v_uint16x8(_mm_blendv_epi8(v_src1.val, v_dst1.val, v_nmask1.val));
+                v_dst2 = v_uint16x8(_mm_blendv_epi8(v_src2.val, v_dst2.val, v_nmask2.val));
+            #else
+                v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
+                v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
+            #endif
+                v_store(dst + x, v_dst1);
+                v_store(dst + x + 8, v_dst2);
+            }
         }
         #endif
         for( ; x < size.width; x++ )
--
2.7.4
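
Note (illustration, not part of the patch): the universal intrinsics used above
(v_load, v_setzero_u8, v_select, v_zip, v_store) come from OpenCV's
opencv2/core/hal/intrin.hpp and compile down to SSE, NEON, or plain scalar code
depending on the build target. Below is a minimal standalone sketch of the
masked-copy pattern this patch introduces, assuming an OpenCV 3.x build where
CV_SIMD128 is available; the function name copyWithMask8u is made up for this
example:

    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    // Copy src[i] into dst[i] wherever mask[i] != 0, 16 pixels per iteration.
    static void copyWithMask8u(const uchar* src, const uchar* mask,
                               uchar* dst, int width)
    {
        int x = 0;
    #if CV_SIMD128
        if( hasSIMD128() )
        {
            v_uint8x16 v_zero = v_setzero_u8();
            for( ; x <= width - 16; x += 16 )
            {
                v_uint8x16 v_src   = v_load(src + x),
                           v_dst   = v_load(dst + x),
                           v_nmask = v_load(mask + x) == v_zero; // all-ones lanes where mask == 0
                // keep the old dst value where the mask is zero, take src elsewhere
                v_store(dst + x, v_select(v_nmask, v_dst, v_src));
            }
        }
    #endif
        for( ; x < width; x++ )  // scalar tail for widths not divisible by 16
            if( mask[x] )
                dst[x] = src[x];
    }

On NEON, v_select maps to the bit-select instruction (vbslq_u8), matching the
removed raw branch; on x86 targets without SSE4.1 it falls back to bitwise
and/andnot/or selection, which is why the patch keeps a raw _mm_blendv_epi8
fast path under CV_SSE4_2.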