From fd0ac962fbccce6f886d8c14e1a47dbe5b37d2a2 Mon Sep 17 00:00:00 2001
From: Sayed Adel
Date: Sat, 23 Dec 2017 08:59:24 +0200
Subject: [PATCH] core: replace raw intrinsics with universal intrinsics in
 copy.cpp

- use universal intrinsics instead of raw SSE4.2/NEON intrinsics
- add performance tests for Mat::copyTo/setTo with mask
---
 modules/core/perf/perf_mat.cpp | 43 ++++++++++++++++++-
 modules/core/src/copy.cpp      | 93 +++++++++++++++++++++---------------------
 2 files changed, 89 insertions(+), 47 deletions(-)

diff --git a/modules/core/perf/perf_mat.cpp b/modules/core/perf/perf_mat.cpp
index 7066c5b..fa87709 100644
--- a/modules/core/perf/perf_mat.cpp
+++ b/modules/core/perf/perf_mat.cpp
@@ -57,7 +57,7 @@ PERF_TEST_P(Size_MatType, Mat_Clone,
     Size size = get<0>(GetParam());
     int type = get<1>(GetParam());
     Mat source(size.height, size.width, type);
-    Mat destination(size.height, size.width, type);;
+    Mat destination(size.height, size.width, type);
 
     declare.in(source, WARMUP_RNG).out(destination);
 
@@ -97,6 +97,47 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
     SANITY_CHECK(destination, 1);
 }
 
+PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
+                             testing::Values(CV_8UC1, CV_8UC2))
+           )
+{
+    const Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+
+    Mat src(size, type), dst(size, type), mask(size, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(dst);
+
+    TEST_CYCLE()
+    {
+        src.copyTo(dst, mask);
+    }
+
+    SANITY_CHECK(dst);
+}
+
+PERF_TEST_P(Size_MatType, Mat_SetToWithMask,
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
+                             testing::Values(CV_8UC1, CV_8UC2))
+           )
+{
+    const Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+    const Scalar sc = Scalar::all(27);
+
+    Mat src(size, type), mask(size, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(src);
+
+    TEST_CYCLE()
+    {
+        src.setTo(sc, mask);
+    }
+
+    SANITY_CHECK(src);
+}
+
 ///////////// Transform ////////////////////////
 
 PERF_TEST_P(Size_MatType, Mat_Transform,
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 49c9cb7..554256d 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -90,28 +90,27 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep,
         const uchar* src = (const uchar*)_src;
         uchar* dst = (uchar*)_dst;
         int x = 0;
-        #if CV_SSE4_2
-        if(USE_SSE4_2)//
-        {
-            __m128i zero = _mm_setzero_si128 ();
-
-            for( ; x <= size.width - 16; x += 16 )
-            {
-                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
-                __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
-                __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
-                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-                _mm_storeu_si128((__m128i*)(dst + x), rDst);
-            }
-        }
-        #elif CV_NEON
-        uint8x16_t v_one = vdupq_n_u8(1);
-        for( ; x <= size.width - 16; x += 16 )
-        {
-            uint8x16_t v_mask = vcgeq_u8(vld1q_u8(mask + x), v_one);
-            uint8x16_t v_dst = vld1q_u8(dst + x), v_src = vld1q_u8(src + x);
-            vst1q_u8(dst + x, vbslq_u8(v_mask, v_src, v_dst));
+        #if CV_SIMD128
+        if( hasSIMD128()
+        #if CV_SSE4_2
+            && USE_SSE4_2
+        #endif
+          ) {
+            v_uint8x16 v_zero = v_setzero_u8();
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                v_uint8x16 v_src = v_load(src + x),
+                           v_dst = v_load(dst + x),
+                           v_nmask = v_load(mask + x) == v_zero;
+
+            #if CV_SSE4_2
+                v_dst = v_uint8x16(_mm_blendv_epi8(v_src.val, v_dst.val, v_nmask.val));
+            #else
+                v_dst = v_select(v_nmask, v_dst, v_src);
+            #endif
+                v_store(dst + x, v_dst);
+            }
         }
         #endif
         for( ; x < size.width; x++ )
@@ -130,31 +129,33 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep,
         const ushort* src = (const ushort*)_src;
         ushort* dst = (ushort*)_dst;
         int x = 0;
-        #if CV_SSE4_2
-        if(USE_SSE4_2)//
-        {
-            __m128i zero = _mm_setzero_si128 ();
-            for( ; x <= size.width - 8; x += 8 )
+        #if CV_SIMD128
+        if( hasSIMD128()
+        #if CV_SSE4_2
+            && USE_SSE4_2
+        #endif
+          ) {
+            v_uint8x16 v_zero = v_setzero_u8();
+
+            for( ; x <= size.width - 16; x += 16 )
             {
-                const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x));
-                __m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
-                _mask = _mm_unpacklo_epi8(_mask, _mask);
-                __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
-                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-                _mm_storeu_si128((__m128i*)(dst + x), rDst);
-            }
-        }
-        #elif CV_NEON
-        uint8x8_t v_one = vdup_n_u8(1);
-        for( ; x <= size.width - 8; x += 8 )
-        {
-            uint8x8_t v_mask = vcge_u8(vld1_u8(mask + x), v_one);
-            uint8x8x2_t v_mask2 = vzip_u8(v_mask, v_mask);
-            uint16x8_t v_mask_res = vreinterpretq_u16_u8(vcombine_u8(v_mask2.val[0], v_mask2.val[1]));
-
-            uint16x8_t v_src = vld1q_u16(src + x), v_dst = vld1q_u16(dst + x);
-            vst1q_u16(dst + x, vbslq_u16(v_mask_res, v_src, v_dst));
+                v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
+                           v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);
+
+                v_uint8x16 v_nmask1, v_nmask2;
+                v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
+                v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
+
+            #if CV_SSE4_2
+                v_dst1 = v_uint16x8(_mm_blendv_epi8(v_src1.val, v_dst1.val, v_nmask1.val));
+                v_dst2 = v_uint16x8(_mm_blendv_epi8(v_src2.val, v_dst2.val, v_nmask2.val));
+            #else
+                v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
+                v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
+            #endif
+                v_store(dst + x, v_dst1);
+                v_store(dst + x + 8, v_dst2);
+            }
         }
         #endif
         for( ; x < size.width; x++ )
--
2.7.4
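
Note (illustration, not part of the patch): the universal intrinsics used above
(v_load, v_setzero_u8, v_select, v_zip, v_store) come from OpenCV's
opencv2/core/hal/intrin.hpp and compile down to SSE, NEON, or plain scalar code
depending on the build target. Below is a minimal standalone sketch of the
masked-copy pattern this patch introduces, assuming an OpenCV 3.x build where
CV_SIMD128 is available; the function name copyWithMask8u is made up for this
example:

    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    // Copy src[i] into dst[i] wherever mask[i] != 0, 16 pixels per iteration.
    static void copyWithMask8u(const uchar* src, const uchar* mask,
                               uchar* dst, int width)
    {
        int x = 0;
    #if CV_SIMD128
        if( hasSIMD128() )
        {
            v_uint8x16 v_zero = v_setzero_u8();
            for( ; x <= width - 16; x += 16 )
            {
                v_uint8x16 v_src   = v_load(src + x),
                           v_dst   = v_load(dst + x),
                           v_nmask = v_load(mask + x) == v_zero; // all-ones lanes where mask == 0
                // keep the old dst value where the mask is zero, take src elsewhere
                v_store(dst + x, v_select(v_nmask, v_dst, v_src));
            }
        }
    #endif
        for( ; x < width; x++ )  // scalar tail for widths not divisible by 16
            if( mask[x] )
                dst[x] = src[x];
    }

On NEON, v_select maps to the bit-select instruction (vbslq_u8), matching the
removed raw branch; on x86 targets without SSE4.1 it falls back to bitwise
and/andnot/or selection, which is why the patch keeps a raw _mm_blendv_epi8
fast path under CV_SSE4_2.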