From: Tomoaki Teshima Date: Sat, 14 Oct 2017 15:24:31 +0000 (+0900) Subject: remove raw SSE2/NEON implementation from mathfuncs.cpp X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~497^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2a781bb6164e14086dcae7d392a69ac883478e13;p=platform%2Fupstream%2Fopencv.git remove raw SSE2/NEON implementation from mathfuncs.cpp * replace the implementation by universal intrinsic * make sure no degradation happens on ARM platform --- diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index a3dfbd1..be6e7de 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -605,24 +605,18 @@ void polarToCart( InputArray src1, InputArray src2, { k = 0; - #if CV_NEON - for( ; k <= len - 4; k += 4 ) - { - float32x4_t v_m = vld1q_f32(mag + k); - vst1q_f32(x + k, vmulq_f32(vld1q_f32(x + k), v_m)); - vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m)); - } - #elif CV_SSE2 - if (USE_SSE2) +#if CV_SIMD128 + if( hasSIMD128() ) { - for( ; k <= len - 4; k += 4 ) + int cWidth = v_float32x4::nlanes; + for( ; k <= len - cWidth; k += cWidth ) { - __m128 v_m = _mm_loadu_ps(mag + k); - _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m)); - _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m)); + v_float32x4 v_m = v_load(mag + k); + v_store(x + k, v_load(x + k) * v_m); + v_store(y + k, v_load(y + k) * v_m); } } - #endif +#endif for( ; k < len; k++ ) { @@ -1599,12 +1593,9 @@ void patchNaNs( InputOutputArray _a, double _val ) Cv32suf val; val.f = (float)_val; -#if CV_SSE2 - __m128i v_mask1 = _mm_set1_epi32(0x7fffffff), v_mask2 = _mm_set1_epi32(0x7f800000); - __m128i v_val = _mm_set1_epi32(val.i); -#elif CV_NEON - int32x4_t v_mask1 = vdupq_n_s32(0x7fffffff), v_mask2 = vdupq_n_s32(0x7f800000), - v_val = vdupq_n_s32(val.i); +#if CV_SIMD128 + v_int32x4 v_mask1 = v_setall_s32(0x7fffffff), v_mask2 = v_setall_s32(0x7f800000); + v_int32x4 v_val = v_setall_s32(val.i); #endif for( size_t i = 0; i < it.nplanes; i++, ++it ) @@ -1612,25 +1603,18 @@ void patchNaNs( InputOutputArray _a, double _val ) int* tptr = ptrs[0]; size_t j = 0; -#if CV_SSE2 - if (USE_SSE2) +#if CV_SIMD128 + if( hasSIMD128() ) { - for ( ; j + 4 <= len; j += 4) + size_t cWidth = (size_t)v_int32x4::nlanes; + for ( ; j + cWidth <= len; j += cWidth) { - __m128i v_src = _mm_loadu_si128((__m128i const *)(tptr + j)); - __m128i v_cmp_mask = _mm_cmplt_epi32(v_mask2, _mm_and_si128(v_src, v_mask1)); - __m128i v_res = _mm_or_si128(_mm_andnot_si128(v_cmp_mask, v_src), _mm_and_si128(v_cmp_mask, v_val)); - _mm_storeu_si128((__m128i *)(tptr + j), v_res); + v_int32x4 v_src = v_load(tptr + j); + v_int32x4 v_cmp_mask = v_mask2 < (v_src & v_mask1); + v_int32x4 v_dst = v_select(v_cmp_mask, v_val, v_src); + v_store(tptr + j, v_dst); } } -#elif CV_NEON - for ( ; j + 4 <= len; j += 4) - { - int32x4_t v_src = vld1q_s32(tptr + j); - uint32x4_t v_cmp_mask = vcltq_s32(v_mask2, vandq_s32(v_src, v_mask1)); - int32x4_t v_dst = vbslq_s32(v_cmp_mask, v_val, v_src); - vst1q_s32(tptr + j, v_dst); - } #endif for( ; j < len; j++ )