From: amatyuko Date: Tue, 31 Jul 2018 18:54:15 +0000 (+0300) Subject: Fix for SSE2 intrinsics problem in the part of saturation arithmetic processing durin... X-Git-Tag: submit/tizen/20200407.083853~1^2~544^2~55^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3ea2586a5a1749877e4fc865d29bdf31c3bff956;p=platform%2Fupstream%2Fopencv.git Fix for an SSE2 intrinsics problem in the saturation arithmetic used during 32s->16u packed conversion: for large negative values less than -INT_MAX+32767, the sign of the number is lost due to overflow, which leads to incorrect saturation to the MAX value instead of zero. The issue is not reproduced with CV_ENABLE_INTRINSICS=OFF. --- diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index d1f24d17b5..e58486fb5d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -494,7 +494,12 @@ void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) { __m128i delta32 = _mm_set1_epi32(32768); - __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); + + // preliminary saturate negative values to zero + __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0))); + __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0))); + + __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32)); return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 481b86b4f1..e5fd24dfad 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -453,9 +453,9 @@ struct Cvt_SIMD { v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth); v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3); - 
v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2); - v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4); - v_store(dst + x, v_pack(v_dst1, v_dst2)); + v_int16x8 v_dst1 = v_pack(v_src1, v_src2); + v_int16x8 v_dst2 = v_pack(v_src3, v_src4); + v_store(dst + x, v_pack_u(v_dst1, v_dst2)); } } return x;