From: Vadim Pisarevsky Date: Tue, 13 Jun 2017 15:04:00 +0000 (+0300) Subject: added v_reduce_sum4() universal intrinsic; corrected number of threads in cv::getNumT... X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~944^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fbafc700ea9b2cf93bf2fb12edd6d8b207b0547e;p=platform%2Fupstream%2Fopencv.git added v_reduce_sum4() universal intrinsic; corrected number of threads in cv::getNumThreads() in the case of GCD --- diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index a5675e4..0a59e9b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -907,6 +907,27 @@ template inline typename V_TypeTraits<_Tp>::sum_type v_redu return c; } +/** @brief Sums all elements of each input vector, returns the vector of sums + + Scheme: + @code + result[0] = a[0] + a[1] + a[2] + a[3] + result[1] = b[0] + b[1] + b[2] + b[3] + result[2] = c[0] + c[1] + c[2] + c[3] + result[3] = d[0] + d[1] + d[2] + d[3] + @endcode +*/ +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + v_float32x4 r; + r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3]; + r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3]; + r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3]; + r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3]; + return r; +} + /** @brief Get negative values mask Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes. diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 6dd02bd..b126948 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -815,6 +815,21 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32) OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32) OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32) +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + float32x4x2_t ab = vtrnq_f32(a.val, b.val); + float32x4x2_t cd = vtrnq_f32(c.val, d.val); + + float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3 + float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3 + + float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1)); + float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1)); + + return v_float32x4(vaddq_f32(v0, v1)); +} + #define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \ inline v_uint32x4 v_popcount(const _Tpvec& a) \ { \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 364f5d7..3291921 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1126,6 +1126,14 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + __m128 ab = _mm_hadd_ps(a.val, b.val); + __m128 cd = _mm_hadd_ps(c.val, d.val); + return v_float32x4(_mm_hadd_ps(ab, cd)); +} + OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max) diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index ac5f2e5..3bbf028 100644 --- a/modules/core/src/parallel.cpp +++ b/modules/core/src/parallel.cpp @@ -425,7 +425,7 @@ int cv::getNumThreads(void) #elif defined HAVE_GCD - return 512; // the GCD thread pool limit + return cv::getNumberOfCPUs(); // the GCD thread pool limit #elif defined WINRT