return c;
}
+/** @brief Horizontal sum of four vectors, packed into one result vector
+
+Lane i of the returned vector holds the sum of all four lanes of the i-th argument.
+
+Scheme:
+@code
+result[0] = a[0] + a[1] + a[2] + a[3]
+result[1] = b[0] + b[1] + b[2] + b[3]
+result[2] = c[0] + c[1] + c[2] + c[3]
+result[3] = d[0] + d[1] + d[2] + d[3]
+@endcode
+*/
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    // Walk the inputs in argument order; input k feeds output lane k.
+    const v_float32x4* const inputs[4] = { &a, &b, &c, &d };
+    v_float32x4 sums;
+    for (int k = 0; k < 4; k++)
+    {
+        const v_float32x4& v = *inputs[k];
+        // Left-to-right accumulation, lane 0 first.
+        sums.s[k] = v.s[0] + v.s[1] + v.s[2] + v.s[3];
+    }
+    return sums;
+}
+
/** @brief Get negative values mask
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
+// Horizontal sums of four vectors packed into one:
+// result = { sum(a), sum(b), sum(c), sum(d) }.
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    // Transpose lane pairs so that neighbouring lanes of one input land in
+    // the same lane position of two different registers.
+    const float32x4x2_t ab_trn = vtrnq_f32(a.val, b.val); // {a0 b0 a2 b2}, {a1 b1 a3 b3}
+    const float32x4x2_t cd_trn = vtrnq_f32(c.val, d.val); // {c0 d0 c2 d2}, {c1 d1 c3 d3}
+
+    // First reduction step: pairwise partial sums.
+    const float32x4_t ab_pairs = vaddq_f32(ab_trn.val[0], ab_trn.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
+    const float32x4_t cd_pairs = vaddq_f32(cd_trn.val[0], cd_trn.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
+
+    // Regroup the halves so each lane holds partial sums of a single input.
+    const float32x4_t first_halves = vcombine_f32(vget_low_f32(ab_pairs), vget_low_f32(cd_pairs));    // a0+a1 b0+b1 c0+c1 d0+d1
+    const float32x4_t second_halves = vcombine_f32(vget_high_f32(ab_pairs), vget_high_f32(cd_pairs)); // a2+a3 b2+b3 c2+c3 d2+d3
+
+    // Second reduction step: add the two partial sums of every input.
+    return v_float32x4(vaddq_f32(first_halves, second_halves));
+}
+
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
+// Horizontal sums of four vectors packed into one:
+// result = { sum(a), sum(b), sum(c), sum(d) }.
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+#if CV_SSE3
+    // _mm_hadd_ps is an SSE3 instruction, so it may only be used when SSE3 is enabled.
+    // NOTE(review): CV_SSE3 is assumed to be the project's SSE3 feature macro - confirm.
+    // If it is undefined the preprocessor evaluates it as 0 and the fallback below is used.
+    __m128 ab = _mm_hadd_ps(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
+    __m128 cd = _mm_hadd_ps(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
+    return v_float32x4(_mm_hadd_ps(ab, cd));
+#else
+    // Fallback without SSE3: two rounds of unpack + add.
+    // Elements are paired as (x0+x2)+(x1+x3) instead of (x0+x1)+(x2+x3),
+    // so the result may differ from the hadd path in the last rounding bit.
+    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val)); // a0+a2 c0+c2 a1+a3 c1+c3
+    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val)); // b0+b2 d0+d2 b1+b3 d1+d3
+    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
+#endif
+}
+
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
#elif defined HAVE_GCD
- return 512; // the GCD thread pool limit
+ return cv::getNumberOfCPUs(); // the GCD thread pool limit
#elif defined WINRT