From 66842f5a18cdf3b0d2118408f0700538bbb79d05 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Wed, 18 Sep 2019 17:45:34 +0300 Subject: [PATCH] Extended v_check_any/v_check_all universal intrinsics to support 64-bit integer --- .../core/include/opencv2/core/hal/intrin_avx.hpp | 56 +++++++++------------- .../core/include/opencv2/core/hal/intrin_cpp.hpp | 6 +-- .../core/include/opencv2/core/hal/intrin_neon.hpp | 20 +++++--- .../core/include/opencv2/core/hal/intrin_sse.hpp | 44 ++++++++--------- .../core/include/opencv2/core/hal/intrin_vsx.hpp | 4 ++ 5 files changed, 62 insertions(+), 68 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 74db46b..c821dc3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1241,6 +1241,11 @@ inline int v_signmask(const v_int32x8& a) inline int v_signmask(const v_uint32x8& a) { return v_signmask(v_reinterpret_as_f32(a)); } +inline int v_signmask(const v_int64x4& a) +{ return v_signmask(v_reinterpret_as_f64(a)); } +inline int v_signmask(const v_uint64x4& a) +{ return v_signmask(v_reinterpret_as_f64(a)); } + inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } @@ -1253,40 +1258,23 @@ inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signma inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } /** Checks **/ -#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \ - inline bool v_check_all(const _Tpvec& a) \ - { \ - int mask = v_signmask(v_reinterpret_as_s8(a)); \ - return and_op(mask, allmask) == allmask; \ - } \ - inline bool v_check_any(const _Tpvec& a) \ - { \ - int mask = v_signmask(v_reinterpret_as_s8(a)); \ - return and_op(mask, allmask) != 0; \ - } - -OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, OPENCV_HAL_1ST, -1) -OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, OPENCV_HAL_1ST, -1) -OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaaaaaa) -OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16, OPENCV_HAL_AND, (int)0xaaaaaaaa) -OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, OPENCV_HAL_AND, (int)0x88888888) -OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, OPENCV_HAL_AND, (int)0x88888888) - -#define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \ - inline bool v_check_all(const _Tpvec& a) \ - { \ - int mask = v_signmask(a); \ - return mask == allmask; \ - } \ - inline bool v_check_any(const _Tpvec& a) \ - { \ - int mask = v_signmask(a); \ - return mask != 0; \ - } - -OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255) -OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15) - +#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \ + inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; } +OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1) +OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1) +OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255) +OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255) +OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15) +OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15) +OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255) +OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15) + +#define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec) \ + inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \ + inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; } +OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16) +OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16) ////////// Other math ///////// diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index d494761..75c39ac 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1080,7 +1080,7 @@ Example: v_int32x4 r; // set to {-1, -1, 1, 1} int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011 @endcode -For all types except 64-bit. */ +*/ template inline int v_signmask(const v_reg<_Tp, n>& a) { int mask = 0; @@ -1109,7 +1109,7 @@ template inline int v_scan_forward(const v_reg<_Tp, n>& a) /** @brief Check if all packed values are less than zero Unsigned values will be casted to signed: `uchar 254 => char -2`. -For all types except 64-bit. */ +*/ template inline bool v_check_all(const v_reg<_Tp, n>& a) { for( int i = 0; i < n; i++ ) @@ -1121,7 +1121,7 @@ template inline bool v_check_all(const v_reg<_Tp, n>& a) /** @brief Check if any of packed values is less than zero Unsigned values will be casted to signed: `uchar 254 => char -2`. -For all types except 64-bit. */ +*/ template inline bool v_check_any(const v_reg<_Tp, n>& a) { for( int i = 0; i < n; i++ ) diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 5617bc2..e5f707c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1139,9 +1139,17 @@ inline bool v_check_any(const v_##_Tpvec& a) \ OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7) OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15) OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63) -#endif + +inline bool v_check_all(const v_uint64x2& a) +{ + uint64x2_t v0 = vshrq_n_u64(a.val, 63); + return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1; +} +inline bool v_check_any(const v_uint64x2& a) +{ + uint64x2_t v0 = vshrq_n_u64(a.val, 63); + return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0; +} inline bool v_check_all(const v_int8x16& a) { return v_check_all(v_reinterpret_as_u8(a)); } @@ -1161,13 +1169,13 @@ inline bool v_check_any(const v_int32x4& a) inline bool v_check_any(const v_float32x4& a) { return v_check_any(v_reinterpret_as_u32(a)); } -#if CV_SIMD128_64F inline bool v_check_all(const v_int64x2& a) { return v_check_all(v_reinterpret_as_u64(a)); } -inline bool v_check_all(const v_float64x2& a) -{ return v_check_all(v_reinterpret_as_u64(a)); } inline bool v_check_any(const v_int64x2& a) { return v_check_any(v_reinterpret_as_u64(a)); } +#if CV_SIMD128_64F +inline bool v_check_all(const v_float64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } inline bool v_check_any(const v_float64x2& a) { return v_check_any(v_reinterpret_as_u64(a)); } #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index e172d45..f661c58 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1591,31 +1591,25 @@ inline v_uint32x4 v_popcount(const v_int32x4& a) inline v_uint64x2 v_popcount(const v_int64x2& a) { return v_popcount(v_reinterpret_as_u64(a)); } -#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ -inline int v_signmask(const _Tpvec& a) \ -{ \ - return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ -} \ -inline bool v_check_all(const _Tpvec& a) \ -{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \ -inline bool v_check_any(const _Tpvec& a) \ -{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; } - -#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a) -inline __m128i v_packq_epi32(__m128i a) -{ - __m128i b = _mm_packs_epi32(a, a); - return _mm_packs_epi16(b, b); -} - -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15) -OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3) +#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \ +inline int v_signmask(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)); } \ +inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \ +inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; } +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3) + +#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \ +inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \ +inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \ +inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; } +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8) +OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8) inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 5e417a0..4f41021 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -899,6 +899,8 @@ inline bool v_check_all(const v_uint16x8& a) { return v_check_all(v_reinterpret_as_s16(a)); } inline bool v_check_all(const v_uint32x4& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_uint64x2& a) +{ return v_check_all(v_reinterpret_as_s64(a)); } inline bool v_check_all(const v_float32x4& a) { return v_check_all(v_reinterpret_as_s32(a)); } inline bool v_check_all(const v_float64x2& a) @@ -913,6 +915,8 @@ inline bool v_check_any(const v_uint16x8& a) { return v_check_any(v_reinterpret_as_s16(a)); } inline bool v_check_any(const v_uint32x4& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_uint64x2& a) +{ return v_check_any(v_reinterpret_as_s64(a)); } inline bool v_check_any(const v_float32x4& a) { return v_check_any(v_reinterpret_as_s32(a)); } inline bool v_check_any(const v_float64x2& a) -- 2.7.4