From 6c135261b26e677b9b5c249436a79967d28ae334 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 23 Nov 2017 17:58:34 +0300 Subject: [PATCH] Universal Intrinsics: aligned v_pack behavior on different platforms, fixed 64-bit register on ARM, added more saturate_cast variants --- .../core/include/opencv2/core/hal/intrin_cpp.hpp | 94 ++++++++++++---------- .../core/include/opencv2/core/hal/intrin_neon.hpp | 32 ++++---- .../core/include/opencv2/core/hal/intrin_vsx.hpp | 9 ++- modules/core/include/opencv2/core/saturate.hpp | 19 ++++- modules/core/test/test_intrin_utils.hpp | 49 +++++++---- 5 files changed, 122 insertions(+), 81 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 945fddd..e7ea899 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1762,14 +1762,14 @@ OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64) //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \ +#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpnvec c; \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ { \ - c.s[i] = saturate_cast<_Tpn>(a.s[i]); \ - c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \ + c.s[i] = cast<_Tpn>(a.s[i]); \ + c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \ } \ return c; \ } @@ -1783,26 +1783,28 @@ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ //! //! - pack: for 16-, 32- and 64-bit integer input types //! - pack_u: for 16- and 32-bit signed integer input types -OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack) -OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack) -OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack) -OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack) -OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u) -OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u) +//! +//! @note All variants except 64-bit use saturation. +OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ +#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpnvec c; \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ { \ - c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ - c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ } \ return c; \ } @@ -1816,51 +1818,55 @@ template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpve //! //! - pack: for 16-, 32- and 64-bit integer input types //! - pack_u: for 16- and 32-bit signed integer input types -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u) -OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u) +//! +//! @note All variants except 64-bit use saturation. +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ +#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ { \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - ptr[i] = saturate_cast<_Tpn>(a.s[i]); \ + ptr[i] = cast<_Tpn>(a.s[i]); \ } //! @name Pack and store //! @{ //! @brief Store values from the input vector into memory with pack //! -//! Values will be stored into memory with saturating conversion to narrower type. +//! Values will be stored into memory with conversion to narrower type. //! Variant with _u_ suffix converts to corresponding unsigned type. //! //! - pack: for 16-, 32- and 64-bit integer input types //! - pack_u: for 16- and 32-bit signed integer input types -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack) -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack) -OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u) -OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u) +//! +//! @note All variants except 64-bit use saturation. +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) //! @} //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ +#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ { \ for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ } //! @name Pack and store with rounding shift @@ -1872,14 +1878,16 @@ template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec //! //! - pack: for 16-, 32- and 64-bit integer input types //! - pack_u: for 16- and 32-bit signed integer input types -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u) -OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u) +//! +//! @note All variants except 64-bit use saturation. +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) //! @} /** @brief Matrix multiplication diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index b824ca0..c3c49c9 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -226,7 +226,7 @@ struct v_uint64x2 v_uint64x2() {} explicit v_uint64x2(uint64x2_t v) : val(v) {} - v_uint64x2(unsigned v0, unsigned v1) + v_uint64x2(uint64 v0, uint64 v1) { uint64 v[] = {v0, v1}; val = vld1q_u64(v); @@ -245,7 +245,7 @@ struct v_int64x2 v_int64x2() {} explicit v_int64x2(int64x2_t v) : val(v) {} - v_int64x2(int v0, int v1) + v_int64x2(int64 v0, int64 v1) { int64 v[] = {v0, v1}; val = vld1q_s64(v); @@ -360,40 +360,40 @@ OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32) OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64) #endif -#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \ +#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \ inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \ { \ - hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \ + hreg a1 = mov(a.val), b1 = mov(b.val); \ return _Tpvec(vcombine_##suffix(a1, b1)); \ } \ inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ { \ - hreg a1 = vqmov##op##_##wsuffix(a.val); \ + hreg a1 = mov(a.val); \ vst1_##suffix(ptr, a1); \ } \ template inline \ _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \ { \ - hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \ - hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \ + hreg a1 = rshr(a.val, n); \ + hreg b1 = rshr(b.val, n); \ return _Tpvec(vcombine_##suffix(a1, b1)); \ } \ template inline \ void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ { \ - hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \ + hreg a1 = rshr(a.val, n); \ vst1_##suffix(ptr, a1); \ } -OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n) -OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n) -OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n) -OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n) -OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n) -OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n) +OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16) +OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16) +OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32) +OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32) +OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64) +OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64) -OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un) -OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un) +OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16) +OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32) inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 446a1ca..79ed8cc 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -394,16 +394,17 @@ OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int, vec_sra, vec_packs, vec_add, pack) OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long, - vec_sr, vec_packs, vec_add, pack) + vec_sr, vec_pack, vec_add, pack) OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long, - vec_sra, vec_packs, vec_add, pack) + vec_sra, vec_pack, vec_add, pack) OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short, vec_sra, vec_packsu, vec_adds, pack_u) OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int, vec_sra, vec_packsu, vec_add, pack_u) -OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long, - vec_sra, vec_packsu, vec_add, pack_u) +// Following variant is not implemented on other platforms: +//OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long, +// vec_sra, vec_packsu, vec_add, pack_u) /* Recombine */ template diff --git a/modules/core/include/opencv2/core/saturate.hpp b/modules/core/include/opencv2/core/saturate.hpp index d356025..118599f 100644 --- a/modules/core/include/opencv2/core/saturate.hpp +++ b/modules/core/include/opencv2/core/saturate.hpp @@ -136,12 +136,27 @@ template<> inline short saturate_cast(double v) { int iv = cvRound( template<> inline short saturate_cast(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } template<> inline short saturate_cast(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); } +template<> inline int saturate_cast(unsigned v) { return (int)std::min(v, (unsigned)INT_MAX); } +template<> inline int saturate_cast(int64 v) { return (int)((uint64)(v - INT_MIN) <= (uint64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); } +template<> inline int saturate_cast(uint64 v) { return (int)std::min(v, (uint64)INT_MAX); } template<> inline int saturate_cast(float v) { return cvRound(v); } template<> inline int saturate_cast(double v) { return cvRound(v); } +template<> inline unsigned saturate_cast(schar v) { return (unsigned)std::max(v, (schar)0); } +template<> inline unsigned saturate_cast(short v) { return (unsigned)std::max(v, (short)0); } +template<> inline unsigned saturate_cast(int v) { return (unsigned)std::max(v, (int)0); } +template<> inline unsigned saturate_cast(int64 v) { return (unsigned)((uint64)v <= (uint64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); } +template<> inline unsigned saturate_cast(uint64 v) { return (unsigned)std::min(v, (uint64)UINT_MAX); } // we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc. -template<> inline unsigned saturate_cast(float v) { return cvRound(v); } -template<> inline unsigned saturate_cast(double v) { return cvRound(v); } +template<> inline unsigned saturate_cast(float v) { return static_cast(cvRound(v)); } +template<> inline unsigned saturate_cast(double v) { return static_cast(cvRound(v)); } + +template<> inline uint64 saturate_cast(schar v) { return (uint64)std::max(v, (schar)0); } +template<> inline uint64 saturate_cast(short v) { return (uint64)std::max(v, (short)0); } +template<> inline uint64 saturate_cast(int v) { return (uint64)std::max(v, (int)0); } +template<> inline uint64 saturate_cast(int64 v) { return (uint64)std::max(v, (int64)0); } + +template<> inline int64 saturate_cast(uint64 v) { return (int64)std::min(v, (uint64)LLONG_MAX); } //! @} diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 678bbd4..0294ef4 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -167,6 +167,14 @@ template<> inline void EXPECT_COMPARE_EQ_(const double a, const double b EXPECT_DOUBLE_EQ( a, b ); } +// pack functions do not do saturation when converting from 64-bit types +template +inline T pack_saturate_cast(W a) { return saturate_cast(a); } +template<> +inline int pack_saturate_cast(int64 a) { return static_cast(a); } +template<> +inline unsigned pack_saturate_cast(uint64 a) { return static_cast(a); } + template struct TheTest { typedef typename R::lane_type LaneType; @@ -464,16 +472,19 @@ template struct TheTest template TheTest & test_shift() { + SCOPED_TRACE(s); Data dataA; + dataA[0] = static_cast(std::numeric_limits::max()); R a = dataA; Data resB = a << s, resC = v_shl(a), resD = a >> s, resE = v_shr(a); + for (int i = 0; i < R::nlanes; ++i) { - EXPECT_EQ(dataA[i] << s, resB[i]); - EXPECT_EQ(dataA[i] << s, resC[i]); - EXPECT_EQ(dataA[i] >> s, resD[i]); - EXPECT_EQ(dataA[i] >> s, resE[i]); + EXPECT_EQ(static_cast(dataA[i] << s), resB[i]); + EXPECT_EQ(static_cast(dataA[i] << s), resC[i]); + EXPECT_EQ(static_cast(dataA[i] >> s), resD[i]); + EXPECT_EQ(static_cast(dataA[i] >> s), resE[i]); } return *this; } @@ -668,11 +679,13 @@ template struct TheTest template TheTest & test_pack() { + SCOPED_TRACE(s); typedef typename V_RegTrait128::w_reg Rx2; typedef typename Rx2::lane_type w_type; Data dataA, dataB; dataA += std::numeric_limits::is_signed ? -10 : 10; dataB *= 10; + dataB[0] = static_cast(std::numeric_limits::max()) + 17; // to check saturation Rx2 a = dataA, b = dataB; Data resC = v_pack(a, b); @@ -688,13 +701,13 @@ template struct TheTest const w_type add = (w_type)1 << (s - 1); for (int i = 0; i < n; ++i) { - EXPECT_EQ(saturate_cast(dataA[i]), resC[i]); - EXPECT_EQ(saturate_cast(dataB[i]), resC[i + n]); - EXPECT_EQ(saturate_cast((dataA[i] + add) >> s), resD[i]); - EXPECT_EQ(saturate_cast((dataB[i] + add) >> s), resD[i + n]); - EXPECT_EQ(saturate_cast(dataB[i]), resE[i]); + EXPECT_EQ(pack_saturate_cast(dataA[i]), resC[i]); + EXPECT_EQ(pack_saturate_cast(dataB[i]), resC[i + n]); + EXPECT_EQ(pack_saturate_cast((dataA[i] + add) >> s), resD[i]); + EXPECT_EQ(pack_saturate_cast((dataB[i] + add) >> s), resD[i + n]); + EXPECT_EQ(pack_saturate_cast(dataB[i]), resE[i]); EXPECT_EQ((LaneType)0, resE[i + n]); - EXPECT_EQ(saturate_cast((dataB[i] + add) >> s), resF[i]); + EXPECT_EQ(pack_saturate_cast((dataB[i] + add) >> s), resF[i]); EXPECT_EQ((LaneType)0, resF[i + n]); } return *this; @@ -703,6 +716,7 @@ template struct TheTest template TheTest & test_pack_u() { + SCOPED_TRACE(s); typedef typename V_TypeTraits::w_type LaneType_w; typedef typename V_RegTrait128::int_reg Ri2; typedef typename Ri2::lane_type w_type; @@ -710,6 +724,7 @@ template struct TheTest Data dataA, dataB; dataA += -10; dataB *= 10; + dataB[0] = static_cast(std::numeric_limits::max()) + 17; // to check saturation Ri2 a = dataA, b = dataB; Data resC = v_pack_u(a, b); @@ -725,13 +740,13 @@ template struct TheTest const w_type add = (w_type)1 << (s - 1); for (int i = 0; i < n; ++i) { - EXPECT_EQ(saturate_cast(dataA[i]), resC[i]); - EXPECT_EQ(saturate_cast(dataB[i]), resC[i + n]); - EXPECT_EQ(saturate_cast((dataA[i] + add) >> s), resD[i]); - EXPECT_EQ(saturate_cast((dataB[i] + add) >> s), resD[i + n]); - EXPECT_EQ(saturate_cast(dataB[i]), resE[i]); + EXPECT_EQ(pack_saturate_cast(dataA[i]), resC[i]); + EXPECT_EQ(pack_saturate_cast(dataB[i]), resC[i + n]); + EXPECT_EQ(pack_saturate_cast((dataA[i] + add) >> s), resD[i]); + EXPECT_EQ(pack_saturate_cast((dataB[i] + add) >> s), resD[i + n]); + EXPECT_EQ(pack_saturate_cast(dataB[i]), resE[i]); EXPECT_EQ((LaneType)0, resE[i + n]); - EXPECT_EQ(saturate_cast((dataB[i] + add) >> s), resF[i]); + EXPECT_EQ(pack_saturate_cast((dataB[i] + add) >> s), resF[i]); EXPECT_EQ((LaneType)0, resF[i + n]); } return *this; @@ -776,6 +791,7 @@ template struct TheTest template TheTest & test_extract() { + SCOPED_TRACE(s); Data dataA, dataB; dataB *= 10; R a = dataA, b = dataB; @@ -796,6 +812,7 @@ template struct TheTest template TheTest & test_rotate() { + SCOPED_TRACE(s); Data dataA, dataB; dataB *= 10; R a = dataA, b = dataB; -- 2.7.4