From 903789f7afc7fd88e28c9bc62cdbbe00e911b972 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima
Date: Fri, 2 Sep 2016 21:57:46 +0900
Subject: [PATCH] use universal intrinsic for FP16

  * use v_float16x4 (universal intrinsic) instead of the raw SSE/NEON implementations
  * define v_load_f16/v_store_f16, since v_load can't be distinguished when a short pointer is passed
  * brush up the implementation for old compilers (guard correctly)
  * add a test for v_load_f16 and for the round-trip conversion of v_float16x4
  * fix a conversion error
---
 .../core/include/opencv2/core/hal/intrin_neon.hpp | 51 +++++++++++++++++++++
 .../core/include/opencv2/core/hal/intrin_sse.hpp  | 39 ++++++++++++++++
 modules/core/src/convert.cpp                      | 42 +++--
 modules/core/test/test_intrin.cpp                 | 53 ++++++++++++++++++++++
 4 files changed, 149 insertions(+), 36 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index e5a42aa..dd5e2e9 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -275,6 +275,39 @@ struct v_float64x2
 };
 #endif
 
+#if defined (HAVE_FP16)
+// Workaround for old compilers
+template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
+{ return (int16x4_t)a; }
+template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
+{ return (float16x4_t)a; }
+template <typename T> static inline float16x4_t vld1_f16(const T* ptr)
+{ return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); }
+template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a)
+{ vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); }
+static inline short vget_lane_f16(float16x4_t a, int b)
+{ return vget_lane_s16(vreinterpret_s16_f16(a), b); }
+
+struct v_float16x4
+{
+    typedef short lane_type;
+    enum { nlanes = 4 };
+
+    v_float16x4() {}
+    explicit v_float16x4(float16x4_t v) : val(v) {}
+    v_float16x4(short v0, short v1, short v2, short v3)
+    {
+        short v[] = {v0, v1, v2, v3};
+        val = vld1_f16(v);
+    }
+    short get0() const
+    {
+        return vget_lane_f16(val, 0);
+    }
+    float16x4_t val;
+};
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
@@ -734,6 +767,14 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 #endif
 
+#if defined (HAVE_FP16)
+// Workaround for old compilers
+inline v_float16x4 v_load_f16(const short* ptr)
+{ return v_float16x4(vld1_f16(ptr)); }
+inline void v_store_f16(short* ptr, v_float16x4& a)
+{ vst1_f16(ptr, a.val); }
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1146,7 +1187,17 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 }
 #endif
 
+#if defined (HAVE_FP16)
+inline v_float32x4 v_cvt_f32(const v_float16x4& a)
+{
+    return v_float32x4(vcvt_f32_f16(a.val));
+}
+inline v_float16x4 v_cvt_f16(const v_float32x4& a)
+{
+    return v_float16x4(vcvt_f16_f32(a.val));
+}
+#endif
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 2efd615..3e2ebae 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -252,6 +252,26 @@ struct v_float64x2
     __m128d val;
 };
 
+#if defined(HAVE_FP16)
+struct v_float16x4
+{
+    typedef short lane_type;
+    enum { nlanes = 4 };
+
+    v_float16x4() {}
+    explicit v_float16x4(__m128i v) : val(v) {}
+    v_float16x4(short v0, short v1, short v2, short v3)
+    {
+        val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
+    }
+    short get0() const
+    {
+        return (short)_mm_cvtsi128_si32(val);
+    }
+    __m128i val;
+};
+#endif
+
 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
@@ -1021,6 +1041,13 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
 
+#if defined(HAVE_FP16)
+inline v_float16x4 v_load_f16(const short* ptr)
+{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
+inline void v_store_f16(short* ptr, v_float16x4& a)
+{ _mm_storel_epi64((__m128i*)ptr, a.val); }
+#endif
+
 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1626,6 +1653,18 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
     return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
 }
 
+#if defined(HAVE_FP16)
+inline v_float32x4 v_cvt_f32(const v_float16x4& a)
+{
+    return v_float32x4(_mm_cvtph_ps(a.val));
+}
+
+inline v_float16x4 v_cvt_f16(const v_float32x4& a)
+{
+    return v_float16x4(_mm_cvtps_ph(a.val, 0));
+}
+#endif
+
 //! @endcond
 
 }
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 0305785..c405919 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -4537,16 +4537,6 @@ static short convertFp16SW(float fp32)
 }
 #endif
 
-#if CV_FP16 && (defined __GNUC__) && (defined __arm__ || defined __aarch64__)
-    #if 5 <= __GNUC__
-        static inline float16x4_t load_f16(const short* p) { return vld1_f16((const float16_t*)p); }
-        static inline void store_f16(short* p, float16x4_t v) { vst1_f16((float16_t*)p, v); }
-    #else
-        static inline float16x4_t load_f16(const short* p) { return (float16x4_t)vld1_s16(p); }
-        static inline void store_f16(short* p, float16x4_t v) { vst1_s16(p, (int16x4_t)v); }
-    #endif
-#endif
-
 // template for FP16 HW conversion function
 template <typename T, typename DT> static void
 cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
@@ -4570,21 +4560,11 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
 #if CV_FP16
     for ( ; x <= size.width - 4; x += 4)
     {
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-        __m128 v_src = _mm_loadu_ps(src + x);
+        v_float32x4 v_src = v_load(src + x);
 
-        __m128i v_dst = _mm_cvtps_ph(v_src, 0);
+        v_float16x4 v_dst = v_cvt_f16(v_src);
 
-        _mm_storel_epi64((__m128i *)(dst + x), v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-        float32x4_t v_src = vld1q_f32(src + x);
-
-        float16x4_t v_dst = vcvt_f16_f32(v_src);
-
-        store_f16(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+        v_store_f16(dst + x, v_dst);
     }
 #endif
 }
@@ -4626,21 +4606,11 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
 #if CV_FP16
     for ( ; x <= size.width - 4; x += 4)
     {
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-        __m128i v_src = _mm_loadl_epi64((__m128i*)(src+x));
-
-        __m128 v_dst = _mm_cvtph_ps(v_src);
+        v_float16x4 v_src = v_load_f16(src + x);
 
-        _mm_storeu_ps(dst + x, v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-        float16x4_t v_src = load_f16(src+x);
+        v_float32x4 v_dst = v_cvt_f32(v_src);
 
-        float32x4_t v_dst = vcvt_f32_f16(v_src);
-
-        vst1q_f32(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+        v_store(dst + x, v_dst);
     }
 #endif
 }
diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp
index e9c8cc4..d9704ef 100644
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -1,3 +1,4 @@
+#include "test_precomp.hpp"
 #include "test_intrin_utils.hpp"
 #include 
 
@@ -710,6 +711,49 @@ template <typename R> struct TheTest
         return *this;
     }
 
+#if CV_FP16
+    TheTest & test_loadstore_fp16()
+    {
+        AlignedData<R> data;
+        AlignedData<R> out;
+
+        // check if addresses are aligned and unaligned respectively
+        EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
+        EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
+        EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
+        EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+
+        // check some initialization methods
+        R r1 = data.u;
+        R r2 = v_load_f16(data.a.d);
+        R r3(r2);
+        EXPECT_EQ(data.u[0], r1.get0());
+        EXPECT_EQ(data.a[0], r2.get0());
+        EXPECT_EQ(data.a[0], r3.get0());
+
+        // check some store methods
+        out.a.clear();
+        v_store_f16(out.a.d, r1);
+        EXPECT_EQ(data.a, out.a);
+
+        return *this;
+    }
+
+    TheTest & test_float_cvt_fp16()
+    {
+        AlignedData<v_float32x4> data;
+
+        // check conversion
+        v_float32x4 r1 = v_load(data.a.d);
+        v_float16x4 r2 = v_cvt_f16(r1);
+        v_float32x4 r3 = v_cvt_f32(r2);
+        EXPECT_EQ(0x3c00, r2.get0());
+        EXPECT_EQ(r3.get0(), r1.get0());
+
+        return *this;
+    }
+#endif
+
 };
@@ -915,6 +959,15 @@ TEST(hal_intrin, float64x2) {
 }
 #endif
 
+#if CV_FP16
+TEST(hal_intrin, float16x4) {
+    TheTest<v_float16x4>()
+        .test_loadstore_fp16()
+        .test_float_cvt_fp16()
+        ;
+}
+#endif
+
 };
 };
--
2.7.4
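
Usage sketch (editor's addition, not part of the patch): the two helpers below mirror the rewritten cvtScaleHalf_ loops in convert.cpp and show how the new FP16 universal intrinsics compose into a round trip. Only v_float32x4, v_float16x4, v_load, v_store, v_load_f16, v_store_f16, v_cvt_f16 and v_cvt_f32 come from the patch; the function names, the plain-pointer interface, and the assumption of a build where the FP16 path is enabled (HAVE_FP16 / CV_FP16) and opencv2/core/hal/intrin.hpp pulls in the SSE or NEON header are illustrative.

#include <opencv2/core/hal/intrin.hpp>

#if CV_FP16
// Convert `width` floats to FP16 values stored in shorts, 4 lanes per iteration.
static void cvt_fp32_to_fp16(const float* src, short* dst, int width)
{
    int x = 0;
    for ( ; x <= width - 4; x += 4)
    {
        cv::v_float32x4 v_src = cv::v_load(src + x);   // 4 x FP32
        cv::v_float16x4 v_dst = cv::v_cvt_f16(v_src);  // FP32 -> FP16
        cv::v_store_f16(dst + x, v_dst);               // 4 halves stored as shorts
    }
    // remaining (width % 4) elements would need a scalar tail
    // (convert.cpp falls back to convertFp16SW for the software path)
}

// Convert FP16 values stored in shorts back to floats, 4 lanes per iteration.
static void cvt_fp16_to_fp32(const short* src, float* dst, int width)
{
    int x = 0;
    for ( ; x <= width - 4; x += 4)
    {
        cv::v_float16x4 v_src = cv::v_load_f16(src + x);  // 4 x FP16
        cv::v_float32x4 v_dst = cv::v_cvt_f32(v_src);     // FP16 -> FP32
        cv::v_store(dst + x, v_dst);                      // 4 x FP32
    }
}
#endif

The dedicated v_load_f16/v_store_f16 names are what keep the short-backed FP16 vector distinct from an ordinary v_int16x8 load, which is the ambiguity the second commit-message bullet refers to.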