From 903789f7afc7fd88e28c9bc62cdbbe00e911b972 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima
Date: Fri, 2 Sep 2016 21:57:46 +0900
Subject: [PATCH] use universal intrinsic for FP16

  * use v_float16x4 (universal intrinsic) instead of the raw SSE/NEON implementations
  * define v_load_f16/v_store_f16, since v_load can't be distinguished when a short pointer is passed
  * brush up the implementation for old compilers (guard correctly)
  * add a test for v_load_f16 and for the round-trip conversion of v_float16x4
  * fix a conversion error
---
 .../core/include/opencv2/core/hal/intrin_neon.hpp | 51 +++++++++++++++++++++
 .../core/include/opencv2/core/hal/intrin_sse.hpp  | 39 ++++++++++++++++
 modules/core/src/convert.cpp                      | 42 +++--
 modules/core/test/test_intrin.cpp                 | 53 ++++++++++++++++++++++
 4 files changed, 149 insertions(+), 36 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index e5a42aa..dd5e2e9 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -275,6 +275,39 @@ struct v_float64x2
 };
 #endif
 
+#if defined (HAVE_FP16)
+// Workaround for old compilers
+template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
+{ return (int16x4_t)a; }
+template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
+{ return (float16x4_t)a; }
+template <typename T> static inline float16x4_t vld1_f16(const T* ptr)
+{ return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); }
+template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a)
+{ vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); }
+static inline short vget_lane_f16(float16x4_t a, int b)
+{ return vget_lane_s16(vreinterpret_s16_f16(a), b); }
+
+struct v_float16x4
+{
+    typedef short lane_type;
+    enum { nlanes = 4 };
+
+    v_float16x4() {}
+    explicit v_float16x4(float16x4_t v) : val(v) {}
+    v_float16x4(short v0, short v1, short v2, short v3)
+    {
+        short v[] = {v0, v1, v2, v3};
+        val = vld1_f16(v);
+    }
+    short get0() const
+    {
+        return vget_lane_f16(val, 0);
+    }
+    float16x4_t val;
+};
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
@@ -734,6 +767,14 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 #endif
 
+#if defined (HAVE_FP16)
+// Workaround for old compilers
+inline v_float16x4 v_load_f16(const short* ptr)
+{ return v_float16x4(vld1_f16(ptr)); }
+inline void v_store_f16(short* ptr, v_float16x4& a)
+{ vst1_f16(ptr, a.val); }
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1146,7 +1187,17 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 }
 #endif
 
+#if defined (HAVE_FP16)
+inline v_float32x4 v_cvt_f32(const v_float16x4& a)
+{
+    return v_float32x4(vcvt_f32_f16(a.val));
+}
+inline v_float16x4 v_cvt_f16(const v_float32x4& a)
+{
+    return v_float16x4(vcvt_f16_f32(a.val));
+}
+#endif
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 2efd615..3e2ebae 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -252,6 +252,26 @@ struct v_float64x2
     __m128d val;
 };
 
+#if defined(HAVE_FP16)
+struct v_float16x4
+{
+    typedef short lane_type;
+    enum { nlanes = 4 };
+
+    v_float16x4() {}
+    explicit v_float16x4(__m128i v) : val(v) {}
+    v_float16x4(short v0, short v1, short v2, short v3)
+    {
+        val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
+    }
+    short get0() const
+    {
+        return (short)_mm_cvtsi128_si32(val);
+    }
+    __m128i val;
+};
+#endif
+
 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
@@ -1021,6 +1041,13 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
 
+#if defined(HAVE_FP16)
+inline v_float16x4 v_load_f16(const short* ptr)
+{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
+inline void v_store_f16(short* ptr, v_float16x4& a)
+{ _mm_storel_epi64((__m128i*)ptr, a.val); }
+#endif
+
 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1626,6 +1653,18 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
     return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
 }
 
+#if defined(HAVE_FP16)
+inline v_float32x4 v_cvt_f32(const v_float16x4& a)
+{
+    return v_float32x4(_mm_cvtph_ps(a.val));
+}
+
+inline v_float16x4 v_cvt_f16(const v_float32x4& a)
+{
+    return v_float16x4(_mm_cvtps_ph(a.val, 0));
+}
+#endif
+
 //! @endcond
 
 }
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 0305785..c405919 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -4537,16 +4537,6 @@ static short convertFp16SW(float fp32)
 }
 #endif
 
-#if CV_FP16 && (defined __GNUC__) && (defined __arm__ || defined __aarch64__)
-    #if 5 <= __GNUC__
-        static inline float16x4_t load_f16(const short* p) { return vld1_f16((const float16_t*)p); }
-        static inline void store_f16(short* p, float16x4_t v) { vst1_f16((float16_t*)p, v); }
-    #else
-        static inline float16x4_t load_f16(const short* p) { return (float16x4_t)vld1_s16(p); }
-        static inline void store_f16(short* p, float16x4_t v) { vst1_s16(p, (int16x4_t)v); }
-    #endif
-#endif
-
 // template for FP16 HW conversion function
 template <typename T, typename DT> static void
 cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
@@ -4570,21 +4560,11 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
 #if CV_FP16
     for ( ; x <= size.width - 4; x += 4)
     {
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-        __m128 v_src = _mm_loadu_ps(src + x);
+        v_float32x4 v_src = v_load(src + x);
 
-        __m128i v_dst = _mm_cvtps_ph(v_src, 0);
+        v_float16x4 v_dst = v_cvt_f16(v_src);
 
-        _mm_storel_epi64((__m128i *)(dst + x), v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-        float32x4_t v_src = vld1q_f32(src + x);
-
-        float16x4_t v_dst = vcvt_f16_f32(v_src);
-
-        store_f16(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+        v_store_f16(dst + x, v_dst);
     }
 #endif
 }
@@ -4626,21 +4606,11 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
 #if CV_FP16
     for ( ; x <= size.width - 4; x += 4)
     {
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-        __m128i v_src = _mm_loadl_epi64((__m128i*)(src+x));
-
-        __m128 v_dst = _mm_cvtph_ps(v_src);
+        v_float16x4 v_src = v_load_f16(src + x);
 
-        _mm_storeu_ps(dst + x, v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-        float16x4_t v_src = load_f16(src+x);
+        v_float32x4 v_dst = v_cvt_f32(v_src);
 
-        float32x4_t v_dst = vcvt_f32_f16(v_src);
-
-        vst1q_f32(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+        v_store(dst + x, v_dst);
     }
 #endif
 }
diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp
index e9c8cc4..d9704ef 100644
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -1,3 +1,4 @@
+#include "test_precomp.hpp"
 #include "test_intrin_utils.hpp"
 #include 
 
@@ -710,6 +711,49 @@ template <typename R> struct TheTest
         return *this;
     }
 
+#if CV_FP16
+    TheTest & test_loadstore_fp16()
+    {
+        AlignedData<R> data;
+        AlignedData<R> out;
+
+        // check if addresses are aligned and unaligned respectively
+        EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
+        EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
+        EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
+        EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+
+        // check some initialization methods
+        R r1 = data.u;
+        R r2 = v_load_f16(data.a.d);
+        R r3(r2);
+        EXPECT_EQ(data.u[0], r1.get0());
+        EXPECT_EQ(data.a[0], r2.get0());
+        EXPECT_EQ(data.a[0], r3.get0());
+
+        // check some store methods
+        out.a.clear();
+        v_store_f16(out.a.d, r1);
+        EXPECT_EQ(data.a, out.a);
+
+        return *this;
+    }
+
+    TheTest & test_float_cvt_fp16()
+    {
+        AlignedData<v_float32x4> data;
+
+        // check conversion
+        v_float32x4 r1 = v_load(data.a.d);
+        v_float16x4 r2 = v_cvt_f16(r1);
+        v_float32x4 r3 = v_cvt_f32(r2);
+        EXPECT_EQ(0x3c00, r2.get0());
+        EXPECT_EQ(r3.get0(), r1.get0());
+
+        return *this;
+    }
+#endif
+
 };
@@ -915,6 +959,15 @@ TEST(hal_intrin, float64x2) {
 }
 #endif
 
+#if CV_FP16
+TEST(hal_intrin, float16x4) {
+    TheTest<v_float16x4>()
+        .test_loadstore_fp16()
+        .test_float_cvt_fp16()
+        ;
+}
+#endif
+
 };
 };
--
2.7.4
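
Usage sketch (editor's addition, not part of the patch): the two helpers below mirror the rewritten cvtScaleHalf_ loops in convert.cpp and show how the new FP16 universal intrinsics compose into a round trip. Only v_float32x4, v_float16x4, v_load, v_store, v_load_f16, v_store_f16, v_cvt_f16 and v_cvt_f32 come from the patch; the function names, the plain-pointer interface, and the assumption of a build where the FP16 path is enabled (HAVE_FP16 / CV_FP16) and opencv2/core/hal/intrin.hpp pulls in the SSE or NEON header are illustrative.

#include <opencv2/core/hal/intrin.hpp>

#if CV_FP16
// Convert `width` floats to FP16 values stored in shorts, 4 lanes per iteration.
static void cvt_fp32_to_fp16(const float* src, short* dst, int width)
{
    int x = 0;
    for ( ; x <= width - 4; x += 4)
    {
        cv::v_float32x4 v_src = cv::v_load(src + x);   // 4 x FP32
        cv::v_float16x4 v_dst = cv::v_cvt_f16(v_src);  // FP32 -> FP16
        cv::v_store_f16(dst + x, v_dst);               // 4 halves stored as shorts
    }
    // remaining (width % 4) elements would need a scalar tail
    // (convert.cpp falls back to convertFp16SW for the software path)
}

// Convert FP16 values stored in shorts back to floats, 4 lanes per iteration.
static void cvt_fp16_to_fp32(const short* src, float* dst, int width)
{
    int x = 0;
    for ( ; x <= width - 4; x += 4)
    {
        cv::v_float16x4 v_src = cv::v_load_f16(src + x);  // 4 x FP16
        cv::v_float32x4 v_dst = cv::v_cvt_f32(v_src);     // FP16 -> FP32
        cv::v_store(dst + x, v_dst);                      // 4 x FP32
    }
}
#endif

The dedicated v_load_f16/v_store_f16 names are what keep the short-backed FP16 vector distinct from an ordinary v_int16x8 load, which is the ambiguity the second commit-message bullet refers to.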