* use v_float16x4 (universal intrinsic) instead of the raw SSE/NEON implementations
* define v_load_f16/v_store_f16, since a v_load overload can't be distinguished from the int16 one when a short pointer is passed
* brush up the implementation for old compilers (guard correctly)
* add tests for v_load_f16 and the round-trip conversion of v_float16x4
* fix a conversion error
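
For illustration, a minimal round trip through the new API (a sketch only: the CV_FP16 guard and the function names follow the diff below, while the buffer contents are assumed):

    #if CV_FP16
    short src[4] = { 0x3c00, 0x4000, 0x4200, 0x4400 }; // 1.0f..4.0f encoded as fp16
    short dst[4];
    v_float16x4 h = v_load_f16(src); // dedicated name; v_load(short*) resolves to the int16 vector load
    v_float32x4 f = v_cvt_f32(h);    // widen fp16 -> fp32
    v_float16x4 r = v_cvt_f16(f);    // narrow fp32 -> fp16
    v_store_f16(dst, r);             // dst now holds the same bits as src
    #endif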
};
#endif
+#if defined(HAVE_FP16)
+// Workaround for old compilers that lack the float16x4_t load/store/lane
+// intrinsics: reinterpret through int16x4_t instead
+template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
+{ return (int16x4_t)a; }
+template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
+{ return (float16x4_t)a; }
+template <typename T> static inline float16x4_t vld1_f16(const T* ptr)
+{ return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); }
+template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a)
+{ vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); }
+static inline short vget_lane_f16(float16x4_t a, int b)
+{ return vget_lane_s16(vreinterpret_s16_f16(a), b); }
+
+struct v_float16x4
+{
+ typedef short lane_type;
+ enum { nlanes = 4 };
+
+ v_float16x4() {}
+ explicit v_float16x4(float16x4_t v) : val(v) {}
+ v_float16x4(short v0, short v1, short v2, short v3)
+ {
+ short v[] = {v0, v1, v2, v3};
+ val = vld1_f16(v);
+ }
+ short get0() const
+ {
+ return vget_lane_f16(val, 0);
+ }
+ float16x4_t val;
+};
+#endif
+
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
+#if defined(HAVE_FP16)
+// Dedicated names: a v_load overload taking short* could not be
+// distinguished from the int16 vector load
+inline v_float16x4 v_load_f16(const short* ptr)
+{ return v_float16x4(vld1_f16(ptr)); }
+inline void v_store_f16(short* ptr, const v_float16x4& a)
+{ vst1_f16(ptr, a.val); }
+#endif
+
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
}
#endif
+#if defined(HAVE_FP16)
+inline v_float32x4 v_cvt_f32(const v_float16x4& a)
+{
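+ // vcvt_f32_f16 needs the NEON half-precision extension (e.g. -mfpu=neon-fp16, or ARMv8)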
+ return v_float32x4(vcvt_f32_f16(a.val));
+}
+inline v_float16x4 v_cvt_f16(const v_float32x4& a)
+{
+ return v_float16x4(vcvt_f16_f32(a.val));
+}
+#endif
//! @endcond
__m128d val;
};
+#if defined(HAVE_FP16)
+struct v_float16x4
+{
+ typedef short lane_type;
+ enum { nlanes = 4 };
+
+ v_float16x4() {}
+ explicit v_float16x4(__m128i v) : val(v) {}
+ v_float16x4(short v0, short v1, short v2, short v3)
+ {
+ val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
+ }
+ short get0() const
+ {
+ return (short)_mm_cvtsi128_si32(val);
+ }
+ __m128i val;
+};
+#endif
+
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
+#if defined(HAVE_FP16)
+inline v_float16x4 v_load_f16(const short* ptr)
+{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
+inline void v_store_f16(short* ptr, const v_float16x4& a)
+{ _mm_storel_epi64((__m128i*)ptr, a.val); }
+#endif
+
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
}
+#if defined(HAVE_FP16)
+inline v_float32x4 v_cvt_f32(const v_float16x4& a)
+{
+ return v_float32x4(_mm_cvtph_ps(a.val));
+}
+
+inline v_float16x4 v_cvt_f16(const v_float32x4& a)
+{
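+ // rounding mode 0 = round to nearest even; _mm_cvtps_ph/_mm_cvtph_ps require F16C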
+ return v_float16x4(_mm_cvtps_ph(a.val, 0));
+}
+#endif
+
//! @endcond
}
}
#endif
-#if CV_FP16 && (defined __GNUC__) && (defined __arm__ || defined __aarch64__)
- #if 5 <= __GNUC__
- static inline float16x4_t load_f16(const short* p) { return vld1_f16((const float16_t*)p); }
- static inline void store_f16(short* p, float16x4_t v) { vst1_f16((float16_t*)p, v); }
- #else
- static inline float16x4_t load_f16(const short* p) { return (float16x4_t)vld1_s16(p); }
- static inline void store_f16(short* p, float16x4_t v) { vst1_s16(p, (int16x4_t)v); }
- #endif
-#endif
-
// template for FP16 HW conversion function
template<typename T, typename DT> static void
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
#if CV_FP16
for ( ; x <= size.width - 4; x += 4)
{
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
- __m128 v_src = _mm_loadu_ps(src + x);
+ v_float32x4 v_src = v_load(src + x);
- __m128i v_dst = _mm_cvtps_ph(v_src, 0);
+ v_float16x4 v_dst = v_cvt_f16(v_src);
- _mm_storel_epi64((__m128i *)(dst + x), v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
- float32x4_t v_src = vld1q_f32(src + x);
-
- float16x4_t v_dst = vcvt_f16_f32(v_src);
-
- store_f16(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+ v_store_f16(dst + x, v_dst);
}
#endif
}
#if CV_FP16
for ( ; x <= size.width - 4; x += 4)
{
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
- __m128i v_src = _mm_loadl_epi64((__m128i*)(src+x));
-
- __m128 v_dst = _mm_cvtph_ps(v_src);
+ v_float16x4 v_src = v_load_f16(src + x);
- _mm_storeu_ps(dst + x, v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
- float16x4_t v_src = load_f16(src+x);
+ v_float32x4 v_dst = v_cvt_f32(v_src);
- float32x4_t v_dst = vcvt_f32_f16(v_src);
-
- vst1q_f32(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+ v_store(dst + x, v_dst);
}
#endif
}
+#include "test_precomp.hpp"
#include "test_intrin_utils.hpp"
#include <climits>
return *this;
}
+#if CV_FP16
+ TheTest & test_loadstore_fp16()
+ {
+ AlignedData<R> data;
+ AlignedData<R> out;
+
+ // check if addresses are aligned and unaligned respectively
+ EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
+ EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
+ EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
+ EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
+
+ // check some initialization methods
+ R r1 = data.u;
+ R r2 = v_load_f16(data.a.d);
+ R r3(r2);
+ EXPECT_EQ(data.u[0], r1.get0());
+ EXPECT_EQ(data.a[0], r2.get0());
+ EXPECT_EQ(data.a[0], r3.get0());
+
+ // check some store methods
+ out.a.clear();
+ v_store_f16(out.a.d, r1);
+ EXPECT_EQ(data.a, out.a);
+
+ return *this;
+ }
+
+ TheTest & test_float_cvt_fp16()
+ {
+ AlignedData<v_float32x4> data;
+
+ // check conversion
+ v_float32x4 r1 = v_load(data.a.d);
+ v_float16x4 r2 = v_cvt_f16(r1);
+ v_float32x4 r3 = v_cvt_f32(r2);
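+ // 0x3c00 is the fp16 bit pattern of 1.0f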
+ EXPECT_EQ(0x3c00, r2.get0());
+ EXPECT_EQ(r3.get0(), r1.get0());
+
+ return *this;
+ }
+#endif
+
};
}
#endif
+#if CV_FP16
+TEST(hal_intrin, float16x4) {
+ TheTest<v_float16x4>()
+ .test_loadstore_fp16()
+ .test_float_cvt_fp16()
+ ;
+}
+#endif
+
};
};
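
The new case can be run in isolation with GTest's filter (test binary name assumed):

    ./opencv_test_hal --gtest_filter=hal_intrin.float16x4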