From 971a7c88dc9329873b43a61612100d3231ae4732 Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Wed, 19 Oct 2016 18:22:33 +0900 Subject: [PATCH] let the performance test pass on ARM * use round-to-nearest in div of carotene --- 3rdparty/carotene/src/div.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/3rdparty/carotene/src/div.cpp b/3rdparty/carotene/src/div.cpp index 9c03202..dbd60e7 100644 --- a/3rdparty/carotene/src/div.cpp +++ b/3rdparty/carotene/src/div.cpp @@ -51,6 +51,13 @@ namespace { #ifdef CAROTENE_NEON +inline float32x4_t vroundq(const float32x4_t& v) +{ + const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); + float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v)))); + return vaddq_f32(v, v_addition); +} + template <typename T> inline T divSaturateQ(const T &v1, const T &v2, const float scale) { @@ -62,10 +69,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale) } template <> inline int32x4_t divSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale) -{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); } +{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); } template <> inline uint32x4_t divSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) -{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); } +{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); } template <typename T> inline T divSaturate(const T &v1, const T &v2, const float scale) -- 2.7.4