From: Francesco Petrogalli <25690309+fpetrogalli@users.noreply.github.com>
Date: Thu, 11 Feb 2021 13:24:09 +0000 (+0000)
Subject: Merge pull request #19486 from fpetrogalli:dotprod_fast-3.4
X-Git-Tag: submit/tizen/20220120.021815~1^2~1^2~167
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6ee23c9b85e33e0c9af417467e39a8165ef9c89e;p=platform%2Fupstream%2Fopencv.git

Merge pull request #19486 from fpetrogalli:dotprod_fast-3.4

* [hal][neon] Optimize the v_dotprod_fast intrinsics for aarch64.

On Armv8 in AArch64 execution mode, we can skip the sequence

    v<op>_<type>(vget_high_<type>(x), vget_high_<type>(y))

in favour of

    v<op>_high_<type>(x, y)

This gives recent compilers a better chance of using fewer data
movement operations and better register allocation. See for example:
https://godbolt.org/z/bPq7vd (a standalone sketch of the same rewrite
follows the diff below).

* [hal][neon] Fix build failure on armv7.

* [hal][neon] Address review comments in PR.

PR: https://github.com/opencv/opencv/pull/19486

* [hal][neon] Define macro to check for the AArch64 execution state of Armv8.

* [hal][neon] Fix macro definition for AArch64.

The fix is needed to prevent warnings when building for Armv7.
---

diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 280691b448..06e70b0c30 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -62,6 +62,22 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD128_64F 0
 #endif
 
+// The following macro checks if the code is being compiled for the
+// AArch64 execution state of Armv8, to enable the 128-bit
+// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
+// the Arm C Language Extension (ACLE) specifications [1] to check the
+// availability of 128-bit intrinsics, and it is supported by clang
+// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
+// Visual Studio [2].
+//
+// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
+// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define CV_NEON_AARCH64 1
+#else
+#define CV_NEON_AARCH64 0
+#endif
+
 // TODO
 #define CV_NEON_DOT 0
 
@@ -726,41 +742,61 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
 // 16 >> 32
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmull_s16(a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmlal_s16(c.val, a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }
 
 // 32 >> 64
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmull_s32(a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmlal_s32(c.val, a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }
 
 // 8 >> 32
@@ -1292,7 +1328,7 @@ inline int64 v_reduce_sum(const v_int64x2& a)
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
-    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
+    return vaddvq_f64(a.val);
 }
 #endif
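
For reference, below is a minimal standalone sketch of the rewrite this
commit applies to the 16 >> 32 dot product. It is not part of the patch:
the macro name SKETCH_AARCH64, the function dotprod_fast() and the main()
driver are illustrative only, chosen to mirror CV_NEON_AARCH64 and
v_dotprod_fast without depending on the OpenCV headers.

    // dotprod_sketch.cpp - compile with optimization on an Arm target,
    // e.g. `g++ -O2 dotprod_sketch.cpp` on AArch64 (add the appropriate
    // -mfpu/-mfloat-abi flags when targeting Armv7).
    #include <arm_neon.h>
    #include <cstdio>

    // Same ACLE/MSVC check as the CV_NEON_AARCH64 macro added by the patch.
    #if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
    #define SKETCH_AARCH64 1
    #else
    #define SKETCH_AARCH64 0
    #endif

    // Multiply eight int16 pairs and accumulate into four int32 lanes.
    static inline int32x4_t dotprod_fast(int16x8_t a, int16x8_t b)
    {
    #if SKETCH_AARCH64
        // AArch64: the *_high_* intrinsics read the upper half of the
        // 128-bit registers directly, so no vget_high_* moves are needed.
        int32x4_t p = vmull_s16(vget_low_s16(a), vget_low_s16(b));
        return vmlal_high_s16(p, a, b);
    #else
        // Armv7/generic NEON: split each vector into halves first.
        int16x4_t a0 = vget_low_s16(a), a1 = vget_high_s16(a);
        int16x4_t b0 = vget_low_s16(b), b1 = vget_high_s16(b);
        int32x4_t p = vmull_s16(a0, b0);
        return vmlal_s16(p, a1, b1);
    #endif
    }

    int main()
    {
        const int16_t xs[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        const int16_t ys[8] = {1, 1, 1, 1, 2, 2, 2, 2};
        int32_t lanes[4];
        vst1q_s32(lanes, dotprod_fast(vld1q_s16(xs), vld1q_s16(ys)));
        // Expected total: (1+2+3+4) + 2*(5+6+7+8) = 62.
        printf("%d\n", lanes[0] + lanes[1] + lanes[2] + lanes[3]);
        return 0;
    }

Both branches compute the same result; the AArch64 branch simply keeps a
and b in full Q registers, which is what removes the extra data-movement
instructions shown in the Godbolt comparison linked above.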