Merge pull request #19486 from fpetrogalli:dotprod_fast-3.4

author Francesco Petrogalli <25690309+fpetrogalli@users.noreply.github.com>

Thu, 11 Feb 2021 13:24:09 +0000 (13:24 +0000)

committer GitHub <noreply@github.com>

Thu, 11 Feb 2021 13:24:09 +0000 (13:24 +0000)
author Francesco Petrogalli <25690309+fpetrogalli@users.noreply.github.com>
Thu, 11 Feb 2021 13:24:09 +0000 (13:24 +0000)
committer GitHub <noreply@github.com>
Thu, 11 Feb 2021 13:24:09 +0000 (13:24 +0000)
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp

index 280691b44823bc6ee0564cc959b61af2c34a508c..06e70b0c303c54aba3e015e7c97d4c5919522882 100644 (file)
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -62,6 +62,22 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
  #define CV_SIMD128_64F 0
  #endif
  
+// The following macro checks if the code is being compiled for the
+// AArch64 execution state of Armv8, to enable the 128-bit
+// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
+// the Arm C Language Extension (ACLE) specifications [1] to check the
+// availability of 128-bit intrinsics, and it is supported by clang
+// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
+// Visual Studio [2].
+//
+// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
+// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define CV_NEON_AARCH64 1
+#else
+#define CV_NEON_AARCH64 0
+#endif
+
  // TODO
  #define CV_NEON_DOT 0
  
@@ -726,41 +742,61 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
  // 16 >> 32
  inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
  {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
      int16x4_t a0 = vget_low_s16(a.val);
      int16x4_t a1 = vget_high_s16(a.val);
      int16x4_t b0 = vget_low_s16(b.val);
      int16x4_t b1 = vget_high_s16(b.val);
      int32x4_t p = vmull_s16(a0, b0);
      return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
  }
  inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
  {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
      int16x4_t a0 = vget_low_s16(a.val);
      int16x4_t a1 = vget_high_s16(a.val);
      int16x4_t b0 = vget_low_s16(b.val);
      int16x4_t b1 = vget_high_s16(b.val);
      int32x4_t p = vmlal_s16(c.val, a0, b0);
      return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
  }
  
  // 32 >> 64
  inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
  {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
      int32x2_t a0 = vget_low_s32(a.val);
      int32x2_t a1 = vget_high_s32(a.val);
      int32x2_t b0 = vget_low_s32(b.val);
      int32x2_t b1 = vget_high_s32(b.val);
      int64x2_t p = vmull_s32(a0, b0);
      return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
  }
  inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
  {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
      int32x2_t a0 = vget_low_s32(a.val);
      int32x2_t a1 = vget_high_s32(a.val);
      int32x2_t b0 = vget_low_s32(b.val);
      int32x2_t b1 = vget_high_s32(b.val);
      int64x2_t p = vmlal_s32(c.val, a0, b0);
      return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
  }
  
  // 8 >> 32
@@ -1292,7 +1328,7 @@ inline int64 v_reduce_sum(const v_int64x2& a)
  #if CV_SIMD128_64F
  inline double v_reduce_sum(const v_float64x2& a)
  {
-    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
+    return vaddvq_f64(a.val);
  }
  #endif
author	Francesco Petrogalli <25690309+fpetrogalli@users.noreply.github.com>
	Thu, 11 Feb 2021 13:24:09 +0000 (13:24 +0000)
committer	GitHub <noreply@github.com>
	Thu, 11 Feb 2021 13:24:09 +0000 (13:24 +0000)