Merge pull request #20011 from Developer-Ecosystem-Engineering:3.4
author Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>
Tue, 1 Jun 2021 06:39:55 +0000 (23:39 -0700)
committer GitHub <noreply@github.com>
Tue, 1 Jun 2021 06:39:55 +0000 (09:39 +0300)
Improve performance on Arm64

* Improve performance on Apple silicon

This patch will
- Enable dot product intrinsics for macOS arm64 builds
- Improve HAL primitives (usage sketch after this list)
  - reduction (sum, min, max, sad)
  - signmask
  - mul_expand
  - check_any / check_all
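
For reference, a minimal caller of the primitives listed above (illustrative only, not part of this patch): v_load, v_reduce_sad and v_check_all are the existing universal-intrinsics API from opencv2/core/hal/intrin.hpp; the helper name sad16 is made up for this sketch.

```cpp
#include <opencv2/core/hal/intrin.hpp>

// Sum of absolute differences over 16 bytes, plus an "all lanes equal" flag.
static unsigned sad16(const unsigned char* a, const unsigned char* b, bool* all_equal)
{
#if CV_SIMD128
    cv::v_uint8x16 va = cv::v_load(a), vb = cv::v_load(b);
    *all_equal = cv::v_check_all(va == vb);  // AArch64 path below uses vminvq_u8
    return cv::v_reduce_sad(va, vb);         // AArch64 path below uses vabdq_u8 + vaddlvq_u8
#else
    unsigned s = 0;
    *all_equal = true;
    for (int i = 0; i < 16; i++)
    {
        s += (unsigned)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
        *all_equal = *all_equal && (a[i] == b[i]);
    }
    return s;
#endif
}
```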

Results on an M1 MacBook Pro

* Updates to #20011 based on feedback

  - Removes Apple Silicon-specific workarounds
  - Makes #ifdef sections smaller for the v_mul_expand cases
  - Moves the dot product optimization behind the compiler optimization check
  - Adds a 4x4 matrix transpose optimization (usage sketch below)
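
Illustrative only, not part of this patch: v_transpose4x4() treats four v_float32x4 registers as the rows of a 4x4 block and returns its columns; the new AArch64 path builds it from vtrn1q/vtrn2q on 64-bit then 32-bit lanes. The function name transpose4x4_block is made up for this sketch.

```cpp
#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Transpose a row-major 4x4 float block using the universal intrinsics.
static void transpose4x4_block(const float src[16], float dst[16])
{
    using namespace cv;
    v_float32x4 a0 = v_load(src + 0), a1 = v_load(src + 4);
    v_float32x4 a2 = v_load(src + 8), a3 = v_load(src + 12);
    v_float32x4 b0, b1, b2, b3;
    v_transpose4x4(a0, a1, a2, a3, b0, b1, b2, b3);  // b_i holds column i of the block
    v_store(dst + 0,  b0);  v_store(dst + 4,  b1);
    v_store(dst + 8,  b2);  v_store(dst + 12, b3);
}
#endif
```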

* Remove dotprod and fix v_transpose

Based on the latest feedback, we've removed dotprod entirely and will revisit it in a future PR.

Added explicit casts with v_transpose4x4()

This should resolve all open items on this PR

* Remove commented-out lines

Remove two extraneous comments

modules/core/include/opencv2/core/hal/intrin_neon.hpp
modules/core/test/test_intrin_utils.hpp
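
Scalar reference for the two widening-multiply primitives touched in intrin_neon.hpp below (illustrative only, not part of this patch; the function names are made up). v_mul_expand(a, b, c, d) writes the full-width products of the low and high halves of a*b into c and d, and v_mul_hi(a, b) keeps the high 16 bits of each 16x16-bit product, which is what the new test_mul_hi() checks.

```cpp
#include <cstdint>

// c/d <- widened products of the low/high halves of two 8-lane int16 vectors.
static void mul_expand_ref(const int16_t a[8], const int16_t b[8],
                           int32_t c[4], int32_t d[4])
{
    for (int i = 0; i < 4; i++)
    {
        c[i] = (int32_t)a[i]     * b[i];      // low-half lanes, widened
        d[i] = (int32_t)a[i + 4] * b[i + 4];  // high-half lanes, widened
    }
}

// High 16 bits of the 32-bit product, matching (dataA[i] * dataB[i]) >> 16 in the test.
static int16_t mul_hi_ref(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b) >> 16);
}
```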

index 785648575a60477987b19d9720c7a9b387be8c0d..e17972a3fc4a98a3630605db1af13e2fdc3a9210 100644 (file)
@@ -538,49 +538,81 @@ inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                          v_int16x8& c, v_int16x8& d)
 {
     c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_s8(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                          v_uint16x8& c, v_uint16x8& d)
 {
     c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u8(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                          v_int32x4& c, v_int32x4& d)
 {
     c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_s16(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                          v_uint32x4& c, v_uint32x4& d)
 {
     c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u16(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                          v_uint64x2& c, v_uint64x2& d)
 {
     c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u32(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
 {
     return v_int16x8(vcombine_s16(
                                   vshrn_n_s32(vmull_s16( vget_low_s16(a.val),  vget_low_s16(b.val)), 16),
-                                  vshrn_n_s32(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)), 16)
+                                  vshrn_n_s32(
+#if CV_NEON_AARCH64
+                                    vmull_high_s16(a.val, b.val)
+#else // #if CV_NEON_AARCH64
+                                    vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))
+#endif // #if CV_NEON_AARCH64
+                                    , 16)
                                  ));
 }
 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
 {
     return v_uint16x8(vcombine_u16(
                                    vshrn_n_u32(vmull_u16( vget_low_u16(a.val),  vget_low_u16(b.val)), 16),
-                                   vshrn_n_u32(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)), 16)
+                                   vshrn_n_u32(
+#if CV_NEON_AARCH64
+                                    vmull_high_u16(a.val, b.val)
+#else // #if CV_NEON_AARCH64
+                                    vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))
+#endif // #if CV_NEON_AARCH64
+                                    , 16)
                                   ));
 }
 
@@ -1254,29 +1286,56 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 
 inline unsigned v_reduce_sum(const v_uint8x16& a)
 {
+#if CV_NEON_AARCH64
+    uint16_t t0 = vaddlvq_u8(a.val);
+    return t0;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_reduce_sum(const v_int8x16& a)
 {
+#if CV_NEON_AARCH64
+    int16_t t0 = vaddlvq_s8(a.val);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
     int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
     return vget_lane_s32(vpadd_s32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sum(const v_uint16x8& a)
 {
+#if CV_NEON_AARCH64
+    uint32_t t0 = vaddlvq_u16(a.val);
+    return t0;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(a.val);
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_reduce_sum(const v_int16x8& a)
 {
+#if CV_NEON_AARCH64
+    int32_t t0 = vaddlvq_s16(a.val);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int32x4_t t0 = vpaddlq_s16(a.val);
     int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
     return vget_lane_s32(vpadd_s32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1285,12 +1344,20 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
     a0 = vp##vectorfunc##_##suffix(a0, a0); \
     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 }
+#endif // #if CV_NEON_AARCH64
 
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1298,18 +1365,27 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
     a0 = vp##vectorfunc##_##suffix(a0, a0); \
     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 }
+#endif // #if CV_NEON_AARCH64
 
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
     _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
 }
+#endif // #if CV_NEON_AARCH64
 
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
@@ -1322,9 +1398,21 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
 
 inline uint64 v_reduce_sum(const v_uint64x2& a)
-{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
+{
+#if CV_NEON_AARCH64
+    return vaddvq_u64(a.val);
+#else // #if CV_NEON_AARCH64
+    return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);
+#endif // #if CV_NEON_AARCH64
+}
 inline int64 v_reduce_sum(const v_int64x2& a)
-{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
+{
+#if CV_NEON_AARCH64
+    return vaddvq_s64(a.val);
+#else // #if CV_NEON_AARCH64
+    return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);
+#endif // #if CV_NEON_AARCH64
+}
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
@@ -1335,6 +1423,11 @@ inline double v_reduce_sum(const v_float64x2& a)
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
 {
+#if CV_NEON_AARCH64
+    float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
+    float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 d0+d1 c2+c3 d2+d3
+    return v_float32x4(vpaddq_f32(ab, cd));  // sumA sumB sumC sumD
+#else // #if CV_NEON_AARCH64
     float32x4x2_t ab = vtrnq_f32(a.val, b.val);
     float32x4x2_t cd = vtrnq_f32(c.val, d.val);
 
@@ -1345,49 +1438,91 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
 
     return v_float32x4(vaddq_f32(v0, v1));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
 {
+#if CV_NEON_AARCH64
+    uint8x16_t t0 = vabdq_u8(a.val, b.val);
+    uint16_t t1 = vaddlvq_u8(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
 {
+#if CV_NEON_AARCH64
+    uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
+    uint16_t t1 = vaddlvq_u8(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
 {
+#if CV_NEON_AARCH64
+    uint16x8_t t0 = vabdq_u16(a.val, b.val);
+    uint32_t t1 = vaddlvq_u16(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
 {
+#if CV_NEON_AARCH64
+    uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
+    uint32_t t1 = vaddlvq_u16(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
 {
+#if CV_NEON_AARCH64
+    uint32x4_t t0 = vabdq_u32(a.val, b.val);
+    uint32_t t1 = vaddvq_u32(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vabdq_u32(a.val, b.val);
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_NEON_AARCH64
+    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
+    uint32_t t1 = vaddvq_u32(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
 {
+#if CV_NEON_AARCH64
+    float32x4_t t0 = vabdq_f32(a.val, b.val);
+    return vaddvq_f32(t0);
+#else // #if CV_NEON_AARCH64
     float32x4_t t0 = vabdq_f32(a.val, b.val);
     float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
     return vget_lane_f32(vpadd_f32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 
 inline v_uint8x16 v_popcount(const v_uint8x16& a)
@@ -1409,30 +1544,54 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
 
 inline int v_signmask(const v_uint8x16& a)
 {
+#if CV_NEON_AARCH64
+    const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
+    const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
+    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
+    uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
+    uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
+    return t0;
+#else // #if CV_NEON_AARCH64
     int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
     uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
+#endif // #if CV_NEON_AARCH64
 }
+
 inline int v_signmask(const v_int8x16& a)
 { return v_signmask(v_reinterpret_as_u8(a)); }
 
 inline int v_signmask(const v_uint16x8& a)
 {
+#if CV_NEON_AARCH64
+    const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
+    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
+    uint32_t t0 = vaddlvq_u16(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
     uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_signmask(const v_int16x8& a)
 { return v_signmask(v_reinterpret_as_u16(a)); }
 
 inline int v_signmask(const v_uint32x4& a)
 {
+#if CV_NEON_AARCH64
+    const int32x4_t signPosition = {0,1,2,3};
+    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
+    uint32_t t0 = vaddvq_u32(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
     uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
     uint64x2_t v1 = vpaddlq_u32(v0);
     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_signmask(const v_int32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
@@ -1440,9 +1599,16 @@ inline int v_signmask(const v_float32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
 inline int v_signmask(const v_uint64x2& a)
 {
+#if CV_NEON_AARCH64
+    const int64x2_t signPosition = {0,1};
+    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
+    uint64_t t0 = vaddvq_u64(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int64x1_t m0 = vdup_n_s64(0);
     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
     return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_signmask(const v_int64x2& a)
 { return v_signmask(v_reinterpret_as_u64(a)); }
@@ -1464,19 +1630,31 @@ inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signma
 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
 #endif
 
-#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
-inline bool v_check_all(const v_##_Tpvec& a) \
-{ \
-    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
-    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
-    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
-} \
-inline bool v_check_any(const v_##_Tpvec& a) \
-{ \
-    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
-    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
-    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
-}
+#if CV_NEON_AARCH64
+    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
+    inline bool v_check_all(const v_##_Tpvec& a) \
+    { \
+        return (vminvq_##suffix(a.val) >> shift) != 0; \
+    } \
+    inline bool v_check_any(const v_##_Tpvec& a) \
+    { \
+        return (vmaxvq_##suffix(a.val) >> shift) != 0; \
+    }
+#else // #if CV_NEON_AARCH64
+    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
+    inline bool v_check_all(const v_##_Tpvec& a) \
+    { \
+        _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
+        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
+    } \
+    inline bool v_check_any(const v_##_Tpvec& a) \
+    { \
+        _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
+        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
+    }
+#endif // #if CV_NEON_AARCH64
 
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
@@ -1829,6 +2007,37 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
 }
 #endif
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    /* -- Pass 1: 64b transpose */ \
+    _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
+    _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
+    _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
+    _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
+    /* -- Pass 2: 32b transpose */ \
+    b0.val = vtrn1q_##suffix##32(t0, t1); \
+    b1.val = vtrn2q_##suffix##32(t0, t1); \
+    b2.val = vtrn1q_##suffix##32(t2, t3); \
+    b3.val = vtrn2q_##suffix##32(t2, t3); \
+}
+
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
+#else // #if CV_NEON_AARCH64
 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                          const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
@@ -1854,6 +2063,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
+#endif // #if CV_NEON_AARCH64
 
 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
index 269ebe0f2a402e54f525eed325fe0fbfb184739f..5c22caaf12e599d2c4fd712b0d9cbd4cbb610d1a 100644 (file)
@@ -577,6 +577,25 @@ template<typename R> struct TheTest
         return *this;
     }
 
+    TheTest & test_mul_hi()
+    {
+        // typedef typename V_RegTraits<R>::w_reg Rx2;
+        Data<R> dataA, dataB(32767);
+        R a = dataA, b = dataB;
+
+        R c = v_mul_hi(a, b);
+
+        Data<R> resC = c;
+        const int n = R::nlanes / 2;
+        for (int i = 0; i < n; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ((typename R::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]);
+        }
+
+        return *this;
+    }
+
     TheTest & test_abs()
     {
         typedef typename V_RegTraits<R>::u_reg Ru;
@@ -1663,6 +1682,7 @@ void test_hal_intrin_uint16()
         .test_arithm_wrap()
         .test_mul()
         .test_mul_expand()
+        .test_mul_hi()
         .test_cmp()
         .test_shift<1>()
         .test_shift<8>()
@@ -1697,6 +1717,7 @@ void test_hal_intrin_int16()
         .test_arithm_wrap()
         .test_mul()
         .test_mul_expand()
+        .test_mul_hi()
         .test_cmp()
         .test_shift<1>()
         .test_shift<8>()