Add v_expand for AArch64, fuse vmovl+vget_high into vmovl_high
authorJonathan Deakin <jonathan.deakin@arm.com>
Tue, 23 Mar 2021 15:06:41 +0000 (15:06 +0000)
committerJonathan Deakin <jonathan.deakin@arm.com>
Tue, 23 Mar 2021 15:06:41 +0000 (15:06 +0000)
modules/core/include/opencv2/core/hal/intrin_neon.hpp

index 06e70b0c303c54aba3e015e7c97d4c5919522882..785648575a60477987b19d9720c7a9b387be8c0d 100644 (file)
@@ -1539,6 +1539,26 @@ OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
 OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
 #endif
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
+    b1.val = vmovl_high_##suffix(a.val); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_high_##suffix(a.val)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
+}
+#else
 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
 { \
@@ -1557,6 +1577,7 @@ inline _Tpwvec v_load_expand(const _Tp* ptr) \
 { \
     return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
 }
+#endif
 
 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)