set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
-list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
+list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
elseif(ARM OR AARCH64)
ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
+ ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp")
if(NOT AARCH64)
ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
if(NOT MSVC)
endif()
ocv_update(CPU_FP16_IMPLIES "NEON")
else()
- ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16")
+ ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD")
ocv_update(CPU_NEON_FLAGS_ON "")
ocv_update(CPU_FP16_IMPLIES "NEON")
+ ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
+ ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
endif()
elseif(MIPS)
#endif
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_DOTPROD
+# define CV_TRY_NEON_DOTPROD 1
+# define CV_CPU_FORCE_NEON_DOTPROD 1
+# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 1
+# define CV_CPU_CALL_NEON_DOTPROD(fn, args) return (cpu_baseline::fn args)
+# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) return (opt_NEON_DOTPROD::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_DOTPROD
+# define CV_TRY_NEON_DOTPROD 1
+# define CV_CPU_FORCE_NEON_DOTPROD 0
+# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD (cv::checkHardwareSupport(CV_CPU_NEON_DOTPROD))
+# define CV_CPU_CALL_NEON_DOTPROD(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
+# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
+#else
+# define CV_TRY_NEON_DOTPROD 0
+# define CV_CPU_FORCE_NEON_DOTPROD 0
+# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 0
+# define CV_CPU_CALL_NEON_DOTPROD(fn, args)
+# define CV_CPU_CALL_NEON_DOTPROD_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...) CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
# define CV_TRY_MSA 1
# define CV_CPU_FORCE_MSA 1
#define CV_NEON_AARCH64 0
#endif
-// TODO
-#define CV_NEON_DOT 0
//////////// Utils ////////////
}
// 8 >> 32
+#ifdef CV_NEON_DOT
+#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
+inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b) \
+{ \
+ return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
+} \
+inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
+{ \
+ return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32)
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4, v_int8x16, s32)
+#else
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
-#if CV_NEON_DOT
- return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
-#else
const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
return v_uint32x4(vaddq_u32(s0, s1));
-#endif
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
const v_uint32x4& c)
{
-#if CV_NEON_DOT
- return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
-#else
return v_dotprod_expand(a, b) + c;
-#endif
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
-#if CV_NEON_DOT
- return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
-#else
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
int16x8_t uzp1, uzp2;
int16x4_t uzpl1, uzpl2;
_v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
return v_int32x4(vaddl_s16(uzpl1, uzpl2));
-#endif
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
const v_int32x4& c)
{
-#if CV_NEON_DOT
- return v_int32x4(vdotq_s32(c.val, a.val, b.val));
-#else
return v_dotprod_expand(a, b) + c;
-#endif
}
-
+#endif
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
}
// 8 >> 32
+#ifdef CV_NEON_DOT
+#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
+inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b) \
+{ \
+ return v_dotprod_expand(a, b); \
+} \
+inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
+{ \
+ return v_dotprod_expand(a, b, c); \
+}
+
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32)
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4, v_int8x16, s32)
+#else
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
-#if CV_NEON_DOT
- return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
-#else
uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
return v_uint32x4(vaddq_u32(s0, s1));
-#endif
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
-#if CV_NEON_DOT
- return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
-#else
return v_dotprod_expand_fast(a, b) + c;
-#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
-#if CV_NEON_DOT
- return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
-#else
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
-#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
-#if CV_NEON_DOT
- return v_int32x4(vdotq_s32(c.val, a.val, b.val));
-#else
return v_dotprod_expand_fast(a, b) + c;
-#endif
}
+#endif
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)