{
int x, c, width = img1.cols, cn = img1.channels();
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
- int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
+ int D = (int)alignSize(maxD - minD, VTraits<v_int16>::vlanes()), width1 = maxX1 - minX1;
//This minX1 & maxX2 correction defines which part of the calculatable line must be computed;
//it is required by the parallel algorithm
xrange_min = (xrange_min < 0) ? 0 : xrange_min;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
const int D = params.numDisparities;
int width1 = maxX1 - minX1;
- int Da = (int)alignSize(D, v_int16::nlanes);
- int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+    int Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
+    int Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values (MAX_COST) for d=-1 and d=D
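+    // Worked example (illustrative): with D = 60 and 8 int16 lanes,
+    // Da = alignSize(60, 8) = 64 and Dlra = 72; the extra lanes hold the
+    // sentinel MAX_COST entries for d = -1 and d = D.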
int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
int npasses = params.isFullDP() ? 2 : 1;
width = img1.cols;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
D = maxD - minD;
- Da = (int)alignSize(D, v_int16::nlanes);
- Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+ Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
+    Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values (MAX_COST) for d=-1 and d=D
width1 = maxX1 - minX1;
D = params.numDisparities;
- Da = (int)alignSize(D, v_int16::nlanes);
}
void operator()(const Range& range) const CV_OVERRIDE
INVALID_DISP = minD - 1;
INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
D = maxD - minD;
- Da = (int)alignSize(D, v_int16::nlanes);
- Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+ Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
+    Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values (MAX_COST) for d=-1 and d=D
width1 = maxX1 - minX1;
}
int width = disp1.cols, height = disp1.rows;
int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
int width1 = maxX1 - minX1;
- int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
- int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
+ int Da = (int)alignSize(params.numDisparities, VTraits<v_int16>::vlanes());
+    int Dlra = Da + VTraits<v_int16>::vlanes();//Additional memory is necessary to store disparity values (MAX_COST) for d=-1 and d=D
int INVALID_DISP = minD - 1;
int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
width = img1->cols; height = img1->rows;
minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
- Da = (int)alignSize(D, v_int16::nlanes);
+ Da = (int)alignSize(D, VTraits<v_int16>::vlanes());
SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
# undef CV_RVV
#endif
-#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP)
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_wasm.hpp"
-#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE)
#include "opencv2/core/hal/intrin_rvv.hpp"
-
+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE
+#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
#define CV_SIMD512_FP16 0
#endif
+#ifndef CV_SIMD_SCALABLE
+#define CV_SIMD_SCALABLE 0
+#endif
+
+#ifndef CV_SIMD_SCALABLE_64F
+#define CV_SIMD_SCALABLE_64F 0
+#endif
+
//==================================================================================================
template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
#endif
+#if CV_SIMD_SCALABLE
+ CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void);
+ CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
+ CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
+ CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
+ CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
+ CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
+ CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
+ CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void);
+ CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void);
+ CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32);
+#endif
//! @endcond
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
#define VXPREFIX(func) v##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
+
+#elif CV_SIMD_SCALABLE
+#define CV__SIMD_NAMESPACE simd
+namespace CV__SIMD_NAMESPACE {
+ #define CV_SIMD 0
+    #define CV_SIMD_WIDTH 128 /* 1024/8: assumed maximum vector width in bytes */
+
+ #define VXPREFIX(func) v##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+
#endif
namespace CV__SIMD_NAMESPACE {
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
+#if CV_SIMD
+ // Compatibility layer
+ #define CV_SIMD_SCALABLE 0
+ #define CV_SIMD_SCALABLE_64F 0
+
+ template <class T>
+ struct VTraits;
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+ template <>
+ struct VTraits<v_uint8>
+ {
+ static inline int vlanes() { return v_uint8::nlanes; }
+ enum { nlanes = 64, max_nlanes = nlanes };
+ using lane_type = uchar;
+ };
+ template <>
+ struct VTraits<v_int8>
+ {
+ static inline int vlanes() { return v_int8::nlanes; }
+ enum { nlanes = 64, max_nlanes = nlanes };
+ using lane_type = schar;
+ };
+ template <>
+ struct VTraits<v_uint16>
+ {
+ static inline int vlanes() { return v_uint16::nlanes; }
+ enum { nlanes = 32, max_nlanes = nlanes };
+ using lane_type = ushort;
+ };
+ template <>
+ struct VTraits<v_int16>
+ {
+ static inline int vlanes() { return v_int16::nlanes; }
+ enum { nlanes = 32, max_nlanes = nlanes };
+ using lane_type = short;
+ };
+ template <>
+ struct VTraits<v_uint32>
+ {
+ static inline int vlanes() { return v_uint32::nlanes; }
+ enum { nlanes = 16, max_nlanes = nlanes };
+ using lane_type = uint;
+ };
+ template <>
+ struct VTraits<v_int32>
+ {
+ static inline int vlanes() { return v_int32::nlanes; }
+ enum { nlanes = 16, max_nlanes = nlanes };
+ using lane_type = int;
+ };
+
+ template <>
+ struct VTraits<v_float32>
+ {
+ static inline int vlanes() { return v_float32::nlanes; }
+ enum { nlanes = 16, max_nlanes = nlanes };
+ using lane_type = float;
+ };
+ template <>
+ struct VTraits<v_uint64>
+ {
+ static inline int vlanes() { return v_uint64::nlanes; }
+ enum { nlanes = 8, max_nlanes = nlanes };
+ using lane_type = uint64;
+ };
+ template <>
+ struct VTraits<v_int64>
+ {
+ static inline int vlanes() { return v_int64::nlanes; }
+ enum { nlanes = 8, max_nlanes = nlanes };
+ using lane_type = int64;
+ };
+ #if CV_SIMD_64F
+ template <>
+ struct VTraits<v_float64>
+ {
+ static inline int vlanes() { return v_float64::nlanes; }
+ enum { nlanes = 8, max_nlanes = nlanes };
+ using lane_type = double;
+ };
+ #endif
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
+ template <>
+ struct VTraits<v_uint8>
+ {
+ static inline int vlanes() { return v_uint8::nlanes; }
+ enum { nlanes = 32, max_nlanes = nlanes };
+ using lane_type = uchar;
+ };
+ template <>
+ struct VTraits<v_int8>
+ {
+ static inline int vlanes() { return v_int8::nlanes; }
+ enum { nlanes = 32, max_nlanes = nlanes };
+ using lane_type = schar;
+ };
+ template <>
+ struct VTraits<v_uint16>
+ {
+ static inline int vlanes() { return v_uint16::nlanes; }
+ enum { nlanes = 16, max_nlanes = nlanes };
+ using lane_type = ushort;
+ };
+ template <>
+ struct VTraits<v_int16>
+ {
+ static inline int vlanes() { return v_int16::nlanes; }
+ enum { nlanes = 16, max_nlanes = nlanes };
+ using lane_type = short;
+ };
+ template <>
+ struct VTraits<v_uint32>
+ {
+ static inline int vlanes() { return v_uint32::nlanes; }
+ enum { nlanes = 8, max_nlanes = nlanes };
+ using lane_type = uint;
+ };
+ template <>
+ struct VTraits<v_int32>
+ {
+ static inline int vlanes() { return v_int32::nlanes; }
+ enum { nlanes = 8, max_nlanes = nlanes };
+ using lane_type = int;
+ };
+
+ template <>
+ struct VTraits<v_float32>
+ {
+ static inline int vlanes() { return v_float32::nlanes; }
+ enum { nlanes = 8, max_nlanes = nlanes };
+ using lane_type = float;
+ };
+ template <>
+ struct VTraits<v_uint64>
+ {
+ static inline int vlanes() { return v_uint64::nlanes; }
+ enum { nlanes = 4, max_nlanes = nlanes };
+ using lane_type = uint64;
+ };
+ template <>
+ struct VTraits<v_int64>
+ {
+ static inline int vlanes() { return v_int64::nlanes; }
+ enum { nlanes = 4, max_nlanes = nlanes };
+ using lane_type = int64;
+ };
+ #if CV_SIMD_64F
+ template <>
+ struct VTraits<v_float64>
+ {
+ static inline int vlanes() { return v_float64::nlanes; }
+ enum { nlanes = 4, max_nlanes = nlanes };
+ using lane_type = double;
+ };
+ #endif
+#elif CV_SIMD128 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
+ template <>
+ struct VTraits<v_uint8>
+ {
+ static inline int vlanes() { return v_uint8::nlanes; }
+ enum { nlanes = 16, max_nlanes = nlanes };
+ using lane_type = uchar;
+ };
+ template <>
+ struct VTraits<v_int8>
+ {
+ static inline int vlanes() { return v_int8::nlanes; }
+ enum { nlanes = 16, max_nlanes = nlanes };
+ using lane_type = schar;
+ };
+ template <>
+ struct VTraits<v_uint16>
+ {
+ static inline int vlanes() { return v_uint16::nlanes; }
+ enum { nlanes = 8, max_nlanes = nlanes };
+ using lane_type = ushort;
+ };
+ template <>
+ struct VTraits<v_int16>
+ {
+ static inline int vlanes() { return v_int16::nlanes; }
+ enum { nlanes = 8, max_nlanes = nlanes };
+ using lane_type = short;
+ };
+ template <>
+ struct VTraits<v_uint32>
+ {
+ static inline int vlanes() { return v_uint32::nlanes; }
+ enum { nlanes = 4, max_nlanes = nlanes };
+ using lane_type = uint;
+ };
+ template <>
+ struct VTraits<v_int32>
+ {
+ static inline int vlanes() { return v_int32::nlanes; }
+ enum { nlanes = 4, max_nlanes = nlanes };
+ using lane_type = int;
+ };
+
+ template <>
+ struct VTraits<v_float32>
+ {
+ static inline int vlanes() { return v_float32::nlanes; }
+ enum { nlanes = 4, max_nlanes = nlanes };
+ using lane_type = float;
+ };
+ template <>
+ struct VTraits<v_uint64>
+ {
+ static inline int vlanes() { return v_uint64::nlanes; }
+ enum { nlanes = 2, max_nlanes = nlanes };
+ using lane_type = uint64;
+ };
+ template <>
+ struct VTraits<v_int64>
+ {
+ static inline int vlanes() { return v_int64::nlanes; }
+ enum { nlanes = 2, max_nlanes = nlanes };
+ using lane_type = int64;
+ };
+ #if CV_SIMD_64F
+ template <>
+ struct VTraits<v_float64>
+ {
+ static inline int vlanes() { return v_float64::nlanes; }
+ enum { nlanes = 2, max_nlanes = nlanes };
+ using lane_type = double;
+ };
+ #endif
+#endif
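+    // Usage sketch (illustrative): algorithm code queries the lane count
+    // through VTraits, so the same source builds against both fixed-width and
+    // scalable backends:
+    //     for (int i = 0; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes())
+    //         v_store(dst + i, v_add(vx_load(src1 + i), vx_load(src2 + i)));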
+
+ #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
+ inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
+ { \
+ return a + b; \
+ } \
+ inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
+ { \
+ return a - b; \
+ } \
+ template<typename... Args> \
+ inline _Tpvec v_add(_Tpvec f1, _Tpvec f2, Args... vf) { \
+ return v_add(f1 + f2, vf...); \
+ }
+
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
+ #if CV_SIMD_64F
+ OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
+ #endif
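+
+    // The variadic overload folds left, e.g. (a sketch):
+    //     v_float32 s = v_add(v0, v1, v2, v3); // computes ((v0 + v1) + v2) + v3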
+
+ #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
+ inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
+ { \
+ return a & b; \
+ } \
+ inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
+ { \
+ return a | b; \
+ } \
+ inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
+ { \
+ return a ^ b; \
+ } \
+ inline _Tpvec v_not(const _Tpvec& a) \
+ { \
+ return ~a; \
+ }
+
+ OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
+ OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
+ OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
+ OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
+ OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
+ OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
+ OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
+ OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
+
+
+ #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
+ inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
+ { \
+ return a * b; \
+ } \
+ template<typename... Args> \
+ inline _Tpvec v_mul(_Tpvec f1, _Tpvec f2, Args... vf) { \
+ return v_mul(f1 * f2, vf...); \
+ }
+ OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
+ OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
+ OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
+ OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
+ OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
+ OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
+ OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
+ #if CV_SIMD_64F
+ OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
+ #endif
+
+
+    inline v_float32 v_div(const v_float32& a, const v_float32& b)
+    {
+        return a / b;
+    }
+    #if CV_SIMD_64F
+    inline v_float64 v_div(const v_float64& a, const v_float64& b)
+    {
+        return a / b;
+    }
+    #endif
+
+ #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
+ inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
+ { \
+ return a op b; \
+ }
+
+ #define OPENCV_HAL_WRAP_CMP(_Tpvec) \
+ OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
+ OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
+ OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
+ OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
+ OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
+ OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
+
+ OPENCV_HAL_WRAP_CMP(v_uint8)
+ OPENCV_HAL_WRAP_CMP(v_uint16)
+ OPENCV_HAL_WRAP_CMP(v_uint32)
+ // OPENCV_HAL_WRAP_CMP(v_uint64)
+ OPENCV_HAL_WRAP_CMP(v_int8)
+ OPENCV_HAL_WRAP_CMP(v_int16)
+ OPENCV_HAL_WRAP_CMP(v_int32)
+ // OPENCV_HAL_WRAP_CMP(v_int64)
+ OPENCV_HAL_WRAP_CMP(v_float32)
+ #if CV_SIMD_64F
+ OPENCV_HAL_WRAP_CMP(v_float64)
+ #endif
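+
+    // Comparison wrappers return per-lane masks (all bits set where true), e.g.
+    // a sketch of a per-lane minimum:
+    //     v_float32 m = v_lt(a, b);        // ~0 in lanes where a < b
+    //     v_float32 r = v_select(m, a, b); // picks a where the mask is set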
+
+ //////////// get0 ////////////
+ #define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
+ inline _Tp v_get0(v_##_Tpvec v) \
+ { \
+ return v.get0(); \
+ }
+
+ OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
+ OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
+ OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
+ OPENCV_HAL_WRAP_GRT0_INT(int16, short)
+ OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
+ OPENCV_HAL_WRAP_GRT0_INT(int32, int)
+ OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
+ OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
+ OPENCV_HAL_WRAP_GRT0_INT(float32, float)
+ #if CV_SIMD_64F
+ OPENCV_HAL_WRAP_GRT0_INT(float64, double)
+ #endif
+
+ #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
+ inline _Tp v_extract_highest(_Tpvec v) \
+ { \
+ return v_extract_n<vl-1>(v); \
+ }
+
+ OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
+ OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
+ OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
+ OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
+ OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
+ OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
+ OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
+ OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
+ OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
+ #if CV_SIMD_64F
+ OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
+ #endif
+
+ #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
+ inline _Tpvec v_broadcast_highest(_Tpvec v) \
+ { \
+ return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
+ }
+
+ OPENCV_HAL_WRAP_BROADCAST(v_uint32)
+ OPENCV_HAL_WRAP_BROADCAST(v_int32)
+ OPENCV_HAL_WRAP_BROADCAST(v_float32)
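+
+    // e.g. (assuming 4 int32 lanes): v_extract_highest({1,2,3,4}) == 4 and
+    // v_broadcast_highest({1,2,3,4}) == {4,4,4,4}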
+
+
+#endif //CV_SIMD
//! @cond IGNORED
--- /dev/null
+
+#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
+#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
+
+#include <initializer_list>
+#include <assert.h>
+#include <vector>
+
+#ifndef CV_RVV_MAX_VLEN
+#define CV_RVV_MAX_VLEN 1024
+#endif
+
+namespace cv
+{
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD_SCALABLE 1
+#define CV_SIMD_SCALABLE_64F 1
+
+using v_uint8 = vuint8m1_t;
+using v_int8 = vint8m1_t;
+using v_uint16 = vuint16m1_t;
+using v_int16 = vint16m1_t;
+using v_uint32 = vuint32m1_t;
+using v_int32 = vint32m1_t;
+using v_uint64 = vuint64m1_t;
+using v_int64 = vint64m1_t;
+
+using v_float32 = vfloat32m1_t;
+#if CV_SIMD_SCALABLE_64F
+using v_float64 = vfloat64m1_t;
+#endif
+
+using uchar = unsigned char;
+using schar = signed char;
+using ushort = unsigned short;
+using uint = unsigned int;
+using uint64 = unsigned long int;
+using int64 = long int;
+
+static const int __cv_rvv_e8_nlanes = vsetvlmax_e8m1();
+static const int __cv_rvv_e16_nlanes = vsetvlmax_e16m1();
+static const int __cv_rvv_e32_nlanes = vsetvlmax_e32m1();
+static const int __cv_rvv_e64_nlanes = vsetvlmax_e64m1();
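+// Illustrative values: on a VLEN=128 RVV implementation these evaluate to 16,
+// 8, 4 and 2 lanes; they are fixed for the life of the process but only known
+// at run time, hence the run-time vlanes() below instead of a compile-time nlanes.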
+
+template <class T>
+struct VTraits;
+
+template <>
+struct VTraits<v_uint8>
+{
+ static inline int vlanes() { return __cv_rvv_e8_nlanes; }
+ using lane_type = uchar;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/8;
+};
+
+template <>
+struct VTraits<v_int8>
+{
+ static inline int vlanes() { return __cv_rvv_e8_nlanes; }
+ using lane_type = schar;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/8;
+};
+template <>
+struct VTraits<v_uint16>
+{
+ static inline int vlanes() { return __cv_rvv_e16_nlanes; }
+ using lane_type = ushort;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/16;
+};
+template <>
+struct VTraits<v_int16>
+{
+ static inline int vlanes() { return __cv_rvv_e16_nlanes; }
+ using lane_type = short;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/16;
+};
+template <>
+struct VTraits<v_uint32>
+{
+ static inline int vlanes() { return __cv_rvv_e32_nlanes; }
+ using lane_type = uint;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/32;
+};
+template <>
+struct VTraits<v_int32>
+{
+ static inline int vlanes() { return __cv_rvv_e32_nlanes; }
+ using lane_type = int;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/32;
+};
+
+template <>
+struct VTraits<v_float32>
+{
+ static inline int vlanes() { return __cv_rvv_e32_nlanes; }
+ using lane_type = float;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/32;
+};
+template <>
+struct VTraits<v_uint64>
+{
+ static inline int vlanes() { return __cv_rvv_e64_nlanes; }
+ using lane_type = uint64;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/64;
+};
+template <>
+struct VTraits<v_int64>
+{
+ static inline int vlanes() { return __cv_rvv_e64_nlanes; }
+ using lane_type = int64;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/64;
+};
+#if CV_SIMD_SCALABLE_64F
+template <>
+struct VTraits<v_float64>
+{
+ static inline int vlanes() { return __cv_rvv_e64_nlanes; }
+ using lane_type = double;
+ static const int max_nlanes = CV_RVV_MAX_VLEN/64;
+};
+#endif
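+
+// Usage sketch (illustrative): iterate by the run-time vlanes() and size stack
+// buffers with the compile-time upper bound max_nlanes:
+//     VTraits<v_int16>::lane_type buf[VTraits<v_int16>::max_nlanes];
+//     for (int i = 0; i < VTraits<v_int16>::vlanes(); ++i)
+//         buf[i] = (short)i;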
+
+//////////// get0 ////////////
+#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
+inline _Tp v_get0(v_##_Tpvec v) \
+{ \
+ return vmv_x(v); \
+}
+
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
+
+inline float v_get0(v_float32 v)
+{
+    return vfmv_f(v);
+}
+#if CV_SIMD_SCALABLE_64F
+inline double v_get0(v_float64 v)
+{
+    return vfmv_f(v);
+}
+#endif
+
+//////////// Initial ////////////
+
+#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
+inline v_##_Tpvec v_setzero_##suffix1() \
+{ \
+ return vmv_v_x_##suffix2##m1(0, vl); \
+} \
+inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
+{ \
+ return vmv_v_x_##suffix2##m1(v, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())
+
+#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
+inline v_##_Tpv v_setzero_##suffix() \
+{ \
+ return vfmv_v_f_##suffix##m1(0, vl); \
+} \
+inline v_##_Tpv v_setall_##suffix(_Tp v) \
+{ \
+ return vfmv_v_f_##suffix##m1(v, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
+#endif
+
+//////////// Reinterpret ////////////
+#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
+{ \
+ return v;\
+}
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
+#endif
+// TODO: can be simplified by using overloaded RVV intrinsics
+#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+ return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+ return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
+}
+
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64)
+#endif
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64)
+
+
+#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+ return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v));\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+ return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v));\
+}
+
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
+// Three times reinterpret
+inline v_float32 v_reinterpret_as_f32(const v_float64& v)
+{
+    return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));
+}
+
+inline v_float64 v_reinterpret_as_f64(const v_float32& v)
+{
+    return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));
+}
+#endif
+
+
+////////////// Load/Store //////////////
+#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ \
+ return vle##width##_v_##suffix##m1(ptr, vl); \
+} \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ \
+ return vle##width##_v_##suffix##m1(ptr, vl); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ \
+ vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+ return vle##width##_v_##suffix##m1(ptr, hvl); \
+} \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+ return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ \
+ vse##width(ptr, a, vl); \
+} \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ \
+ vse##width(ptr, a, vl); \
+} \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ \
+ vse##width(ptr, a, vl); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+ vse##width(ptr, a, hvl); \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+ vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
+} \
+inline _Tpvec v_load(std::initializer_list<_Tp> nScalars) \
+{ \
+    assert(nScalars.size() == (size_t)vl); \
+ return vle##width##_v_##suffix##m1(nScalars.begin(), nScalars.size()); \
+} \
+template<typename... Targs> \
+_Tpvec v_load_##suffix(Targs... nScalars) \
+{ \
+ return v_load({nScalars...}); \
+}
+
+
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8, vmv_v_x_u8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8, vmv_v_x_i8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits<v_uint16>::vlanes() / 2, VTraits<v_uint16>::vlanes(), 16, u16, vmv_v_x_u16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits<v_int16>::vlanes() / 2, VTraits<v_int16>::vlanes(), 16, i16, vmv_v_x_i16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_uint32>::vlanes() / 2, VTraits<v_uint32>::vlanes(), 32, u32, vmv_v_x_u32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32, vmv_v_x_i32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64, vmv_v_x_u64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64, vmv_v_x_i64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() / 2, VTraits<v_float32>::vlanes(), 32, f32, vfmv_v_f_f32m1)
+
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64, vfmv_v_f_f64m1)
+#endif
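+
+// The initializer_list overload above is mainly a testing convenience, e.g.
+// (a sketch; exactly VTraits<v_int32>::vlanes() values must be supplied):
+//     v_int32 v = v_load({1, 2, 3, 4});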
+
+////////////// Lookup table access ////////////////////
+#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
+{ \
+ vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+ return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+} \
+inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
+{ \
+ std::vector<uint> idx_; \
+    for (size_t i = 0; i < (size_t)VTraits<_Tpvec>::vlanes() / 2; ++i) { \
+ idx_.push_back(idx[i]); \
+ idx_.push_back(idx[i]+1); \
+ } \
+ vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+ return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+} \
+inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
+{ \
+ std::vector<uint> idx_; \
+    for (size_t i = 0; i < (size_t)VTraits<_Tpvec>::vlanes() / 4; ++i) { \
+ idx_.push_back(idx[i]); \
+ idx_.push_back(idx[i]+1); \
+ idx_.push_back(idx[i]+2); \
+ idx_.push_back(idx[i]+3); \
+ } \
+ vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+ return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
+OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
+OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
+OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
+OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
+
+inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
+inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
+inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
+inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
+inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
+inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
+inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
+inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
+inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
+inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
+inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); }
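+
+// Gather semantics, e.g. (a sketch assuming 4 float32 lanes):
+//     int idx[] = {0, 4, 8, 12};
+//     v_float32 g = v_lut(tab, idx);       // {tab[0], tab[4], tab[8], tab[12]}
+//     v_float32 p = v_lut_pairs(tab, idx); // {tab[0], tab[1], tab[4], tab[5]}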
+
+
+////////////// Min/Max //////////////
+
+#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+ return intrin(a, b, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_min, vminu, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_max, vmaxu, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_min, vmin, VTraits<v_int64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_max, vmax, VTraits<v_int64>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
+#endif
+
+
+//////////// Value reordering ////////////
+
+#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+ _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+ b0 = vget_##suffix##m1(temp, 0); \
+ b1 = vget_##suffix##m1(temp, 1); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+ _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+ return vget_##suffix##m1(temp, 0); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+ _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+ return vget_##suffix##m1(temp, 1); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+ return cvt(vle##width##_v_##suffix2##mf2(ptr, vsetvlmax_e##width##m1()), vsetvlmax_e##width##m1()); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x)
+
+inline v_uint32 v_load_expand_q(const uchar* ptr)
+{
+ return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
+}
+
+inline v_int32 v_load_expand_q(const schar* ptr)
+{
+ return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
+}
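+
+// e.g. (a sketch, assuming 16 uint8 lanes): v_expand widens {1..16} into
+// b0 = {1..8} and b1 = {9..16} as v_uint16; v_load_expand_q reads
+// VTraits<v_uint32>::vlanes() uchars and widens them twice, to 32 bits.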
+
+
+////// FP16 support ///////
+
+inline v_float32 v_load_expand(const float16_t* ptr)
+{
+    CV_UNUSED(ptr); // TODO: FP16 expansion is not implemented yet
+ return vundefined_f32m1();
+}
+
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+} //namespace cv
+
+#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
\ No newline at end of file
#endif // SIMD512
+#if CV_SIMD_SCALABLE
+template<typename _T> struct Type2Vec_Traits;
+#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \
+ template<> struct Type2Vec_Traits<type_> \
+ { \
+ typedef vec_type_ vec_type; \
+ }
+
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64);
+#if CV_SIMD_SCALABLE_64F
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64);
+#endif
+template<typename _T> static inline
+typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a);
+
+template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); }
+template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); }
+template<> inline Type2Vec_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
+template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); }
+template<> inline Type2Vec_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); }
+template<> inline Type2Vec_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); }
+template<> inline Type2Vec_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
+template<> inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); }
+template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); }
+#if CV_SIMD_SCALABLE_64F
+template<> inline Type2Vec_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
+#endif
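+
+// e.g. v_setall(1.5f) deduces v_float32, and v_setall<short>(-1) yields a
+// v_int16 with every lane set to -1.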
+#endif
-#if CV_SIMD_WIDTH == 16
+
+#if CV_SIMD_SCALABLE
+template<typename _T> static inline
+typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
+#elif CV_SIMD_WIDTH == 16
template<typename _T> static inline
typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
#elif CV_SIMD_WIDTH == 32
template<>
struct op_absdiff<schar, v_int8>
{
+#if CV_SIMD
static inline v_int8 r(const v_int8& a, const v_int8& b)
{ return v_absdiffs(a, b); }
+#endif
static inline schar r(schar a, schar b)
{ return c_absdiff(a, b); }
};
template<>
struct op_absdiff<short, v_int16>
{
+#if CV_SIMD
static inline v_int16 r(const v_int16& a, const v_int16& b)
{ return v_absdiffs(a, b); }
+#endif
static inline short r(short a, short b)
{ return c_absdiff(a, b); }
};
template<>
struct op_absdiff<int, v_int32>
{
+#if CV_SIMD
static inline v_int32 r(const v_int32& a, const v_int32& b)
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
+#endif
static inline int r(int a, int b)
{ return c_absdiff(a, b); }
};
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
+#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar * a * b;
}
+#endif
static inline T1 r(T1 a, T1 b, const T2* scalar)
{ return c_mul(a, b, *scalar); }
static inline Tvec pre(const Tvec&, const Tvec& res)
template<typename T1, typename T2, typename Tvec>
struct op_div_scale
{
+#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
}
+#endif
static inline T1 r(T1 a, T1 denom, const T2* scalar)
{
CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
template<>
struct op_div_scale<float, float, v_float32>
{
+#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return a * v_scalar / b;
}
+#endif
static inline float r(float a, float denom, const float* scalar)
{ return c_div(a, denom, *scalar); }
};
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
+#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_alpha = vx_setall_f32(*scalar);
return v_fma(a, v_alpha, b);
}
+#endif
static inline T1 r(T1 a, T1 b, const T2* scalar)
{ return c_add(a, b, *scalar); }
static inline Tvec pre(const Tvec&, const Tvec& res)
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
+#if CV_SIMD
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
{
const v_float32 v_alpha = vx_setall_f32(scalars[0]);
const v_float32 v_gamma = vx_setall_f32(scalars[2]);
return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
}
+#endif
static inline T1 r(T1 a, T1 b, const T2* scalars)
{ return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
static inline Tvec pre(const Tvec&, const Tvec& res)
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
+#if CV_SIMD
static inline v_float32 r(const v_float32& a, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
}
+#endif
static inline T1 r(T1 denom, const T2* scalar)
{
CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
template<>
struct op_recip<float, float, v_float32>
{
+#if CV_SIMD
static inline v_float32 r(const v_float32& a, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar / a;
}
+#endif
static inline float r(float denom, const float* scalar)
{ return c_div(*scalar, denom); }
};
#include "opencv2/core/hal/intrin.hpp"
#undef CV__SIMD_FORCE_WIDTH
-#if CV_SIMD_WIDTH != 16
+#if CV_SIMD && CV_SIMD_WIDTH != 16
#error "Invalid build configuration"
#endif
template <typename R> struct Data;
template <int N> struct initializer;
+#if CV_SIMD
template <> struct initializer<64>
{
template <typename R> static R init(const Data<R> & d)
}
};
+#else
+
+template <> struct initializer<128>
+{
+ template <typename R> static R init(const Data<R> & d)
+ {
+ return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+ d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
+ d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
+ d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63],
+ d[64], d[65], d[66], d[67], d[68], d[69], d[70], d[71], d[72], d[73], d[74], d[75], d[76], d[77], d[78], d[79],
+ d[80], d[81], d[82], d[83], d[84], d[85], d[86], d[87], d[88], d[89], d[90], d[91], d[92], d[93], d[94], d[95],
+ d[96], d[97], d[98], d[99], d[100], d[101], d[102], d[103], d[104], d[105], d[106], d[107], d[108], d[109], d[110], d[111],
+ d[112], d[113], d[114], d[115], d[116], d[117], d[118], d[119], d[120], d[121], d[122], d[123], d[124], d[125], d[126], d[127]});
+ }
+};
+
+template <> struct initializer<64>
+{
+ template <typename R> static R init(const Data<R> & d)
+ {
+ return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+ d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
+ d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
+ d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]});
+ }
+};
+
+template <> struct initializer<32>
+{
+ template <typename R> static R init(const Data<R> & d)
+ {
+ return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
+ d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]});
+ }
+};
+
+template <> struct initializer<16>
+{
+ template <typename R> static R init(const Data<R> & d)
+ {
+ return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]});
+ }
+};
+
+template <> struct initializer<8>
+{
+ template <typename R> static R init(const Data<R> & d)
+ {
+ return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]});
+ }
+};
+
+template <> struct initializer<4>
+{
+ template <typename R> static R init(const Data<R> & d)
+ {
+ return v_load({d[0], d[1], d[2], d[3]});
+ }
+};
+
+template <> struct initializer<2>
+{
+ template <typename R> static R init(const Data<R> & d)
+ {
+ return v_load({d[0], d[1]});
+ }
+};
+#endif
//==================================================================================================
template <typename R> struct Data
{
- typedef typename R::lane_type LaneType;
+ typedef typename VTraits<R>::lane_type LaneType;
typedef typename V_TypeTraits<LaneType>::int_type int_type;
Data()
{
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
d[i] = (LaneType)(i + 1);
}
Data(LaneType val)
}
operator R ()
{
- return initializer<R::nlanes>().init(*this);
+ return initializer<VTraits<R>::max_nlanes>().init(*this);
}
Data<R> & operator=(const R & r)
{
}
template <typename T> Data<R> & operator*=(T m)
{
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
d[i] *= (LaneType)m;
return *this;
}
template <typename T> Data<R> & operator+=(T m)
{
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
d[i] += (LaneType)m;
return *this;
}
- void fill(LaneType val, int s, int c = R::nlanes)
+ void fill(LaneType val, int s, int c = VTraits<R>::vlanes())
{
for (int i = s; i < c; ++i)
d[i] = val;
}
void reverse()
{
- for (int i = 0; i < R::nlanes / 2; ++i)
- std::swap(d[i], d[R::nlanes - i - 1]);
+ for (int i = 0; i < VTraits<R>::vlanes() / 2; ++i)
+ std::swap(d[i], d[VTraits<R>::vlanes() - i - 1]);
}
const LaneType & operator[](int i) const
{
#if 0 // TODO: strange bug - AVX2 tests are failed with this
- CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
+ CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits<R>::vlanes(), "");
#else
- CV_Assert(i >= 0 && i < R::nlanes);
+ CV_Assert(i >= 0 && i < VTraits<R>::max_nlanes);
#endif
return d[i];
}
LaneType & operator[](int i)
{
- CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
+ CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits<R>::max_nlanes, "");
return d[i];
}
int_type as_int(int i) const
{
- CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
+ CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits<R>::max_nlanes, "");
union
{
LaneType l;
}
const LaneType * mid() const
{
- return d + R::nlanes / 2;
+ return d + VTraits<R>::vlanes() / 2;
}
LaneType * mid()
{
- return d + R::nlanes / 2;
+ return d + VTraits<R>::vlanes() / 2;
}
LaneType sum(int s, int c)
{
}
LaneType sum()
{
- return sum(0, R::nlanes);
+ return sum(0, VTraits<R>::vlanes());
}
bool operator==(const Data<R> & other) const
{
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
if (d[i] != other.d[i])
return false;
return true;
}
bool isValue(uchar val) const
{
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
if (d[i] != val)
return false;
return true;
}
- LaneType d[R::nlanes];
+ LaneType d[VTraits<R>::max_nlanes];
};
template<typename R> struct AlignedData
{
- Data<R> CV_DECL_ALIGNED(CV_SIMD_WIDTH) a; // aligned
+    Data<R> CV_DECL_ALIGNED(sizeof(typename VTraits<R>::lane_type) * VTraits<R>::max_nlanes) a; // aligned
char dummy;
Data<R> u; // unaligned
};
template <typename R> std::ostream & operator<<(std::ostream & out, const Data<R> & d)
{
out << "{ ";
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
- // out << std::hex << +V_TypeTraits<typename R::lane_type>::reinterpret_int(d.d[i]);
+ // out << std::hex << +V_TypeTraits<typename VTraits<R>::lane_type>::reinterpret_int(d.d[i]);
out << +d.d[i];
- if (i + 1 < R::nlanes)
+ if (i + 1 < VTraits<R>::vlanes())
out << ", ";
}
out << " }";
template<typename R> struct TheTest
{
- typedef typename R::lane_type LaneType;
+ typedef typename VTraits<R>::lane_type LaneType;
template <typename T1, typename T2>
static inline void EXPECT_COMPARE_EQ(const T1 a, const T2 b)
AlignedData<R> out;
// check if addresses are aligned and unaligned respectively
- EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH);
- EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH);
- EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH);
- EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH);
+ EXPECT_EQ((size_t)0, (size_t)&data.a.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
+ EXPECT_NE((size_t)0, (size_t)&data.u.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
+ EXPECT_EQ((size_t)0, (size_t)&out.a.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
+ EXPECT_NE((size_t)0, (size_t)&out.u.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
// check some initialization methods
R r1 = data.a;
R r2 = vx_load(data.u.d);
R r3 = vx_load_aligned(data.a.d);
R r4(r2);
- EXPECT_EQ(data.a[0], r1.get0());
- EXPECT_EQ(data.u[0], r2.get0());
- EXPECT_EQ(data.a[0], r3.get0());
- EXPECT_EQ(data.u[0], r4.get0());
+ EXPECT_EQ(data.a[0], v_get0(r1));
+ EXPECT_EQ(data.u[0], v_get0(r2));
+ EXPECT_EQ(data.a[0], v_get0(r3));
+ EXPECT_EQ(data.u[0], v_get0(r4));
R r_low = vx_load_low((LaneType*)data.u.d);
- EXPECT_EQ(data.u[0], r_low.get0());
+ EXPECT_EQ(data.u[0], v_get0(r_low));
v_store(out.u.d, r_low);
- for (int i = 0; i < R::nlanes/2; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes()/2; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
}
- R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (CV_SIMD_WIDTH / 2)));
- EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0());
+ R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes() / 2)));
+ EXPECT_EQ(data.u[VTraits<R>::vlanes()/2], v_get0(r_low_align8byte));
v_store(out.u.d, r_low_align8byte);
- for (int i = 0; i < R::nlanes/2; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes()/2; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]);
+ EXPECT_EQ((LaneType)data.u[i + VTraits<R>::vlanes()/2], (LaneType)out.u[i]);
}
// check some store methods
Data<R> resZ, resV;
resZ.fill((LaneType)0);
resV.fill((LaneType)8);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ((LaneType)0, resZ[i]);
#elif CV_SIMD_WIDTH == 64
R setall_res1 = v512_setall((LaneType)5);
R setall_res2 = v512_setall<LaneType>(6);
+#elif CV_SIMD_SCALABLE
+ R setall_res1 = v_setall((LaneType)5);
+ R setall_res2 = v_setall<LaneType>(6);
#else
#error "Configuration error"
#endif
#if CV_SIMD_WIDTH > 0
Data<R> setall_res1_; v_store(setall_res1_.d, setall_res1);
Data<R> setall_res2_; v_store(setall_res2_.d, setall_res2);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ((LaneType)5, setall_res1_[i]);
R vx_setall_res2 = vx_setall<LaneType>(12);
Data<R> vx_setall_res1_; v_store(vx_setall_res1_.d, vx_setall_res1);
Data<R> vx_setall_res2_; v_store(vx_setall_res2_.d, vx_setall_res2);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ((LaneType)11, vx_setall_res1_[i]);
uint64 a = CV_BIG_INT(0x7fffffffffffffff);
uint64 b = (uint64)CV_BIG_INT(0xcfffffffffffffff);
v_uint64x2 uint64_vec(a, b);
- EXPECT_EQ(a, uint64_vec.get0());
+ EXPECT_EQ(a, v_get0(uint64_vec));
EXPECT_EQ(b, v_extract_n<1>(uint64_vec));
}
{
int64 a = CV_BIG_INT(0x7fffffffffffffff);
int64 b = CV_BIG_INT(-1);
v_int64x2 int64_vec(a, b);
- EXPECT_EQ(a, int64_vec.get0());
+ EXPECT_EQ(a, v_get0(int64_vec));
EXPECT_EQ(b, v_extract_n<1>(int64_vec));
}
#endif
R a = data1, b = data2, c = data3;
R d = data1, e = data2, f = data3, g = data4;
- LaneType buf3[R::nlanes * 3];
- LaneType buf4[R::nlanes * 4];
+ LaneType buf3[VTraits<R>::max_nlanes * 3];
+ LaneType buf4[VTraits<R>::max_nlanes * 4];
v_store_interleave(buf3, a, b, c);
v_store_interleave(buf4, d, e, f, g);
v_load_deinterleave(buf3, a, b, c);
v_load_deinterleave(buf4, d, e, f, g);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(data1, Data<R>(a));
R a = data1, b = data2;
- LaneType buf2[R::nlanes * 2];
+ LaneType buf2[VTraits<R>::max_nlanes * 2];
v_store_interleave(buf2, a, b);
v_load_deinterleave(buf2, a, b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(data1, Data<R>(a));
f = v_expand_high(a);
Data<Rx2> resC = c, resD = d, resE = e, resF = f;
- const int n = Rx2::nlanes;
+ const int n = VTraits<Rx2>::vlanes();
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
typedef typename V_RegTraits<R>::q_reg Rx4;
Data<R> data;
Data<Rx4> out = vx_load_expand_q(data.d);
- const int n = Rx4::nlanes;
+ const int n = VTraits<Rx4>::vlanes();
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
dataB.reverse();
R a = dataA, b = dataB;
- Data<R> resC = a + b, resD = a - b;
- for (int i = 0; i < R::nlanes; ++i)
+ Data<R> resC = v_add(a, b), resD = v_sub(a, b);
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
Data<R> resC = v_add_wrap(a, b),
resD = v_sub_wrap(a, b),
resE = v_mul_wrap(a, b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
dataB.reverse();
R a = dataA, b = dataB;
- Data<R> resC = a * b;
- for (int i = 0; i < R::nlanes; ++i)
+ Data<R> resC = v_mul(a, b);
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i]), resC[i]);
dataB.reverse();
R a = dataA, b = dataB;
- Data<R> resC = a / b;
- for (int i = 0; i < R::nlanes; ++i)
+ Data<R> resC = v_div(a, b);
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[i] / dataB[i], resC[i]);
v_mul_expand(a, b, c, d);
Data<Rx2> resC = c, resD = d;
- const int n = R::nlanes / 2;
+ const int n = VTraits<R>::vlanes() / 2;
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]);
- EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
+ EXPECT_EQ((typename VTraits<Rx2>::lane_type)dataA[i] * dataB[i], resC[i]);
+ EXPECT_EQ((typename VTraits<Rx2>::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
}
return *this;
R c = v_mul_hi(a, b);
Data<R> resC = c;
- const int n = R::nlanes / 2;
+ const int n = VTraits<R>::vlanes() / 2;
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- EXPECT_EQ((typename R::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]);
+ EXPECT_EQ((typename VTraits<R>::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]);
}
return *this;
TheTest & test_abs()
{
typedef typename V_RegTraits<R>::u_reg Ru;
- typedef typename Ru::lane_type u_type;
+ typedef typename VTraits<Ru>::lane_type u_type;
+ typedef typename VTraits<R>::lane_type R_type;
Data<R> dataA, dataB(10);
R a = dataA, b = dataB;
- a = a - b;
+ a = v_sub(a, b);
Data<Ru> resC = v_abs(a);
- for (int i = 0; i < Ru::nlanes; ++i)
+ for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- EXPECT_EQ((u_type)std::abs(dataA[i] - dataB[i]), resC[i]);
+ R_type ssub = dataA[i] - dataB[i] < std::numeric_limits<R_type>::min() ? std::numeric_limits<R_type>::min() : dataA[i] - dataB[i];
+ EXPECT_EQ((u_type)std::abs(ssub), resC[i]);
}
return *this;
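The added clamp makes the scalar reference match the saturating semantics of v_sub on signed lanes. A worked int8 example of the case the old reference got wrong:

    schar a = -120, b = 20;
    int wrapped = a - b;                                         // -140, outside schar range
    schar ssub = wrapped < -128 ? (schar)-128 : (schar)wrapped;  // saturate like v_sub
    int expected = std::abs((int)ssub);                          // 128, matches v_abs(v_sub(a, b))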
dataA[0] = static_cast<LaneType>(std::numeric_limits<LaneType>::max());
R a = dataA;
- Data<R> resB = a << s, resC = v_shl<s>(a), resD = a >> s, resE = v_shr<s>(a);
+ Data<R> resB = v_shl<s>(a), resC = v_shl<s>(a), resD = v_shr<s>(a), resE = v_shr<s>(a);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(static_cast<LaneType>(dataA[i] << s), resB[i]);
dataB += 1;
R a = dataA, b = dataB;
- Data<R> resC = (a == b);
- Data<R> resD = (a != b);
- Data<R> resE = (a > b);
- Data<R> resF = (a >= b);
- Data<R> resG = (a < b);
- Data<R> resH = (a <= b);
+ Data<R> resC = (v_eq(a, b));
+ Data<R> resD = (v_ne(a, b));
+ Data<R> resE = (v_gt(a, b));
+ Data<R> resF = (v_ge(a, b));
+ Data<R> resG = (v_lt(a, b));
+ Data<R> resH = (v_le(a, b));
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
TheTest & test_dotprod()
{
typedef typename V_RegTraits<R>::w_reg Rx2;
- typedef typename Rx2::lane_type w_type;
+ typedef typename VTraits<Rx2>::lane_type w_type;
Data<R> dataA, dataB;
- dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
- dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
+ dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
+ dataB += std::numeric_limits<LaneType>::min() + VTraits<R>::vlanes();
R a = dataA, b = dataB;
Data<Rx2> dataC;
dataC += std::numeric_limits<w_type>::is_signed ?
std::numeric_limits<w_type>::min() :
- std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1);
+ std::numeric_limits<w_type>::max() - VTraits<R>::vlanes() * (dataB[0] + 1);
Rx2 c = dataC;
Data<Rx2> resD = v_dotprod(a, b),
resE = v_dotprod(a, b, c);
- const int n = R::nlanes / 2;
+ const int n = VTraits<R>::vlanes() / 2;
w_type sumAB = 0, sumABC = 0, tmp_sum;
for (int i = 0; i < n; ++i)
{
TheTest & test_dotprod_expand()
{
typedef typename V_RegTraits<R>::q_reg Rx4;
- typedef typename Rx4::lane_type l4_type;
+ typedef typename VTraits<Rx4>::lane_type l4_type;
Data<R> dataA, dataB;
- dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
- dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
+ dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
+ dataB += std::numeric_limits<LaneType>::min() + VTraits<R>::vlanes();
R a = dataA, b = dataB;
Data<Rx4> dataC;
resE = v_dotprod_expand(a, b, c);
l4_type sumAB = 0, sumABC = 0, tmp_sum;
- for (int i = 0; i < Rx4::nlanes; ++i)
+ for (int i = 0; i < VTraits<Rx4>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
tmp_sum = (l4_type)dataA[i*4] * (l4_type)dataB[i*4] +
{
#if CV_SIMD_64F
Data<R> dataA, dataB;
- dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
+ dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
dataB += std::numeric_limits<LaneType>::min();
R a = dataA, b = dataB;
resB = v_dotprod_expand(b, b),
resC = v_dotprod_expand(a, b, c);
- const int n = R::nlanes / 2;
+ const int n = VTraits<R>::vlanes() / 2;
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
- Data<R> resC = a & b, resD = a | b, resE = a ^ b, resF = ~a;
- for (int i = 0; i < R::nlanes; ++i)
+ Data<R> resC = v_and(a, b), resD = v_or(a, b), resE = v_xor(a, b), resF = v_not(a);
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[i] & dataB[i], resC[i]);
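Overloaded operators are not available on sizeless types, so every arithmetic, logic, and comparison operator used by the tests gains a named equivalent. The mapping applied throughout this patch, shown on a pair of registers (illustrative only):

    v_int32 a = vx_setall_s32(6), b = vx_setall_s32(3);
    v_int32 sum  = v_add(a, b);   // a + b
    v_int32 prod = v_mul(a, b);   // a * b
    v_int32 mask = v_gt(a, b);    // a > b
    v_int32 bits = v_and(a, b);   // a & b
    v_int32 inv  = v_not(a);      // ~a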
R a = dataA, d = dataD;
Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_COMPARE_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
R a = dataA, b = dataB;
Data<R> resC = v_min(a, b), resD = v_max(a, b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]);
R a = dataA;
Data<Ru> resB = v_popcount(a);
- for (int i = 0; i < Ru::nlanes; ++i)
+ for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
EXPECT_EQ(popcountTable[i + 1], resB[i]);
return *this;
TheTest & test_absdiff()
{
typedef typename V_RegTraits<R>::u_reg Ru;
- typedef typename Ru::lane_type u_type;
+ typedef typename VTraits<Ru>::lane_type u_type;
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
dataA[0] = (LaneType)-1;
R a = dataA, b = dataB;
Data<Ru> resC = v_absdiff(a, b);
const u_type mask = std::numeric_limits<LaneType>::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0;
- for (int i = 0; i < Ru::nlanes; ++i)
+ for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
u_type uA = dataA[i] ^ mask;
dataB[1] = -2;
R a = dataA, b = dataB;
Data<R> resC = v_absdiff(a, b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[i] > dataB[i] ? dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]);
dataB[1] = (LaneType)-2;
R a = dataA, b = dataB;
Data<R> resC = v_absdiffs(a, b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
EXPECT_EQ(saturate_cast<LaneType>(std::abs(dataA[i] - dataB[i])), resC[i]);
}
TheTest & test_reduce()
{
Data<R> dataA;
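+ // Data<R> initializes lanes to 1..vlanes(), so vlanes() is a safe initial minimum and 0 a safe initial maximum.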
+ LaneType min = VTraits<R>::vlanes(), max = 0;
int sum = 0;
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
+ min = std::min<LaneType>(min, dataA[i]);
+ max = std::max<LaneType>(max, dataA[i]);
sum += (int)(dataA[i]); // To prevent a constant overflow with int8
}
R a = dataA;
- EXPECT_EQ((LaneType)1, (LaneType)v_reduce_min(a));
- EXPECT_EQ((LaneType)(R::nlanes), (LaneType)v_reduce_max(a));
+ EXPECT_EQ((LaneType)min, (LaneType)v_reduce_min(a));
+ EXPECT_EQ((LaneType)max, (LaneType)v_reduce_max(a));
EXPECT_EQ((int)(sum), (int)v_reduce_sum(a));
- dataA[0] += R::nlanes;
+ dataA[0] += VTraits<R>::vlanes();
R an = dataA;
- EXPECT_EQ((LaneType)2, (LaneType)v_reduce_min(an));
+ min = VTraits<R>::vlanes();
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+ {
+ min = std::min<LaneType>(min, dataA[i]);
+ }
+ EXPECT_EQ((LaneType)min, (LaneType)v_reduce_min(an));
return *this;
}
TheTest & test_reduce_sad()
{
- Data<R> dataA, dataB(R::nlanes/2);
+ Data<R> dataA, dataB(VTraits<R>::vlanes()/2);
R a = dataA;
R b = dataB;
- EXPECT_EQ((unsigned)(R::nlanes*R::nlanes/4), v_reduce_sad(a, b));
+ uint sum = 0;
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+ {
+ sum += std::abs(int(dataA[i] - dataB[i]));
+ }
+ EXPECT_EQ(sum, v_reduce_sad(a, b));
return *this;
}
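v_reduce_sad returns the sum of absolute lane differences as an unsigned scalar, so the hard-coded expectation is replaced by an explicit loop. For intuition: with 4 lanes, dataA holds {1,2,3,4} and dataB the constant 2, giving |1-2|+|2-2|+|3-2|+|4-2| = 4, the same value as the old nlanes*nlanes/4 formula.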
{
typedef typename V_RegTraits<R>::int_reg int_reg;
typedef typename V_RegTraits<int_reg>::u_reg uint_reg;
- typedef typename int_reg::lane_type int_type;
- typedef typename uint_reg::lane_type uint_type;
+ typedef typename VTraits<int_reg>::lane_type int_type;
+ typedef typename VTraits<uint_reg>::lane_type uint_type;
Data<R> dataA, dataB(0), dataC, dataD(1), dataE(2);
dataA[1] *= (LaneType)-1;
all1s;
all1s.ui = (uint_type)-1;
LaneType mask_one = all1s.l;
- dataB[R::nlanes - 1] = mask_one;
+ dataB[VTraits<R>::vlanes() - 1] = mask_one;
R l = dataB;
dataB[1] = mask_one;
- dataB[R::nlanes / 2] = mask_one;
+ dataB[VTraits<R>::vlanes() / 2] = mask_one;
dataC *= (LaneType)-1;
R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
- dataC[R::nlanes - 1] = 0;
+ dataC[VTraits<R>::vlanes() - 1] = 0;
R nl = dataC;
EXPECT_EQ(2, v_signmask(a));
-#if CV_SIMD_WIDTH <= 32
- EXPECT_EQ(2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), v_signmask(b));
+#if (CV_SIMD_WIDTH <= 32) && (!CV_SIMD_SCALABLE)
+ EXPECT_EQ(2 | (1 << (VTraits<R>::vlanes() / 2)) | (1 << (VTraits<R>::vlanes() - 1)), v_signmask(b));
#endif
EXPECT_EQ(false, v_check_all(a));
EXPECT_EQ(true, v_check_any(l));
R f = v_select(b, d, e);
Data<R> resF = f;
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
int_type m2 = dataB.as_int(i);
{
SCOPED_TRACE(s);
typedef typename V_RegTraits<R>::w_reg Rx2;
- typedef typename Rx2::lane_type w_type;
+ typedef typename VTraits<Rx2>::lane_type w_type;
Data<Rx2> dataA, dataB;
dataA += std::numeric_limits<LaneType>::is_signed ? -10 : 10;
dataB *= 10;
Data<R> resF(0);
v_rshr_pack_store<s>(resF.d, b);
- const int n = Rx2::nlanes;
+ const int n = VTraits<Rx2>::vlanes();
const w_type add = (w_type)1 << (s - 1);
for (int i = 0; i < n; ++i)
{
//typedef typename V_RegTraits<LaneType>::w_type LaneType_w;
typedef typename V_RegTraits<R>::w_reg R2;
typedef typename V_RegTraits<R2>::int_reg Ri2;
- typedef typename Ri2::lane_type w_type;
+ typedef typename VTraits<Ri2>::lane_type w_type;
Data<Ri2> dataA, dataB;
dataA += -10;
Data<R> resF(0);
v_rshr_pack_u_store<s>(resF.d, b);
- const int n = Ri2::nlanes;
+ const int n = VTraits<Ri2>::vlanes();
const w_type add = (w_type)1 << (s - 1);
for (int i = 0; i < n; ++i)
{
{
// 16-bit
Data<R> dataA, dataB;
- dataB.fill(0, R::nlanes / 2);
+ dataB.fill(0, VTraits<R>::vlanes() / 2);
R a = dataA, b = dataB;
- Data<R> maskA = a == b, maskB = a != b;
+ Data<R> maskA = v_eq(a, b), maskB = v_ne(a, b);
a = maskA; b = maskB;
Data<R> res = v_pack_b(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b));
- for (int i = 0; i < v_uint16::nlanes; ++i)
+ for (int i = 0; i < VTraits<v_uint16>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(maskA[i * 2], res[i]);
- EXPECT_EQ(maskB[i * 2], res[i + v_uint16::nlanes]);
+ EXPECT_EQ(maskB[i * 2], res[i + VTraits<v_uint16>::vlanes()]);
}
// 32-bit
Data<R> dataC, dataD;
- dataD.fill(0, R::nlanes / 2);
+ dataD.fill(0, VTraits<R>::vlanes() / 2);
R c = dataC, d = dataD;
- Data<R> maskC = c == d, maskD = c != d;
+ Data<R> maskC = v_eq(c, d), maskD = v_ne(c, d);
c = maskC; d = maskD;
res = v_pack_b
v_reinterpret_as_u32(c), v_reinterpret_as_u32(d)
);
- for (int i = 0; i < v_uint32::nlanes; ++i)
+ for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(maskA[i * 4], res[i]);
- EXPECT_EQ(maskB[i * 4], res[i + v_uint32::nlanes]);
- EXPECT_EQ(maskC[i * 4], res[i + v_uint32::nlanes * 2]);
- EXPECT_EQ(maskD[i * 4], res[i + v_uint32::nlanes * 3]);
+ EXPECT_EQ(maskB[i * 4], res[i + VTraits<v_uint32>::vlanes()]);
+ EXPECT_EQ(maskC[i * 4], res[i + VTraits<v_uint32>::vlanes() * 2]);
+ EXPECT_EQ(maskD[i * 4], res[i + VTraits<v_uint32>::vlanes() * 3]);
}
// 64-bit
Data<R> dataE, dataF, dataG(0), dataH(0xFF);
- dataF.fill(0, R::nlanes / 2);
+ dataF.fill(0, VTraits<R>::vlanes() / 2);
R e = dataE, f = dataF, g = dataG, h = dataH;
- Data<R> maskE = e == f, maskF = e != f;
+ Data<R> maskE = v_eq(e, f), maskF = v_ne(e, f);
e = maskE; f = maskF;
res = v_pack_b
v_reinterpret_as_u64(g), v_reinterpret_as_u64(h)
);
- for (int i = 0; i < v_uint64::nlanes; ++i)
+ for (int i = 0; i < VTraits<v_uint64>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(maskA[i * 8], res[i]);
- EXPECT_EQ(maskB[i * 8], res[i + v_uint64::nlanes]);
- EXPECT_EQ(maskC[i * 8], res[i + v_uint64::nlanes * 2]);
- EXPECT_EQ(maskD[i * 8], res[i + v_uint64::nlanes * 3]);
-
- EXPECT_EQ(maskE[i * 8], res[i + v_uint64::nlanes * 4]);
- EXPECT_EQ(maskF[i * 8], res[i + v_uint64::nlanes * 5]);
- EXPECT_EQ(dataG[i * 8], res[i + v_uint64::nlanes * 6]);
- EXPECT_EQ(dataH[i * 8], res[i + v_uint64::nlanes * 7]);
+ EXPECT_EQ(maskB[i * 8], res[i + VTraits<v_uint64>::vlanes()]);
+ EXPECT_EQ(maskC[i * 8], res[i + VTraits<v_uint64>::vlanes() * 2]);
+ EXPECT_EQ(maskD[i * 8], res[i + VTraits<v_uint64>::vlanes() * 3]);
+
+ EXPECT_EQ(maskE[i * 8], res[i + VTraits<v_uint64>::vlanes() * 4]);
+ EXPECT_EQ(maskF[i * 8], res[i + VTraits<v_uint64>::vlanes() * 5]);
+ EXPECT_EQ(dataG[i * 8], res[i + VTraits<v_uint64>::vlanes() * 6]);
+ EXPECT_EQ(dataH[i * 8], res[i + VTraits<v_uint64>::vlanes() * 7]);
}
return *this;
Data<R> resC = c, resD = d, resE = e, resF = f, resLo = lo, resHi = hi;
- const int n = R::nlanes/2;
+ const int n = VTraits<R>::vlanes()/2;
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
Data<R> resB = v_reverse(a);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- EXPECT_EQ(dataA[R::nlanes - i - 1], resB[i]);
+ EXPECT_EQ(dataA[VTraits<R>::vlanes() - i - 1], resB[i]);
}
return *this;
Data<R> resC = v_extract<s>(a, b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- if (i + s >= R::nlanes)
- EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]);
+ if (i + s >= VTraits<R>::vlanes())
+ EXPECT_EQ(dataB[i - VTraits<R>::vlanes() + s], resC[i]);
else
EXPECT_EQ(dataA[i + s], resC[i]);
}
Data<R> resE = v_rotate_left<s>(a);
Data<R> resF = v_rotate_left<s>(a, b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- if (i + s >= R::nlanes)
+ if (i + s >= VTraits<R>::vlanes())
{
EXPECT_EQ((LaneType)0, resC[i]);
- EXPECT_EQ(dataB[i - R::nlanes + s], resD[i]);
+ EXPECT_EQ(dataB[i - VTraits<R>::vlanes() + s], resD[i]);
- EXPECT_EQ((LaneType)0, resE[i - R::nlanes + s]);
- EXPECT_EQ(dataB[i], resF[i - R::nlanes + s]);
+ EXPECT_EQ((LaneType)0, resE[i - VTraits<R>::vlanes() + s]);
+ EXPECT_EQ(dataB[i], resF[i - VTraits<R>::vlanes() + s]);
}
else
{
return *this;
}
+ TheTest & test_extract_highest()
+ {
+ Data<R> dataA;
+ LaneType test_value = (LaneType)(VTraits<R>::vlanes()-1 + 50);
+ dataA[VTraits<R>::vlanes()-1] = test_value;
+ R a = dataA;
+
+ LaneType res = v_extract_highest(a);
+ EXPECT_EQ(test_value, res);
+
+ return *this;
+ }
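v_extract_n<i> needs a compile-time index, which cannot name the last lane when the lane count is only known at run time; v_extract_highest (and v_broadcast_highest below) close that gap. Illustrative use:

    v_int32 v = vx_setall_s32(42);
    int last = v_extract_highest(v);     // last lane, independent of vlanes()
    v_int32 b = v_broadcast_highest(v);  // every lane set to the last lane of v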
+
template<int s>
TheTest & test_broadcast_element()
{
Data<R> res = v_broadcast_element<s>(a);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+ {
+ SCOPED_TRACE(i);
+ EXPECT_EQ(test_value, res[i]);
+ }
+ return *this;
+ }
+
+ TheTest & test_broadcast_highest()
+ {
+ Data<R> dataA;
+ LaneType test_value = (LaneType)(VTraits<R>::vlanes()-1 + 50);
+ dataA[VTraits<R>::vlanes()-1] = test_value;
+ R a = dataA;
+
+ Data<R> res = v_broadcast_highest(a);
+
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(i);
EXPECT_EQ(test_value, res[i]);
resG = v_sqr_magnitude(a1, a2),
resH = v_muladd(a1, a2, a3);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(cvRound(data1[i]), resB[i]);
- EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]);
+ EXPECT_EQ((typename VTraits<Ri>::lane_type)data1[i], resC[i]);
EXPECT_EQ(cvFloor(data1[i]), resD[i]);
EXPECT_EQ(cvCeil(data1[i]), resE[i]);
R a = dataA;
Rt b = v_cvt_f32(a);
Data<Rt> resB = b;
- int n = std::min<int>(Rt::nlanes, R::nlanes);
+ int n = std::min<int>(VTraits<Rt>::vlanes(), VTraits<R>::vlanes());
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
+ EXPECT_EQ((typename VTraits<Rt>::lane_type)dataA[i], resB[i]);
}
return *this;
}
Rt c = v_cvt_f64_high(a);
Data<Rt> resB = b;
Data<Rt> resC = c;
- int n = std::min<int>(Rt::nlanes, R::nlanes);
+ int n = std::min<int>(VTraits<Rt>::vlanes(), VTraits<R>::vlanes());
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
+ EXPECT_EQ((typename VTraits<Rt>::lane_type)dataA[i], resB[i]);
}
for (int i = 0; i < n; ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
- EXPECT_EQ((typename Rt::lane_type)dataA[i+n], resC[i]);
+ EXPECT_EQ((typename VTraits<Rt>::lane_type)dataA[i+n], resC[i]);
}
#endif
return *this;
#if CV_SIMD_64F
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
- dataB += R::nlanes;
+ dataB += VTraits<R>::vlanes();
R a = dataA, b = dataB;
v_float64 c = v_cvt_f64(a), d = v_cvt_f64(b);
Data<v_float64> resC = c;
Data<v_float64> resD = d;
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ((double)dataA[i], resC[i]);
R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD;
Data<R> res = v_matmul(v, a, b, c, d);
- for (int i = 0; i < R::nlanes; i += 4)
- {
+ // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
+ // {
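+ // Lane count is a runtime value for scalable vectors, so the loop is kept for reference but only the first 4-lane block is validated.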
+ int i = 0;
for (int j = i; j < i + 4; ++j)
{
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
+ dataV[i + 3] * dataD[j];
EXPECT_COMPARE_EQ(val, res[j]);
}
- }
+ // }
Data<R> resAdd = v_matmuladd(v, a, b, c, d);
- for (int i = 0; i < R::nlanes; i += 4)
- {
+ // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
+ // {
+ i = 0;
for (int j = i; j < i + 4; ++j)
{
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
+ dataD[j];
EXPECT_COMPARE_EQ(val, resAdd[j]);
}
- }
+ // }
return *this;
}
e, f, g, h);
Data<R> res[4] = {e, f, g, h};
- for (int i = 0; i < R::nlanes; i += 4)
- {
+ // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
+ // {
+ int i = 0;
for (int j = 0; j < 4; ++j)
{
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
EXPECT_EQ(dataC[i + j], res[j][i + 2]);
EXPECT_EQ(dataD[i + j], res[j][i + 3]);
}
- }
+ // }
return *this;
}
R a = dataA, b = dataB, c = dataC, d = dataD;
Data<R> res = v_reduce_sum4(a, b, c, d);
- for (int i = 0; i < R::nlanes; i += 4)
- {
+ // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
+ // {
+ int i = 0;
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_COMPARE_EQ(dataA.sum(i, 4), res[i]);
EXPECT_COMPARE_EQ(dataB.sum(i, 4), res[i + 1]);
EXPECT_COMPARE_EQ(dataC.sum(i, 4), res[i + 2]);
EXPECT_COMPARE_EQ(dataD.sum(i, 4), res[i + 3]);
- }
+ // }
return *this;
}
printf("test_loadstore_fp16_f32 ...\n");
AlignedData<v_uint16> data; data.a.clear();
data.a.d[0] = 0x3c00; // 1.0
- data.a.d[R::nlanes - 1] = (unsigned short)0xc000; // -2.0
+ data.a.d[VTraits<R>::vlanes() - 1] = (unsigned short)0xc000; // -2.0
AlignedData<v_float32> data_f32; data_f32.a.clear();
AlignedData<v_uint16> out;
R r1 = vx_load_expand((const cv::float16_t*)data.a.d);
R r2(r1);
- EXPECT_EQ(1.0f, r1.get0());
+ EXPECT_EQ(1.0f, v_get0(r1));
v_store(data_f32.a.d, r2);
- EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]);
+ EXPECT_EQ(-2.0f, data_f32.a.d[VTraits<R>::vlanes() - 1]);
out.a.clear();
v_pack_store((cv::float16_t*)out.a.d, r2);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i;
}
AlignedData<R> out;
// check if addresses are aligned and unaligned respectively
- EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH);
- EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH);
- EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH);
- EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH);
+ EXPECT_EQ((size_t)0, (size_t)&data.a.d % VTraits<R>::max_nlanes);
+ EXPECT_NE((size_t)0, (size_t)&data.u.d % VTraits<R>::max_nlanes);
+ EXPECT_EQ((size_t)0, (size_t)&out.a.d % VTraits<R>::max_nlanes);
+ EXPECT_NE((size_t)0, (size_t)&out.u.d % VTraits<R>::max_nlanes);
// check some initialization methods
R r1 = data.u;
R r2 = vx_load_expand((const float16_t*)data.a.d);
R r3(r2);
- EXPECT_EQ(data.u[0], r1.get0());
- EXPECT_EQ(data.a[0], r2.get0());
- EXPECT_EQ(data.a[0], r3.get0());
+ EXPECT_EQ(data.u[0], v_get0(r1));
+ EXPECT_EQ(data.a[0], v_get0(r2));
+ EXPECT_EQ(data.a[0], v_get0(r3));
// check some store methods
out.a.clear();
v_float32 r1 = vx_load(data.a.d);
v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32());
v_float32 r3 = v_cvt_f32(r2);
- EXPECT_EQ(0x3c00, r2.get0());
- EXPECT_EQ(r3.get0(), r1.get0());
+ EXPECT_EQ(0x3c00, v_get0(r2));
+ EXPECT_EQ(v_get0(r3), v_get0(r1));
return *this;
}
Data<R> dataA, dataB;
R a = dataA, b = dataB;
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
dataA[i] = dataB[i];
}
Data<R> resC = (a == b);
Data<R> resD = (a != b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
}
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
dataA[i] = dataB[i] = (LaneType)-1;
}
resC = (a == b);
resD = (a != b);
- for (int i = 0; i < R::nlanes; ++i)
+ for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
};
-#if 1
+#if CV_SIMD
#define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*(int)sizeof(v_uint8), CV__TRACE_FUNCTION);
-#endif
-
//============= 8-bit integer =====================================================================
void test_hal_intrin_uint8()
std::cout << "SKIP: CV_FP16 is not available" << std::endl;
#endif
}
+#elif CV_SIMD_SCALABLE // Temporary: reduced test coverage while the scalable backend is brought up
+#define DUMP_ENTRY(type) printf("SIMD: %s\n", CV__TRACE_FUNCTION);
+
+
+//============= 8-bit integer =====================================================================
+
+void test_hal_intrin_uint8()
+{
+ DUMP_ENTRY(v_uint8);
+ // typedef v_uint8 R;
+ TheTest<v_uint8>()
+ .test_loadstore()
+ .test_min_max()
+ ;
+}
+
+void test_hal_intrin_int8()
+{
+ DUMP_ENTRY(v_int8);
+ // typedef v_int8 R;
+ TheTest<v_int8>()
+ .test_loadstore()
+ .test_min_max()
+ ;
+}
+
+//============= 16-bit integer =====================================================================
+
+void test_hal_intrin_uint16()
+{
+ DUMP_ENTRY(v_uint16);
+ // typedef v_uint16 R;
+ TheTest<v_uint16>()
+ .test_loadstore()
+ .test_min_max()
+ ;
+}
+
+void test_hal_intrin_int16()
+{
+ DUMP_ENTRY(v_int16);
+ // typedef v_int16 R;
+ TheTest<v_int16>()
+ .test_loadstore()
+ .test_min_max()
+ ;
+}
+
+//============= 32-bit integer =====================================================================
+
+void test_hal_intrin_uint32()
+{
+ DUMP_ENTRY(v_uint32);
+ // typedef v_uint32 R;
+ TheTest<v_uint32>()
+ .test_loadstore()
+ .test_min_max()
+ ;
+}
+
+void test_hal_intrin_int32()
+{
+ DUMP_ENTRY(v_int32);
+ // typedef v_int32 R;
+ TheTest<v_int32>()
+ .test_loadstore()
+ .test_min_max()
+ ;
+}
+
+//============= 64-bit integer =====================================================================
+
+void test_hal_intrin_uint64()
+{
+ DUMP_ENTRY(v_uint64);
+ // typedef v_uint64 R;
+ TheTest<v_uint64>()
+ .test_loadstore()
+ ;
+}
+
+void test_hal_intrin_int64()
+{
+ DUMP_ENTRY(v_int64);
+ // typedef v_int64 R;
+ TheTest<v_int64>()
+ .test_loadstore()
+ ;
+}
+
+//============= Floating point =====================================================================
+void test_hal_intrin_float32()
+{
+ DUMP_ENTRY(v_float32);
+ // typedef v_float32 R;
+ TheTest<v_float32>()
+ .test_loadstore()
+ .test_min_max()
+ ;
+}
+
+void test_hal_intrin_float64()
+{
+ DUMP_ENTRY(v_float64);
+#if CV_SIMD_64F
+ // typedef v_float64 R;
+ TheTest<v_float64>()
+ .test_loadstore()
+ .test_min_max()
+ ;
+
+#endif
+}
+
+#endif
+
/*#if defined(CV_CPU_DISPATCH_MODE_FP16) && CV_CPU_DISPATCH_MODE == FP16
void test_hal_intrin_float16()
#if !defined(GAPI_STANDALONE)
+#include <opencv2/core/hal/intrin.hpp>
+#if CV_SIMD
#include "gfluidcore_func.hpp"
#include "gfluidcore_func.simd.hpp"
#include "gfluidutils.hpp"
#include <opencv2/core/cvdef.h>
-#include <opencv2/core/hal/intrin.hpp>
#include <cmath>
#include <cstdlib>
} // namespace fluid
} // namespace gapi
} // namespace cv
-
+#endif // CV_SIMD
#endif // !defined(GAPI_STANDALONE)
#pragma once
-#if !defined(GAPI_STANDALONE)
+#if !defined(GAPI_STANDALONE) && CV_SIMD
#include <opencv2/core.hpp>
}
}
+#if CV_SIMD
inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv,
v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const
{
z[k] = v_max(zero, v_min(base2, z[k]));
}
}
+#endif
void operator()(const uchar* src, uchar* dst, int n) const
{
buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
}
+#if CV_SIMD
static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4],
v_int32 (&guv)[4],
buv[k] = vshift + ub * uu[k];
}
}
+#endif
static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv,
uchar& r, uchar& g, uchar& b, uchar& a)
a = uchar(0xff);
}
+#if CV_SIMD
static inline void yRGBuvToRGBA(const v_uint8& vy,
const v_int32 (&ruv)[4],
const v_int32 (&guv)[4],
gg = v_pack_u(g0, g1);
bb = v_pack_u(b0, b1);
}
+#endif
template<int bIdx, int dcn, bool is420>
static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v,
return saturate_cast<uchar>(yy >> ITUR_BT_601_SHIFT);
}
+#if CV_SIMD
static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
{
const int shifted16 = (16 << ITUR_BT_601_SHIFT);
return v_pack(y0, y1);
}
+#endif
static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
{
v = saturate_cast<uchar>(vv >> ITUR_BT_601_SHIFT);
}
+#if CV_SIMD
static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
{
u = v_pack_u(u0, u1);
v = v_pack_u(v0, v1);
}
+#endif
struct RGB8toYUV420pInvoker: public ParallelLoopBody
{
typedef uchar value_type;
typedef int arg_type;
- enum { SIZE = 1 };
arg_type load(const uchar* ptr) { return *ptr; }
void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
void operator()(arg_type& a, arg_type& b) const
{
typedef ushort value_type;
typedef int arg_type;
- enum { SIZE = 1 };
arg_type load(const ushort* ptr) { return *ptr; }
void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
void operator()(arg_type& a, arg_type& b) const
{
typedef short value_type;
typedef int arg_type;
- enum { SIZE = 1 };
arg_type load(const short* ptr) { return *ptr; }
void store(short* ptr, arg_type val) { *ptr = (short)val; }
void operator()(arg_type& a, arg_type& b) const
{
typedef float value_type;
typedef float arg_type;
- enum { SIZE = 1 };
arg_type load(const float* ptr) { return *ptr; }
void store(float* ptr, arg_type val) { *ptr = val; }
void operator()(arg_type& a, arg_type& b) const
}
};
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
struct MinMaxVec8u
{
typedef uchar value_type;
- typedef v_uint8x16 arg_type;
- enum { SIZE = v_uint8x16::nlanes };
- arg_type load(const uchar* ptr) { return v_load(ptr); }
+ typedef v_uint8 arg_type;
+ arg_type load(const uchar* ptr) { return vx_load(ptr); }
void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const
{
a = v_min(a, b);
b = v_max(b, t);
}
-#if CV_SIMD_WIDTH > 16
- typedef v_uint8 warg_type;
- enum { WSIZE = v_uint8::nlanes };
- warg_type wload(const uchar* ptr) { return vx_load(ptr); }
- void store(uchar* ptr, const warg_type &val) { v_store(ptr, val); }
- void operator()(warg_type& a, warg_type& b) const
- {
- warg_type t = a;
- a = v_min(a, b);
- b = v_max(b, t);
- }
-#endif
};
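The fixed 128-bit register types and the CV_SIMD_WIDTH > 16 wide-register duplicates are collapsed into a single width-agnostic implementation; vx_load always yields the widest register the backend offers, and the three structs that follow repeat the same shape for ushort, short, and float. Condensed, the pattern is (sketch, not part of the patch):

    struct MinMaxVecAny
    {
        typedef v_uint8 arg_type;                          // width chosen by the backend
        arg_type load(const uchar* p) { return vx_load(p); }
        void store(uchar* p, const arg_type& v) { v_store(p, v); }
        void operator()(arg_type& a, arg_type& b) const    // compare-exchange step
        { arg_type t = a; a = v_min(a, b); b = v_max(b, t); }
    };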
struct MinMaxVec16u
{
typedef ushort value_type;
- typedef v_uint16x8 arg_type;
- enum { SIZE = v_uint16x8::nlanes };
- arg_type load(const ushort* ptr) { return v_load(ptr); }
+ typedef v_uint16 arg_type;
+ arg_type load(const ushort* ptr) { return vx_load(ptr); }
void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const
{
a = v_min(a, b);
b = v_max(b, t);
}
-#if CV_SIMD_WIDTH > 16
- typedef v_uint16 warg_type;
- enum { WSIZE = v_uint16::nlanes };
- warg_type wload(const ushort* ptr) { return vx_load(ptr); }
- void store(ushort* ptr, const warg_type &val) { v_store(ptr, val); }
- void operator()(warg_type& a, warg_type& b) const
- {
- warg_type t = a;
- a = v_min(a, b);
- b = v_max(b, t);
- }
-#endif
};
struct MinMaxVec16s
{
typedef short value_type;
- typedef v_int16x8 arg_type;
- enum { SIZE = v_int16x8::nlanes };
- arg_type load(const short* ptr) { return v_load(ptr); }
+ typedef v_int16 arg_type;
+ arg_type load(const short* ptr) { return vx_load(ptr); }
void store(short* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const
{
a = v_min(a, b);
b = v_max(b, t);
}
-#if CV_SIMD_WIDTH > 16
- typedef v_int16 warg_type;
- enum { WSIZE = v_int16::nlanes };
- warg_type wload(const short* ptr) { return vx_load(ptr); }
- void store(short* ptr, const warg_type &val) { v_store(ptr, val); }
- void operator()(warg_type& a, warg_type& b) const
- {
- warg_type t = a;
- a = v_min(a, b);
- b = v_max(b, t);
- }
-#endif
};
struct MinMaxVec32f
{
typedef float value_type;
- typedef v_float32x4 arg_type;
- enum { SIZE = v_float32x4::nlanes };
- arg_type load(const float* ptr) { return v_load(ptr); }
+ typedef v_float32 arg_type;
+ arg_type load(const float* ptr) { return vx_load(ptr); }
void store(float* ptr, const arg_type &val) { v_store(ptr, val); }
void operator()(arg_type& a, arg_type& b) const
{
a = v_min(a, b);
b = v_max(b, t);
}
-#if CV_SIMD_WIDTH > 16
- typedef v_float32 warg_type;
- enum { WSIZE = v_float32::nlanes };
- warg_type wload(const float* ptr) { return vx_load(ptr); }
- void store(float* ptr, const warg_type &val) { v_store(ptr, val); }
- void operator()(warg_type& a, warg_type& b) const
- {
- warg_type t = a;
- a = v_min(a, b);
- b = v_max(b, t);
- }
-#endif
};
#else
typedef typename Op::value_type T;
typedef typename Op::arg_type WT;
typedef typename VecOp::arg_type VT;
-#if CV_SIMD_WIDTH > 16
- typedef typename VecOp::warg_type WVT;
-#endif
const T* src = _src.ptr<T>();
T* dst = _dst.ptr<T>();
if( limit == size.width )
break;
-#if CV_SIMD_WIDTH > 16
- for( ; j <= size.width - VecOp::WSIZE - cn; j += VecOp::WSIZE )
- {
- WVT p0 = vop.wload(row0+j-cn), p1 = vop.wload(row0+j), p2 = vop.wload(row0+j+cn);
- WVT p3 = vop.wload(row1+j-cn), p4 = vop.wload(row1+j), p5 = vop.wload(row1+j+cn);
- WVT p6 = vop.wload(row2+j-cn), p7 = vop.wload(row2+j), p8 = vop.wload(row2+j+cn);
-
- vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
- vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
- vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
- vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
- vop(p4, p2); vop(p6, p4); vop(p4, p2);
- vop.store(dst+j, p4);
- }
+#if CV_SIMD || CV_SIMD_SCALABLE
+ int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
+#else
+ int nlanes = 1;
#endif
- for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
+ for( ; j <= size.width - nlanes - cn; j += nlanes )
{
VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
if( limit == size.width )
break;
-#if CV_SIMD_WIDTH > 16
- for( ; j <= size.width - VecOp::WSIZE - cn*2; j += VecOp::WSIZE )
- {
- WVT p[25];
- for( k = 0; k < 5; k++ )
- {
- const T* rowk = row[k];
- p[k*5] = vop.wload(rowk+j-cn*2); p[k*5+1] = vop.wload(rowk+j-cn);
- p[k*5+2] = vop.wload(rowk+j); p[k*5+3] = vop.wload(rowk+j+cn);
- p[k*5+4] = vop.wload(rowk+j+cn*2);
- }
-
- vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
- vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
- vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
- vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
- vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
- vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
- vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
- vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
- vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
- vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
- vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
- vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
- vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
- vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
- vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
- vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
- vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
- vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
- vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
- vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
- vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
- vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
- vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
- vop.store(dst+j, p[12]);
- }
+#if CV_SIMD || CV_SIMD_SCALABLE
+ int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
+#else
+ int nlanes = 1;
#endif
- for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
+ for( ; j <= size.width - nlanes - cn*2; j += nlanes )
{
- VT p[25];
- for( k = 0; k < 5; k++ )
- {
- const T* rowk = row[k];
- p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
- p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
- p[k*5+4] = vop.load(rowk+j+cn*2);
- }
-
- vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
- vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
- vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
- vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
- vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
- vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
- vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
- vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
- vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
- vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
- vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
- vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
- vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
- vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
- vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
- vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
- vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
- vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
- vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
- vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
- vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
- vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
- vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
- vop.store(dst+j, p[12]);
+ VT p0 = vop.load(row[0]+j-cn*2), p5 = vop.load(row[1]+j-cn*2), p10 = vop.load(row[2]+j-cn*2), p15 = vop.load(row[3]+j-cn*2), p20 = vop.load(row[4]+j-cn*2);
+ VT p1 = vop.load(row[0]+j-cn*1), p6 = vop.load(row[1]+j-cn*1), p11 = vop.load(row[2]+j-cn*1), p16 = vop.load(row[3]+j-cn*1), p21 = vop.load(row[4]+j-cn*1);
+ VT p2 = vop.load(row[0]+j-cn*0), p7 = vop.load(row[1]+j-cn*0), p12 = vop.load(row[2]+j-cn*0), p17 = vop.load(row[3]+j-cn*0), p22 = vop.load(row[4]+j-cn*0);
+ VT p3 = vop.load(row[0]+j+cn*1), p8 = vop.load(row[1]+j+cn*1), p13 = vop.load(row[2]+j+cn*1), p18 = vop.load(row[3]+j+cn*1), p23 = vop.load(row[4]+j+cn*1);
+ VT p4 = vop.load(row[0]+j+cn*2), p9 = vop.load(row[1]+j+cn*2), p14 = vop.load(row[2]+j+cn*2), p19 = vop.load(row[3]+j+cn*2), p24 = vop.load(row[4]+j+cn*2);
+
+ vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4);
+ vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4);
+ vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8);
+ vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11);
+ vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6);
+ vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8);
+ vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5);
+ vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8);
+ vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17);
+ vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15);
+ vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19);
+ vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24);
+ vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22);
+ vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18);
+ vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16);
+ vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19);
+ vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16);
+ vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12);
+ vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16);
+ vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10);
+ vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17);
+ vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19);
+ vop(p7, p11); vop(p11, p13); vop(p11, p12);
+ vop.store(dst+j, p12);
}
limit = size.width;
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+OPTION(RISCV_RVV_SCALABLE "Use scalable RVV API on RISC-V" ON) # Enabled by default
+IF(RISCV_RVV_SCALABLE)
+ ADD_DEFINITIONS(-DCV_RVV_SCALABLE)
+ENDIF()
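With the option ON by default, RISC-V cross builds pick up the scalable API automatically; passing it OFF falls back to the fixed-size RVV intrinsics. A hypothetical configure line (toolchain file path assumed for illustration):

    cmake -DCMAKE_TOOLCHAIN_FILE=../platforms/linux/riscv64-clang.toolchain.cmake \
          -DRISCV_RVV_SCALABLE=OFF ..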
set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT})
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)