From 5f637e5a020e5e15e52c5c2be5393175a5b8b0fb Mon Sep 17 00:00:00 2001
From: damonyu1989 <71262988+damonyu1989@users.noreply.github.com>
Date: Wed, 26 May 2021 01:15:12 +0800
Subject: [PATCH] Merge pull request #19778 from damonyu1989:master-riscv-0.7.1

* Add the support for riscv64 vector 0.7.1.

* fixed GCC warnings

* cleaned whitespaces

* Removed the warning caused by the use of the compiler's internal API.

* Update the license header.

* removed trailing whitespaces

Co-authored-by: Vadim Pisarevsky
Co-authored-by: yulj
Co-authored-by: Vadim Pisarevsky
---
 .../core/include/opencv2/core/cv_cpu_dispatch.h  |    9 +
 modules/core/include/opencv2/core/cvdef.h        |    4 +
 modules/core/include/opencv2/core/hal/intrin.hpp |    6 +-
 .../include/opencv2/core/hal/intrin_rvv071.hpp   | 2545 ++++++++++++++++++++
 modules/core/src/arithm.simd.hpp                 |    2 +-
 modules/core/src/system.cpp                      |    4 +
 platforms/linux/riscv64-071-gcc.toolchain.cmake  |    9 +
 7 files changed, 2577 insertions(+), 2 deletions(-)
 create mode 100644 modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
 create mode 100644 platforms/linux/riscv64-071-gcc.toolchain.cmake

diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index fe15e51..f3437b5 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -142,6 +142,11 @@
 # define CV_NEON 1
 #endif

+#if defined(__riscv) && defined(__riscv_vector)
+# include <riscv_vector.h>
+# define CV_RVV071 1
+#endif
+
 #if defined(__ARM_NEON__) || defined(__aarch64__)
 # include <arm_neon.h>
 #endif
@@ -338,6 +343,10 @@ struct VZeroUpperGuard {
 # define CV_NEON 0
 #endif

+#ifndef CV_RVV071
+# define CV_RVV071 0
+#endif
+
 #ifndef CV_VSX
 # define CV_VSX 0
 #endif
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index ebb1f65..7f51074 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -271,6 +271,8 @@ namespace cv {

 #define CV_CPU_MSA              150

+#define CV_CPU_RISCVV           170
+
 #define CV_CPU_VSX              200
 #define CV_CPU_VSX3             201
@@ -325,6 +327,8 @@ enum CpuFeatures {

     CPU_MSA             = 150,

+    CPU_RISCVV          = 170,
+
     CPU_VSX             = 200,
     CPU_VSX3            = 201,
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 6f5b8e1..ac331f2 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -200,7 +200,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 # undef CV_RVV
 #endif

-#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
 #define CV__SIMD_FORWARD 128
 #include "opencv2/core/hal/intrin_forward.hpp"
 #endif
@@ -214,6 +214,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;

 #include "opencv2/core/hal/intrin_neon.hpp"

+#elif CV_RVV071 && !defined(CV_FORCE_SIMD128_CPP)
+#define CV_SIMD128_CPP 0
+#include "opencv2/core/hal/intrin_rvv071.hpp"
+
 #elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)

 #include "opencv2/core/hal/intrin_vsx.hpp"
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
new file mode 100644
index 0000000..2bdc622
--- /dev/null
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -0,0 +1,2545 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.
+
+#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
+#define OPENCV_HAL_INTRIN_RISCVV_HPP
+
+#include <float.h>
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+//////////// Types ////////////
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {}
+    explicit v_uint8x16(vuint8m1_t v) : val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
+    }
+    uchar get0() const
+    {
+        return vmv_x_s_u8m1_u8(val, 16);
+    }
+
+    vuint8m1_t val;
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {}
+    explicit v_int8x16(vint8m1_t v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
+    }
+    schar get0() const
+    {
+        return vmv_x_s_i8m1_i8(val, 16);
+    }
+
+    vint8m1_t val;
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {}
+    explicit v_uint16x8(vuint16m1_t v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
+    }
+    ushort get0() const
+    {
+        return vmv_x_s_u16m1_u16(val, 8);
+    }
+
+    vuint16m1_t val;
+};
+
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {}
+    explicit v_int16x8(vint16m1_t v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
+    }
+    short get0() const
+    {
+        return vmv_x_s_i16m1_i16(val, 8);
+    }
+
+    vint16m1_t val;
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {}
+    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        unsigned v[] = {v0, v1, v2, v3};
+        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
+    }
+    unsigned get0() const
+    {
+        return vmv_x_s_u32m1_u32(val, 4);
+    }
+
+    vuint32m1_t val;
+};
+
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {}
+    explicit v_int32x4(vint32m1_t v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int v[] = {v0, v1, v2, v3};
+        val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
+    }
+    int get0() const
+    {
+        return vmv_x_s_i32m1_i32(val, 4);
+    }
+    vint32m1_t val;
+};
+
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    v_float32x4() {}
+    explicit v_float32x4(vfloat32m1_t v) : val(v) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float v[] = {v0, v1, v2, v3};
+        val = 
(vfloat32m1_t)vle_v_f32m1((float*)v, 4); + } + float get0() const + { + return vfmv_f_s_f32m1_f32(val, 4); + } + vfloat32m1_t val; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(vuint64m1_t v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + uint64 v[] = {v0, v1}; + val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2); + } + uint64 get0() const + { + return vmv_x_s_u64m1_u64(val, 2); + } + vuint64m1_t val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit v_int64x2(vint64m1_t v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + int64 v[] = {v0, v1}; + val = (vint64m1_t)vle_v_i64m1((long*)v, 2); + } + int64 get0() const + { + return vmv_x_s_i64m1_i64(val, 2); + } + vint64m1_t val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2 }; + + v_float64x2() {} + explicit v_float64x2(vfloat64m1_t v) : val(v) {} + v_float64x2(double v0, double v1) + { + double v[] = {v0, v1}; + val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2); + } + double get0() const + { + return vfmv_f_s_f64m1_f64(val, 2); + } + vfloat64m1_t val; +}; + +#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \ +inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \ +inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \ +inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \ +inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \ +inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \ +inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \ +inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \ +inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \ +inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\ +inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); } + + +OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8) +OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8) +OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16) +OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16) +OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32) +OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32) +OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64) +OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64) +OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64) +OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32) +#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \ +inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \ +inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } + +OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16) +OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16) +OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8) +OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8) +OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4) +OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4) 
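+// a rough usage sketch of the initializers generated here (lane values are
+// arbitrary, for illustration only):
+//     v_int32x4  z = v_setzero_s32();            // {0, 0, 0, 0}
+//     v_int32x4  f = v_setall_s32(4);            // {4, 4, 4, 4}
+//     v_uint32x4 u = v_reinterpret_as_u32(f);    // same 128 bits, uint32 lanes
+//     int    first = f.get0();                   // first lane -> 4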
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2) +OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2) +inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); } +inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); } + +inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); } +inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); } + + +#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val); \ + return a; \ +} + +#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val, num)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val, num); \ + return a; \ +} + +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vsadd_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vssub_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4) +inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4)); +} +inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) +{ + a.val = vfdiv_vv_f32m1(a.val, b.val, 4); + return a; +} + +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2) +inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b) +{ + return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2)); +} +inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b) +{ + a.val = vfdiv_vv_f64m1(a.val, b.val, 2); + return a; +} +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ 
+{ \ + return _Tpvec(intrin(a.val, b.val, num)); \ +} +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2) + +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ + return v_float32x4(vfsqrt_v_f32m1(x.val, 4)); +} + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4)); +} + +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4)); +} + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4)); +} + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + return v_float32x4(res); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& a) +{ + vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + return v_float32x4(res); +} + +inline v_float64x2 v_sqrt(const v_float64x2& x) +{ + return v_float64x2(vfsqrt_v_f64m1(x.val, 2)); +} + +inline v_float64x2 
v_invsqrt(const v_float64x2& x) +{ + return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2)); +} + +inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2)); + return v_sqrt(x); +} + +inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2)); +} + +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2)); +} + +inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_fma(a, b, c); +} + +#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \ + inline _Tpvec operator ~ (const _Tpvec & a) \ + { \ + return _Tpvec(vnot_v_##suffix(a.val, num)); \ + } + +OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2) +OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2) + +#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \ +inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +{ \ + return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \ +} \ +inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ +{ \ + a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \ + return a; \ +} + +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1) +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1) +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1) + +inline v_float32x4 operator ~ (const v_float32x4& a) +{ + return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4))); +} + +#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \ +inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +{ \ + return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \ +} \ +inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ +{ \ + a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \ + return a; \ +} + +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1) +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1) +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1) + +inline v_float64x2 operator ~ (const v_float64x2& a) +{ + return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2))); +} +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ + return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8)); +} +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8)); +} + +//#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \ +//inline _Tpuvec v_abs(const _Tpsvec& a) { \ +// E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4); + +//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8) 
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16) +//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32) + +inline v_uint32x4 v_abs(v_int32x4 x) +{ + vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4); + return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)); +} + +inline v_uint16x8 v_abs(v_int16x8 x) +{ + vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8); + return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)); +} + +inline v_uint8x16 v_abs(v_int8x16 x) +{ + vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16); + return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)); +} + +inline v_float32x4 v_abs(v_float32x4 x) +{ + return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4); +} + +inline v_float64x2 v_abs(v_float64x2 x) +{ + return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2); +} + +inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) +{ + vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4); + return (v_float32x4)vfsgnjx_vv_f32m1(ret, ret, 4); +} + +inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) +{ + vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2); + return (v_float64x2)vfsgnjx_vv_f64m1(ret, ret, 2); +} + +#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \ +inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \ + vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \ + vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \ + return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\ +} + +OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16) +OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8) +OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4) + +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){ + vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16); + vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16); + return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16)); +} +inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){ + vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8); + vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8); + return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8)); +} + +#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \ +inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \ + vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\ + vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\ + return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \ +} + +OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16) +OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8) +OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4) + +// Multiply and expand +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + vint16m2_t res = vundefined_i16m2(); + res = vwmul_vv_i16m2(a.val, b.val, 16); + c.val = vget_i16m2_i16m1(res, 0); + d.val = vget_i16m2_i16m1(res, 1); +} + +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + vuint16m2_t res = vundefined_u16m2(); + res = vwmulu_vv_u16m2(a.val, b.val, 16); + c.val = vget_u16m2_u16m1(res, 0); + d.val = vget_u16m2_u16m1(res, 1); +} + +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + vint32m2_t res = vundefined_i32m2(); + res = vwmul_vv_i32m2(a.val, b.val, 8); + c.val = vget_i32m2_i32m1(res, 0); + d.val = vget_i32m2_i32m1(res, 1); +} + +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) 
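+// as in the v_mul_expand overloads above: full-precision products of the 8
+// ushort lanes; c receives the widened products of lanes 0..3, d those of
+// lanes 4..7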
+{ + vuint32m2_t res = vundefined_u32m2(); + res = vwmulu_vv_u32m2(a.val, b.val, 8); + c.val = vget_u32m2_u32m1(res, 0); + d.val = vget_u32m2_u32m1(res, 1); +} + +inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, + v_int64x2& c, v_int64x2& d) +{ + vint64m2_t res = vundefined_i64m2(); + res = vwmul_vv_i64m2(a.val, b.val, 4); + c.val = vget_i64m2_i64m1(res, 0); + d.val = vget_i64m2_i64m1(res, 1); +} + +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + vuint64m2_t res = vundefined_u64m2(); + res = vwmulu_vv_u64m2(a.val, b.val, 4); + c.val = vget_u64m2_u64m1(res, 0); + d.val = vget_u64m2_u64m1(res, 1); +} + +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8) +//////// Dot Product //////// +// 16 >> 32 +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ + vint32m2_t res = vundefined_i32m2(); + res = vwmul_vv_i32m2(a.val, b.val, 8); + res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); + return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4)); +} +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + vint32m2_t res = vundefined_i32m2(); + res = vwmul_vv_i32m2(a.val, b.val, 8); + res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4)); +} + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + vint64m2_t res = vundefined_i64m2(); + res = vwmul_vv_i64m2(a.val, b.val, 4); + res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); + return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + vint64m2_t res = vundefined_i64m2(); + res = vwmul_vv_i64m2(a.val, b.val, 4); + res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2)); +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + vuint16m2_t v1 = vundefined_u16m2(); + vuint32m2_t v2 = vundefined_u32m2(); + v1 = vwmulu_vv_u16m2(a.val, b.val, 16); + v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); + v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4)); +} + +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, + 
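+// (scalar reference for the 8 >> 32 expanding dot products: result lane i is
+//  a[4i]*b[4i] + a[4i+1]*b[4i+1] + a[4i+2]*b[4i+2] + a[4i+3]*b[4i+3], plus
+//  c[i] in the accumulating overloads; the vrgather index vector regroups the
+//  16 widened products so the two widening adds sum each quad of products)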
const v_uint32x4& c) +{ + vuint16m2_t v1 = vundefined_u16m2(); + vuint32m2_t v2 = vundefined_u32m2(); + v1 = vwmulu_vv_u16m2(a.val, b.val, 16); + v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); + v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4)); +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + vint16m2_t v1 = vundefined_i16m2(); + vint32m2_t v2 = vundefined_i32m2(); + v1 = vwmul_vv_i16m2(a.val, b.val, 16); + v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); + v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4)); +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, + const v_int32x4& c) +{ + vint16m2_t v1 = vundefined_i16m2(); + vint32m2_t v2 = vundefined_i32m2(); + v1 = vwmul_vv_i16m2(a.val, b.val, 16); + v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); + v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4)); +} + +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + vuint32m2_t v1 = vundefined_u32m2(); + vuint64m2_t v2 = vundefined_u64m2(); + v1 = vwmulu_vv_u32m2(a.val, b.val, 8); + v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); + v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2)); +} + +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, + const v_uint64x2& c) +{ + vuint32m2_t v1 = vundefined_u32m2(); + vuint64m2_t v2 = vundefined_u64m2(); + v1 = vwmulu_vv_u32m2(a.val, b.val, 8); + v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); + v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2)); +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + vint32m2_t v1 = vundefined_i32m2(); + vint64m2_t v2 = vundefined_i64m2(); + v1 = vwmul_vv_i32m2(a.val, b.val, 8); + v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); + v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2)); +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, + const v_int64x2& c) +{ + vint32m2_t v1 = vundefined_i32m2(); + vint64m2_t v2 = vundefined_i64m2(); + v1 = vwmul_vv_i32m2(a.val, b.val, 8); + v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); + v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2)); +} + +//////// Fast Dot Product //////// +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ + vint32m2_t v1 = vundefined_i32m2(); + v1 = vwmul_vv_i32m2(a.val, b.val, 8); + return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), 
vget_i32m2_i32m1(v1, 1), 4));
+}
+
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    vint32m2_t v1 = vundefined_i32m2();
+    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    vint64m2_t v1 = vundefined_i64m2();
+    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2));
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    vint64m2_t v1 = vundefined_i64m2();
+    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), c.val, 2));
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+    vuint16m2_t v1 = vundefined_u16m2();
+    vuint32m2_t v2 = vundefined_u32m2();
+    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
+    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
+}
+
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    vuint16m2_t v1 = vundefined_u16m2();
+    vuint32m2_t v2 = vundefined_u32m2();
+    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
+    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
+}
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+    vint16m2_t v1 = vundefined_i16m2();
+    vint32m2_t v2 = vundefined_i32m2();
+    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
+    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    vint16m2_t v1 = vundefined_i16m2();
+    vint32m2_t v2 = vundefined_i32m2();
+    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
+    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
+}
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vuint32m2_t v1 = vundefined_u32m2();
+    vuint64m2_t v2 = vundefined_u64m2();
+    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
+    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    vuint32m2_t v1 = vundefined_u32m2();
+    vuint64m2_t v2 = vundefined_u64m2();
+    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
+    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
+}
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    vint32m2_t v1 = vundefined_i32m2();
+    vint64m2_t v2 = vundefined_i64m2();
+    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
+    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
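+    // the *_fast variants skip the vrgather reordering used by the exact
+    // versions above: the sum over all lanes is the same, but products are
+    // paired by half-register position rather than by adjacent lanes, so use
+    // them only where the final v_reduce_sum matters, not individual lanes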
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    vint32m2_t v1 = vundefined_i32m2();
+    vint64m2_t v2 = vundefined_i64m2();
+    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
+    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
+}
+
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
+inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
+{\
+    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
+    val = intrin(val, a.val, val, num); \
+    return vmv_x_s_##len##m1_##len(val, num); \
+}
+
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
+inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
+{\
+    v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
+    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
+    return val[0]; \
+}
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
+inline float v_reduce_sum(const v_float32x4& a)
+{
+    vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4);
+    val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4);
+    return vfmv_f_s_f32m1_f32(val, 4);
+}
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2);
+    val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2);
+    return vfmv_f_s_f64m1_f64(val, 2);
+}
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); }
+
+inline int64 v_reduce_sum(const v_int64x2& a)
+{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); }
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8,  i8,  int, func, red##func, 16) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8,  u8,  unsigned, func, red##func##u, 16) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
+    vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
+    vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
+    vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
+    a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4);
+    b0 = 
vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4); + c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4); + d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4); + return v_float32x4(a0[0], b0[0], c0[0], d0[0]); +} + +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4); + vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4); + vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4); + vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4); + a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4); + return a0[0]; +} + +#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \ +inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){ \ + _Tpvec2 x = v_absdiff(a, b); \ + return v_reduce_sum(x); \ +} + +OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16) +OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16) +OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8) +OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8) +OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4) +OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4) + +#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \ + return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ +} \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \ + return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ +} \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \ + return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ +} \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \ + return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ +} \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \ + return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ +} \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ \ + vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \ + return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ +} \ + +OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_) +OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_) +OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_) +OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_) +OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_) +OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_) +OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_) +OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_) + +//TODO: == +inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b) +{ + vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4); + vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); + return v_float32x4((vfloat32m1_t)res); +} +inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b) +{ + vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4); + vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); + return v_float32x4((vfloat32m1_t)res); +} +inline v_float32x4 operator < (const 
v_float32x4& a, const v_float32x4& b) +{ + vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4); + vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); + return v_float32x4((vfloat32m1_t)res); +} +inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b) +{ + vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4); + vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); + return v_float32x4((vfloat32m1_t)res); +} +inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b) +{ + vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4); + vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); + return v_float32x4((vfloat32m1_t)res); +} +inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b) +{ + vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4); + vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); + return v_float32x4((vfloat32m1_t)res); +} +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ + vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4); + vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); + return v_float32x4((vfloat32m1_t)res); +} + +//TODO: == +inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b) +{ + vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2); + vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); + return v_float64x2((vfloat64m1_t)res); +} +inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b) +{ + vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2); + vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); + return v_float64x2((vfloat64m1_t)res); +} +inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b) +{ + vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2); + vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); + return v_float64x2((vfloat64m1_t)res); +} +inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b) +{ + vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2); + vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); + return v_float64x2((vfloat64m1_t)res); +} +inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b) +{ + vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2); + vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); + return v_float64x2((vfloat64m1_t)res); +} +inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b) +{ + vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2); + vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); + return v_float64x2((vfloat64m1_t)res); +} +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ + vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2); + vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); + return v_float64x2((vfloat64m1_t)res); +} +#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \ +inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \ + const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \ + v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \ + v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \ +{ \ + v##_Tp##32m4_t val = vundefined_##_T##m4(); \ + val = vset_##_T##m4(val, 0, a0.val); \ + val = vset_##_T##m4(val, 1, a1.val); \ + val = vset_##_T##m4(val, 2, a2.val); \ + val = vset_##_T##m4(val, 3, a3.val); \ + val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 
16); \
+    b0.val = vget_##_T##m4_##_T##m1(val, 0);   \
+    b1.val = vget_##_T##m4_##_T##m1(val, 1);   \
+    b2.val = vget_##_T##m4_##_T##m1(val, 2);   \
+    b3.val = vget_##_T##m4_##_T##m1(val, 3);   \
+}
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
+
+
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
+
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }
+
+// trade efficiency for convenience
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
+OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
+OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
+
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
+
+#if 0
+#define VUP4(n) {0, 1, 2, 3}
+#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
+#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+#define VUP2(n) {0, 1}
+#endif
+#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
+    tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
+    return _Tpvec(tmp);\
+} \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    suffix##m2_t tmp = vundefined_##_T##m2();    \
+    tmp = vset_##_T##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T##m2(tmp, 1, b.val);    \
+    tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
+    return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    suffix##m2_t tmp = vundefined_##_T##m2();    \
+    tmp = vset_##_T##m2(tmp, 0, b.val);    \
+    tmp = vset_##_T##m2(tmp, 1, a.val);    \
+    tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
+    return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    CV_UNUSED(b); return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, 
u32, 4, 8, vmv_v_x, b32) +OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32) +OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64) +OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64) +OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32) +OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64) + +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \ + vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\ + return _Tpvec(_Tp2##_t(tmp)); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vle_v_##len(ptr, hnum)); }\ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vle_v_##len(ptr, num)); } \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vse_v_##len(ptr, a.val, hnum);}\ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \ + vse_v_##len(ptr, a0, hnum);}\ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vse_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vse_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vse_v_##len(ptr, a.val, num); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vse_v_##len(ptr, a.val, num); } + +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2) + + +////////////// Lookup table access //////////////////// + +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ +#if 1 + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[ 0]], + tab[idx[ 1]], + tab[idx[ 2]], + tab[idx[ 3]], + tab[idx[ 4]], + tab[idx[ 5]], + tab[idx[ 6]], + tab[idx[ 7]], + tab[idx[ 8]], + tab[idx[ 9]], + tab[idx[10]], + tab[idx[11]], + tab[idx[12]], + tab[idx[13]], + tab[idx[14]], + tab[idx[15]] + }; + return v_int8x16(vle_v_i8m1(elems, 16)); +#else + int32xm4_t index32 = vlev_int32xm4(idx, 16); + vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16); + vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16); + return v_int8x16(vlxbv_i8m1(tab, index, 16)); +#endif +} + +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[4]], + tab[idx[4] + 1], + tab[idx[5]], + tab[idx[5] + 1], + tab[idx[6]], + tab[idx[6] + 1], + tab[idx[7]], 
+ tab[idx[7] + 1] + }; + return v_int8x16(vle_v_i8m1(elems, 16)); +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[2] + 2], + tab[idx[2] + 3], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[3] + 2], + tab[idx[3] + 3] + }; + return v_int8x16(vle_v_i8m1(elems, 16)); +} + +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]], + tab[idx[4]], + tab[idx[5]], + tab[idx[6]], + tab[idx[7]] + }; + return v_int16x8(vle_v_i16m1(elems, 8)); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1] + }; + return v_int16x8(vle_v_i16m1(elems, 8)); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3] + }; + return v_int16x8(vle_v_i16m1(elems, 8)); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_int32x4(vle_v_i32m1(elems, 4)); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1] + }; + return v_int32x4(vle_v_i32m1(elems, 4)); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(vle_v_i32m1(tab+idx[0], 4)); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + vint64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_int64x2(res); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(vle_v_i64m1(tab+idx[0], 2)); +} + +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) +{ + vuint64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_uint64x2(res); +} +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) +{ + 
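+    // idx[0] alone is enough here: lanes tab[idx[0]] and tab[idx[0] + 1] are
+    // contiguous in memory, so one unit-stride load covers the whole pair
+    // (same idea as the 32-bit v_lut_quads above)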
return v_uint64x2(vle_v_u64m1(tab+idx[0], 2));
+}
+
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_float32x4(vle_v_f32m1(elems, 4));
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[0]+1],
+        tab[idx[1]],
+        tab[idx[1]+1]
+    };
+    return v_float32x4(vle_v_f32m1(elems, 4));
+}
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{
+    return v_float32x4(vle_v_f32m1(tab + idx[0], 4));
+}
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
+    return v_float64x2(res);
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{
+    return v_float64x2(vle_v_f64m1(tab+idx[0], 2));
+}
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };
+    return v_int32x4(vle_v_i32m1(elems, 4));
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    unsigned CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };
+    return v_uint32x4(vle_v_u32m1(elems, 4));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };
+    return v_float32x4(vle_v_f32m1(elems, 4));
+}
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
+    return v_float64x2(res);
+}
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4);
+    vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
+
+    x.val = vlxe_v_f32m1(tab, index_x, 4);
+    y.val = vlxe_v_f32m1(tab, index_y, 4);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+
+    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
+    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
+inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, b.val);    \
+    return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
+}\
+template<int n> inline \
+v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, b.val);    \
+    return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
+}\
+inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
+    asm("" ::: "memory");    \
+    vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
+}\
+template<int n> inline \
+void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = 
vset_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ + vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \ +} +OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char) +OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short) +OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int) + +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + vuint16m2_t tmp = vundefined_u16m2(); \ + tmp = vset_u16m2(tmp, 0, a.val); \ + tmp = vset_u16m2(tmp, 1, b.val); \ + return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16)); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + vuint32m4_t vabcd = vundefined_u32m4(); \ + vuint16m2_t v16 = vundefined_u16m2(); \ + vabcd = vset_u32m4(vabcd, 0, a.val); \ + vabcd = vset_u32m4(vabcd, 1, b.val); \ + vabcd = vset_u32m4(vabcd, 2, c.val); \ + vabcd = vset_u32m4(vabcd, 3, d.val); \ + v16 = vnsrl_vx_u16m2(vabcd, 0, 16); + return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + vuint64m8_t v64 = vundefined_u64m8(); \ + vuint32m4_t v32 = vundefined_u32m4(); \ + vuint16m2_t v16 = vundefined_u16m2(); \ + v64 = vset_u64m8(v64, 0, a.val); \ + v64 = vset_u64m8(v64, 1, b.val); \ + v64 = vset_u64m8(v64, 2, c.val); \ + v64 = vset_u64m8(v64, 3, d.val); \ + v64 = vset_u64m8(v64, 4, e.val); \ + v64 = vset_u64m8(v64, 5, f.val); \ + v64 = vset_u64m8(v64, 6, g.val); \ + v64 = vset_u64m8(v64, 7, h.val); \ + v32 = vnsrl_vx_u32m4(v64, 0, 16); + v16 = vnsrl_vx_u16m2(v32, 0, 16); + return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16)); +} + +//inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \ +//{ \ +// int16xm2_u tmp; \ +// tmp.m1[0] = (vint16m1_t)a.val; \ +// tmp.m1[1] = (vint16m1_t)b.val; \ +// e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16);\ +// return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16)); +//} + +#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \ +inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \ +{ \ + vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ + tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_##i##tp2##m2(tmp, 1, b.val); \ + vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ + return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \ +} \ +inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \ +{ \ + vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ + tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ + return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \ +} \ +template inline \ +v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \ +{ \ + vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ + tmp = 
vset_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_##i##tp2##m2(tmp, 1, b.val); \ + vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ + return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \ +} \ +template inline \ +void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \ +{ \ + vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ + tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\ + vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \ + return vse_v_u##tp1##m1(ptr, val, num2);\ +} +OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char ) +OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short) + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) \ + inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + { \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ + } \ + inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ + { a = a * b; return a; } + +OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, v_int32x4) +OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4) + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +static const signed char popCountTable[256] = +{ + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; + +inline vuint8m1_t vcnt_u8(vuint8m1_t val){ + vuint8m1_t v0 = val & 1; + return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0; +} + +inline v_uint8x16 +v_popcount(const v_uint8x16& a) +{ + return v_uint8x16(vcnt_u8(a.val)); +} + +inline v_uint8x16 +v_popcount(const v_int8x16& a) +{ + return v_uint8x16(vcnt_u8((vuint8m1_t)a.val)); +} + +inline v_uint16x8 +v_popcount(const v_uint16x8& a) +{ + vuint8m2_t tmp = vundefined_u8m2(); + tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); + vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0}; + tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ + vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8); + return v_uint16x8(vget_u16m2_u16m1(res, 0)); +} + +inline v_uint16x8 +v_popcount(const v_int16x8& a) +{ + vuint8m2_t tmp = vundefined_u8m2(); + tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); + vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0}; + tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ + vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8); + return v_uint16x8(vget_u16m2_u16m1(res, 0)); +} + +inline v_uint32x4 +v_popcount(const 
v_uint32x4& a) +{ + vuint8m2_t tmp = vundefined_u8m2(); + tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); + vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, + 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; + tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ + vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); + vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); + return v_uint32x4(vget_u32m2_u32m1(res, 0)); +} + +inline v_uint32x4 +v_popcount(const v_int32x4& a) +{ + vuint8m2_t tmp = vundefined_u8m2(); + tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); + vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, + 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; + tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ + vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); + vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); + return v_uint32x4(vget_u32m2_u32m1(res, 0)); +} + +inline v_uint64x2 +v_popcount(const v_uint64x2& a) +{ + vuint8m2_t tmp = vundefined_u8m2(); + tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); + vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, + 0x0F0E0D0C0B0A0908, 0x0000000000000000}; + tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ + vuint8m1_t zero = vmv_v_x_u8m1(0, 16); + vuint8m1_t res1 = zero; + vuint8m1_t res2 = zero; + res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); + res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); + + return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); +} + +inline v_uint64x2 +v_popcount(const v_int64x2& a) +{ + vuint8m2_t tmp = vundefined_u8m2(); + tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); + vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, + 0x0F0E0D0C0B0A0908, 0x0000000000000000}; + tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ + vuint8m1_t zero = vmv_v_x_u8m1(0, 16); + vuint8m1_t res1 = zero; + vuint8m1_t res2 = zero; + res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); + res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); + + return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); +} + +#define SMASK 1, 2, 4, 8, 16, 32, 64, 128 +inline int v_signmask(const v_uint8x16& a) +{ + vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16); + vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; + vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16); + vuint32m1_t res = vmv_v_x_u32m1(0, 4); + vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8); + res = vredsum_vs_u32m2_u32m1(res, t2, res, 8); + res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8); + return vmv_x_s_u32m1_u32(res, 8); +} +inline int v_signmask(const v_int8x16& a) +{ + vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16); + vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; + vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16); + vint32m1_t res = vmv_v_x_i32m1(0, 4); + vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8); + res = vredsum_vs_i32m2_i32m1(res, t2, res, 8); + res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8); + return vmv_x_s_i32m1_i32(res, 8); +} + +inline int v_signmask(const v_int16x8& a) +{ + vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); + vint16m1_t m1 = (vint16m1_t){SMASK}; + vint16m1_t t1 = 
vmul_vv_i16m1(t0, m1, 8); + vint16m1_t res = vmv_v_x_i16m1(0, 8); + res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); + return vmv_x_s_i16m1_i16(res, 8); +} +inline int v_signmask(const v_uint16x8& a) +{ + vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); + vint16m1_t m1 = (vint16m1_t){SMASK}; + vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8); + vint16m1_t res = vmv_v_x_i16m1(0, 8); + res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); + return vmv_x_s_i16m1_i16(res, 8); +} +inline int v_signmask(const v_int32x4& a) +{ + vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4); + vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; + vint32m1_t res = vmv_v_x_i32m1(0, 4); + vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); + res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); + return vmv_x_s_i32m1_i32(res, 4); +} +inline int v_signmask(const v_uint32x4& a) +{ + vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4); + vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; + vint32m1_t res = vmv_v_x_i32m1(0, 4); + vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); + res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); + return vmv_x_s_i32m1_i32(res, 4); +} +inline int v_signmask(const v_uint64x2& a) +{ + vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2); + int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1); + return res; +} +inline int v_signmask(const v_int64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } +inline int v_signmask(const v_float64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } +inline int v_signmask(const v_float32x4& a) +{ + vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4); + vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; + vint32m1_t res = vmv_v_x_i32m1(0, 4); + vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); + res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); + return vmv_x_s_i32m1_i32(res, 4); +} + +inline int v_scan_forward(const v_int8x16& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } +inline int v_scan_forward(const v_uint8x16& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } +inline int v_scan_forward(const v_int16x8& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } +inline int v_scan_forward(const v_uint16x8& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } +inline int v_scan_forward(const v_int32x4& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } +inline int v_scan_forward(const v_uint32x4& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } +inline int v_scan_forward(const v_float32x4& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } +inline int v_scan_forward(const v_int64x2& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } +inline int v_scan_forward(const v_uint64x2& a) { +int val = v_signmask(a); +if(val==0) return 0; +else return trailingZeros32(val); } + +#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \ +inline bool v_check_all(const v_##_Tpvec& a) \ +{ \ + suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \ + vuint64m1_t v1 = vuint64m1_t(v0); \ + return (v1[0] | v1[1]) == 0; \ +} \ +inline bool v_check_any(const v_##_Tpvec& a) \ +{ \ + suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \ + vuint64m1_t v1 = vuint64m1_t(v0); \ + return (v1[0] | v1[1]) != 0; \ +} + 
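The v_check_all / v_check_any helpers defined above (and instantiated just below) test lane MSBs, which OpenCV mask vectors set to all-ones for true lanes. A minimal usage sketch against the portable universal-intrinsics API; the function name, buffer, and threshold here are illustrative, not part of this patch:

    inline bool any_pixel_above(const uchar* data, int n, uchar thr)
    {
        int i = 0;
        for (; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            v_uint8x16 v = v_load(data + i);        // 16 lanes per step
            if (v_check_any(v > v_setall_u8(thr)))  // any all-ones mask lane?
                return true;
        }
        for (; i < n; i++)                          // scalar tail
            if (data[i] > thr)
                return true;
        return false;
    }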
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2) + +inline bool v_check_all(const v_int8x16& a) +{ return v_check_all(v_reinterpret_as_u8(a)); } +inline bool v_check_all(const v_int16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_all(const v_int32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_all(const v_float32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_all(const v_int64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_all(const v_float64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } + +inline bool v_check_any(const v_int8x16& a) +{ return v_check_any(v_reinterpret_as_u8(a)); } +inline bool v_check_any(const v_int16x8& a) +{ return v_check_any(v_reinterpret_as_u16(a)); } +inline bool v_check_any(const v_int32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_float32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_int64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_float64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } + +#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \ +} + +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4) +inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4)); +} +inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b) +{ + return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2)); +} + +#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \ +inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \ +{ \ + _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ + b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \ + b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \ +} \ +inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \ + return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ +} \ +inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ + return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \ +} \ +inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + _T2##_t val = vle##_v_##_Tp1(ptr, num2); \ + _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \ + return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ +} + +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, 
uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1) + +inline v_uint32x4 v_load_expand_q(const uchar* ptr) +{ + vuint16m2_t b = vundefined_u16m2(); + vuint32m2_t c = vundefined_u32m2(); + vuint8m1_t val = vle_v_u8m1(ptr, 4); \ + b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4); \ + c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ + return v_uint32x4(vget_u32m2_u32m1(c, 0)); +} + +inline v_int32x4 v_load_expand_q(const schar* ptr) +{ + vint16m2_t b = vundefined_i16m2(); + vint32m2_t c = vundefined_i32m2(); + vint8m1_t val = vle_v_i8m1(ptr, 4); \ + b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4); \ + c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ + return v_int32x4(vget_i32m2_i32m1(c, 0)); +} +#define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C} +#define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006} +#define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003} +#define VITL_2 (vuint64m2_t){0, 2, 1, 3} +#define LOW_4 0x0000000100000000, 0x0000000500000004 +#define LOW_8 0x0003000200010000, 0x000B000A00090008 +#define LOW_16 0x0706050403020100, 0x1716151413121110 +#define HIGH_4 0x0000000300000002, 0x0000000700000006 +#define HIGH_8 0x0007000600050004, 0x000F000E000D000C +#define HIGH_16 0x0F0E0D0C0B0A0908, 0x1F1E1D1C1B1A1918 +#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \ +inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ +{ \ + v##_Tp##m2_t tmp = vundefined_##_T##m2();\ + tmp = vset_##_T##m2(tmp, 0, a0.val); \ + tmp = vset_##_T##m2(tmp, 1, a1.val); \ + vuint64m2_t mask = VITL_##num; \ + tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \ + b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \ + b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \ +} \ +inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \ + return v_##_Tpvec(b0);\ +} \ +inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ + v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ + v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ + return v_##_Tpvec(b1);\ +} \ +inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \ + v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ + v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ + d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ +} + +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, 
u8, 16, 32, b8, 8)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)
+
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
+    return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16));
+}
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{
+    vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
+    return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16));
+}
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x0000000100020003};
+    return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8));
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{
+    vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x0000000100020003};
+    return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8));
+}
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{
+    return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
+}
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    return v_uint64x2(a.val[1], a.val[0]);
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{
+    return v_int64x2(a.val[1], a.val[0]);
+}
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{
+    return v_float64x2(a.val[1], a.val[0]);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
+template <int n> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ return v_rotate_right<n>(a, b);}
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
+
+
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; }
+
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)
+
+#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return 
_Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); } + +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16) +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16) +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8) +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8) +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4) +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4) +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2) +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2) +OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4) +inline v_int32x4 v_round(const v_float32x4& a) +{ + __builtin_riscv_fsrm(0); + vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); + vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} +inline v_int32x4 v_floor(const v_float32x4& a) +{ + __builtin_riscv_fsrm(2); + vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); + vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + __builtin_riscv_fsrm(3); + vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); + vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ + __builtin_riscv_fsrm(1); + vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); + vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} + +inline v_int32x4 v_round(const v_float64x2& a) +{ + __builtin_riscv_fsrm(0); + vfloat64m2_t _val = vundefined_f64m2(); + _val = vset_f64m2(_val, 0, a.val); + //_val = vset_f64m2(_val, 1, a.val); + _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); + vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + __builtin_riscv_fsrm(0); + vfloat64m2_t _val = vundefined_f64m2(); + _val = vset_f64m2(_val, 0, a.val); + _val = vset_f64m2(_val, 1, b.val); + vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} +inline v_int32x4 v_floor(const v_float64x2& a) +{ + __builtin_riscv_fsrm(2); + vfloat64m2_t _val = vundefined_f64m2(); + _val = vset_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); + + vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); + vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + __builtin_riscv_fsrm(3); + vfloat64m2_t _val = vundefined_f64m2(); + _val = vset_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); + + vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); + vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ + 
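+    // fsrm(1) below selects round-toward-zero; the f64 lanes are first narrowed
+    // to f32, and the default RNE mode (0) is restored before returning.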
__builtin_riscv_fsrm(1); + vfloat64m2_t _val = vundefined_f64m2(); + _val = vset_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); + + vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); + vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); + __builtin_riscv_fsrm(0); + return v_int32x4(val); +} + +#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \ +{ \ + v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\ + a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \ + b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \ +{ \ + v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\ + a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \ + b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \ + c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \ +}\ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \ + v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \ +{ \ + v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\ + a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \ + b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \ + c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \ + d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \ +} \ + +#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \ + ret = vset_##_T##m1x2(ret, 0, a.val); \ + ret = vset_##_T##m1x2(ret, 1, b.val); \ + intrin##2e_v_##_T##m1x2(ptr, ret, num); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ + const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \ + ret = vset_##_T##m1x3(ret, 0, a.val); \ + ret = vset_##_T##m1x3(ret, 1, b.val); \ + ret = vset_##_T##m1x3(ret, 2, c.val); \ + intrin##3e_v_##_T##m1x3(ptr, ret, num); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ + const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \ + ret = vset_##_T##m1x4(ret, 0, a.val); \ + ret = vset_##_T##m1x4(ret, 1, b.val); \ + ret = vset_##_T##m1x4(ret, 2, c.val); \ + ret = vset_##_T##m1x4(ret, 3, d.val); \ + intrin##4e_v_##_T##m1x4(ptr, ret, num); \ +} + +#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \ +OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \ +OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T) + +//OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, ) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32) + +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32) 
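The vlseg/vsseg wrappers instantiated above back the portable v_load_deinterleave / v_store_interleave API. A hedged usage sketch, swapping B and R in a packed 3-channel uchar row; the helper and its names are illustrative, not part of this patch:

    inline void swap_br_row(const uchar* src, uchar* dst, int npixels)
    {
        int i = 0;
        for (; i <= npixels - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
        {
            v_uint8x16 b, g, r;
            v_load_deinterleave(src + 3*i, b, g, r);  // vlseg3e under RVV 0.7.1
            v_store_interleave(dst + 3*i, r, g, b);   // vsseg3e under RVV 0.7.1
        }
        for (; i < npixels; i++)                      // scalar tail
        {
            dst[3*i]   = src[3*i+2];
            dst[3*i+1] = src[3*i+1];
            dst[3*i+2] = src[3*i];
        }
    }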
+ +#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \ +{ \ + v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \ + a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \ + b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \ +{ \ + v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \ + a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \ + b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \ + c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \ +}\ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \ + v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \ +{ \ + v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \ + a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \ + b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \ + c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \ + d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \ + ret = vset_##_T##m1x2(ret, 0, a.val); \ + ret = vset_##_T##m1x2(ret, 1, b.val); \ + vsseg2e_v_##_T##m1x2(ptr, ret, num); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ + const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \ + ret = vset_##_T##m1x3(ret, 0, a.val); \ + ret = vset_##_T##m1x3(ret, 1, b.val); \ + ret = vset_##_T##m1x3(ret, 2, c.val); \ + vsseg3e_v_##_T##m1x3(ptr, ret, num); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ + const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \ + ret = vset_##_T##m1x4(ret, 0, a.val); \ + ret = vset_##_T##m1x4(ret, 1, b.val); \ + ret = vset_##_T##m1x4(ret, 2, c.val); \ + ret = vset_##_T##m1x4(ret, 3, d.val); \ + vsseg4e_v_##_T##m1x4(ptr, ret, num); \ +} +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64) + +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64) + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(vfcvt_f_x_v_f32m1(a.val, 4)); +} + +#if CV_SIMD128_64F +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + vfloat64m2_t _val = vundefined_f64m2(); + _val = vset_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); + return v_float32x4(aval); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + vfloat64m2_t _val = vundefined_f64m2(); + _val = vset_f64m2(_val, 0, a.val); + _val = vset_f64m2(_val, 1, b.val); + vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4); + return v_float32x4(aval); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ + vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4); + vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4); + return v_float64x2(vget_f64m2_f64m1(_val, 0)); +} + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ + vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4); + vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4); + return 
v_float64x2(vget_f64m2_f64m1(_val, 1)); +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4); + return v_float64x2(vget_f64m2_f64m1(_val, 0)); +} + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ + vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4); + return v_float64x2(vget_f64m2_f64m1(_val, 1)); +} + +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ + return v_float64x2(vfcvt_f_x_v_f64m1(a.val, 2)); +} + +#endif +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ + vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08}; + return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16)); +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) +{ + return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); +} + +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ + vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08}; + return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16)); +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) +{ + return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); +} + +inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ + vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; + return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16)); +} +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ + vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504}; + return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); +} +inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504}; + return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); +} +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ + vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; + return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); +} +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ + vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; + return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); +} +inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } +inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } +inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } + +#if CV_SIMD128_64F +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, + const v_float64x2& c) 
+{ return v_dotprod_expand(a, b) + c; } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ + vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4); + vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2); + return v_float64x2(res); +} +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ v_float64x2 res = v_dotprod_expand_fast(a, b); + return res + c; } +#endif +////// FP16 support /////// +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4); + vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4); + return v_float32x4(vget_f32m2_f32m1(v32, 0)); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + vfloat32m2_t v32 = vundefined_f32m2(); + v32 = vset_f32m2(v32, 0, v.val); + vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4); + vse_v_f16m1((__fp16*)ptr, hv, 4); +} + + +inline void v_cleanup() {} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} +#endif diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp index 0cddc90..f88597a 100644 --- a/modules/core/src/arithm.simd.hpp +++ b/modules/core/src/arithm.simd.hpp @@ -1910,4 +1910,4 @@ DEFINE_SIMD_ALL(recip, recip_loop) #define SIMD_GUARD #endif -}} // cv::hal:: \ No newline at end of file +}} // cv::hal:: diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index c68ca45..af4a621 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -395,6 +395,7 @@ struct HWFeatures g_hwFeatureNames[CPU_VSX3] = "VSX3"; g_hwFeatureNames[CPU_MSA] = "CPU_MSA"; + g_hwFeatureNames[CPU_RISCVV] = "RISCVV"; g_hwFeatureNames[CPU_AVX512_COMMON] = "AVX512-COMMON"; g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX"; @@ -590,6 +591,9 @@ struct HWFeatures #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800) have[CV_CPU_NEON] = true; #endif + #ifdef __riscv_vector + have[CV_CPU_RISCVV] = true; + #endif #ifdef __mips_msa have[CV_CPU_MSA] = true; #endif diff --git a/platforms/linux/riscv64-071-gcc.toolchain.cmake b/platforms/linux/riscv64-071-gcc.toolchain.cmake new file mode 100644 index 0000000..934680f --- /dev/null +++ b/platforms/linux/riscv64-071-gcc.toolchain.cmake @@ -0,0 +1,9 @@ +set(CMAKE_SYSTEM_NAME "Linux") +set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++) + +set(CMAKE_CXX_FLAGS "" CACHE STRING "") +set(CMAKE_C_FLAGS "" CACHE STRING "") + +set(CMAKE_CXX_FLAGS "-static -march=rv64gcvxthead -mabi=lp64v -pthread") +set(CMAKE_C_FLAGS "-static -march=rv64gcvxthead -mabi=lp64v -pthread") -- 2.7.4
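A quick functional check of the new backend through the portable API. This smoke test is illustrative only, not part of the patch: build it against a tree configured with the toolchain file above and run it on RVV 0.7.1 hardware or an emulator.

    #include <cstdio>
    #include "opencv2/core/hal/intrin.hpp"

    int main()
    {
        using namespace cv;
        float a[4] = {1.5f, -2.5f, 3.25f, -0.75f};
        int r[4];
        v_store(r, v_round(v_load(a)));  // RNE rounding via the fsrm(0) path
        printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);  // expect: 2 -2 3 -1
        return 0;
    }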