From 236c64a17d4c9e1eac3fb25044bdfeceba1d7463 Mon Sep 17 00:00:00 2001
From: Nicholas Ho <88894303+Nicholas-Ho-arm@users.noreply.github.com>
Date: Sat, 25 Sep 2021 18:43:33 +0100
Subject: [PATCH] Merge pull request #20712 from Nicholas-Ho-arm:3.4_RowVec_8u32f

* Add RowVec_8u32f

* Fix build errors in Linux x64 Debug and armeabi-v7a

* Reformat code to make it more clean and conventional

* Optimise with vx_load_expand_q()
---
Reviewer notes (this section is not part of the commit message):

  * RowVec_8u32f is a row-filter functor taking `const uchar*` input and
    writing `float` output. getLinearRowFilter() now passes it to RowFilter
    for the sdepth == CV_8U && ddepth == CV_32F case, replacing RowNoVec.
  * Each outer iteration covers v_uint8::nlanes elements using four
    v_float32 accumulators. For every tap k in [0, rows + cols - 1) the
    source is read at `_src + i + k * cn`, widened with vx_load_expand_q(),
    converted with v_cvt_f32() and accumulated with v_muladd() against the
    broadcast coefficient `_kx[k]`.
  * operator() returns `i`, the number of elements it has written; anything
    past that index is presumably finished by the caller's scalar loop
    (RowFilter is outside this patch -- confirm there).
  * In the `#else` branch touched by the second hunk, RowVec_8u32f is only a
    typedef for RowNoVec.

 modules/imgproc/src/filter.simd.hpp | 47 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp
index bed6f83..94dbce0 100644
--- a/modules/imgproc/src/filter.simd.hpp
+++ b/modules/imgproc/src/filter.simd.hpp
@@ -465,6 +465,49 @@ struct RowVec_8u32s
     bool smallValues;
 };
 
+struct RowVec_8u32f
+{
+    RowVec_8u32f() {}
+    RowVec_8u32f( const Mat& _kernel ) : kernel(_kernel) {}
+
+    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
+    {
+        CV_INSTRUMENT_REGION();
+
+        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
+        float* dst = (float*)_dst;
+        const float* _kx = kernel.ptr<float>();
+        width *= cn;
+        for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+        {
+            v_float32 s0 = vx_setzero_f32();
+            v_float32 s1 = vx_setzero_f32();
+            v_float32 s2 = vx_setzero_f32();
+            v_float32 s3 = vx_setzero_f32();
+            k = 0;
+            for( ; k < _ksize ; k++ )
+            {
+                v_float32 f = vx_setall_f32(_kx[k]);
+                const uchar* src = (const uchar*)_src + i + k * cn;
+                v_float32 vs_ll = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src)));
+                v_float32 vs_lh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + v_float32::nlanes)));
+                v_float32 vs_hl = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 2*v_float32::nlanes)));
+                v_float32 vs_hh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 3*v_float32::nlanes)));
+                s0 = v_muladd(vs_ll, f, s0);
+                s1 = v_muladd(vs_lh, f, s1);
+                s2 = v_muladd(vs_hl, f, s2);
+                s3 = v_muladd(vs_hh, f, s3);
+            }
+            v_store(dst + i, s0);
+            v_store(dst + i + v_float32::nlanes, s1);
+            v_store(dst + i + 2*v_float32::nlanes, s2);
+            v_store(dst + i + 3*v_float32::nlanes, s3);
+        }
+        return i;
+    }
+
+    Mat kernel;
+};
 
 struct SymmRowSmallVec_8u32s
 {
@@ -2292,6 +2335,7 @@ struct FilterVec_32f
 #else
 
 typedef RowNoVec RowVec_8u32s;
+typedef RowNoVec RowVec_8u32f;
 typedef RowNoVec RowVec_16s32f;
 typedef RowNoVec RowVec_32f;
 typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
@@ -2899,7 +2943,8 @@ Ptr<BaseRowFilter> getLinearRowFilter(
         return makePtr<RowFilter<uchar, int, RowVec_8u32s> >
             (kernel, anchor, RowVec_8u32s(kernel));
     if( sdepth == CV_8U && ddepth == CV_32F )
-        return makePtr<RowFilter<uchar, float, RowNoVec> >(kernel, anchor);
+        return makePtr<RowFilter<uchar, float, RowVec_8u32f> >
+            (kernel, anchor, RowVec_8u32f(kernel));
     if( sdepth == CV_8U && ddepth == CV_64F )
         return makePtr<RowFilter<uchar, double, RowNoVec> >(kernel, anchor);
     if( sdepth == CV_16U && ddepth == CV_32F )
-- 
2.7.4