From 9ca924999280a6919d5e313d9d31e2799b622227 Mon Sep 17 00:00:00 2001
From: Everton Constantino
Date: Fri, 11 Oct 2019 12:32:59 -0300
Subject: [PATCH] Merge pull request #15527 from everton1984:faster_acc

* Adding support for vectorized masking for uchar/ushort.

* Fixing bug where mask was zeroing the dst. Improved the way to calculate
the mask and tweaked for further performance improvements.

* Fixing mask comparison test.

* Restricting to one channel.

* Adding support for 3 channels, switch old approach to start using HAL's
v_select.
---
 modules/imgproc/src/accum.simd.hpp | 194 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 191 insertions(+), 3 deletions(-)

diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp
index 7bca93d..6b0e6d6 100644
--- a/modules/imgproc/src/accum.simd.hpp
+++ b/modules/imgproc/src/accum.simd.hpp
@@ -2624,11 +2624,127 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn
             v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
             v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);
 
-            v_store(dst + x, v_dst00);
-            v_store(dst + x + step, v_dst01);
+            v_store(dst + x           , v_dst00);
+            v_store(dst + x + step    , v_dst01);
             v_store(dst + x + step * 2, v_dst10);
             v_store(dst + x + step * 3, v_dst11);
         }
+    } else {
+        const v_float32 zero = vx_setall_f32((float)0);
+        int size = len * cn;
+
+        if ( cn == 1 ){
+            for (; x <= size - cVectorWidth; x += cVectorWidth)
+            {
+                v_uint8 v_src = vx_load(src + x);
+                v_uint8 v_mask = vx_load(mask + x);
+
+                v_uint16 v_m0, v_m1;
+                v_expand(v_mask, v_m0, v_m1);
+                v_uint32 v_m00, v_m01, v_m10, v_m11;
+                v_expand(v_m0, v_m00, v_m01);
+                v_expand(v_m1, v_m10, v_m11);
+
+                v_float32 v_mf00, v_mf01, v_mf10, v_mf11;
+                v_mf00 = v_cvt_f32(v_reinterpret_as_s32(v_m00));
+                v_mf01 = v_cvt_f32(v_reinterpret_as_s32(v_m01));
+                v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10));
+                v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11));
+
+                v_uint16 v_src0, v_src1;
+                v_expand(v_src, v_src0, v_src1);
+
+                v_uint32 v_src00, v_src01, v_src10, v_src11;
+                v_expand(v_src0, v_src00, v_src01);
+                v_expand(v_src1, v_src10, v_src11);
+
+                v_float32 v_dst00 = vx_load(dst + x);
+                v_float32 v_dst01 = vx_load(dst + x + step);
+                v_float32 v_dst10 = vx_load(dst + x + step * 2);
+                v_float32 v_dst11 = vx_load(dst + x + step * 3);
+
+                v_mf00 = v_mf00 != zero;
+                v_mf01 = v_mf01 != zero;
+                v_mf10 = v_mf10 != zero;
+                v_mf11 = v_mf11 != zero;
+
+                v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00);
+                v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01);
+                v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10);
+                v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11);
+
+                v_store(dst + x           , v_dst00);
+                v_store(dst + x + step    , v_dst01);
+                v_store(dst + x + step * 2, v_dst10);
+                v_store(dst + x + step * 3, v_dst11);
+            }
+        } else if ( cn == 3 )
+        {
+            for (; x*cn <= size - cVectorWidth*cn; x += cVectorWidth )
+            {
+                v_uint8 v_src0, v_src1, v_src2;
+                v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
+
+                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_expand(v_src0, v_src00, v_src01);
+                v_expand(v_src1, v_src10, v_src11);
+                v_expand(v_src2, v_src20, v_src21);
+
+                v_uint32 v_src000, v_src001, v_src010, v_src011, v_src100, v_src101, v_src110, v_src111, v_src200, v_src201, v_src210, v_src211;
+                v_expand(v_src00, v_src000, v_src001);
+                v_expand(v_src01, v_src010, v_src011);
+                v_expand(v_src10, v_src100, v_src101);
+                v_expand(v_src11, v_src110, v_src111);
+                v_expand(v_src20, v_src200, v_src201);
+                v_expand(v_src21, v_src210, v_src211);
+
+                v_float32 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13;
+                v_float32 v_dst20, v_dst21, v_dst22, v_dst23;
+                v_load_deinterleave(dst + x * cn              , v_dst00, v_dst10, v_dst20);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+                v_load_deinterleave(dst + (x + 2 * step) * cn, v_dst02, v_dst12, v_dst22);
+                v_load_deinterleave(dst + (x + 3 * step) * cn, v_dst03, v_dst13, v_dst23);
+
+                v_uint8 v_mask = vx_load(mask + x);
+
+                v_uint16 v_m0, v_m1;
+                v_expand(v_mask, v_m0, v_m1);
+                v_uint32 v_m00, v_m01, v_m10, v_m11;
+                v_expand(v_m0, v_m00, v_m01);
+                v_expand(v_m1, v_m10, v_m11);
+
+                v_float32 v_mf00, v_mf01, v_mf10, v_mf11;
+                v_mf00 = v_cvt_f32(v_reinterpret_as_s32(v_m00));
+                v_mf01 = v_cvt_f32(v_reinterpret_as_s32(v_m01));
+                v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10));
+                v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11));
+
+                v_mf00 = v_mf00 != zero;
+                v_mf01 = v_mf01 != zero;
+                v_mf10 = v_mf10 != zero;
+                v_mf11 = v_mf11 != zero;
+
+                v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src000)) * v_alpha), v_dst00);
+                v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src001)) * v_alpha), v_dst01);
+                v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src010)) * v_alpha), v_dst02);
+                v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src011)) * v_alpha), v_dst03);
+
+                v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src100)) * v_alpha), v_dst10);
+                v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src101)) * v_alpha), v_dst11);
+                v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src110)) * v_alpha), v_dst12);
+                v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src111)) * v_alpha), v_dst13);
+
+                v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src200)) * v_alpha), v_dst20);
+                v_dst21 = v_select(v_mf01, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src201)) * v_alpha), v_dst21);
+                v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src210)) * v_alpha), v_dst22);
+                v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src211)) * v_alpha), v_dst23);
+
+                v_store_interleave(dst + x * cn               , v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + ( x + step ) * cn    , v_dst01, v_dst11, v_dst21);
+                v_store_interleave(dst + ( x + step * 2 ) * cn, v_dst02, v_dst12, v_dst22);
+                v_store_interleave(dst + ( x + step * 3 ) * cn, v_dst03, v_dst13, v_dst23);
+            }
+        }
     }
 #endif // CV_SIMD
     accW_general_(src, dst, mask, len, cn, alpha, x);
@@ -2657,9 +2773,81 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c
             v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha);
             v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha);
 
-            v_store(dst + x, v_dst0);
+            v_store(dst + x       , v_dst0);
             v_store(dst + x + step, v_dst1);
         }
+    } else {
+        const v_float32 zero = vx_setall_f32((float)0);
+        int size = len * cn;
+        if ( cn == 1 )
+        {
+            for (; x <= size - cVectorWidth; x += cVectorWidth)
+            {
+                v_uint16 v_src = vx_load(src + x);
+                v_uint16 v_mask = v_reinterpret_as_u16(vx_load_expand(mask + x));
+
+                v_uint32 v_m0, v_m1;
+                v_expand(v_mask, v_m0, v_m1);
+
+                v_float32 v_mf0, v_mf1;
+                v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0));
+                v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1));
+
+                v_uint32 v_src0, v_src1;
+                v_expand(v_src, v_src0, v_src1);
+
+                v_float32 v_dst0 = vx_load(dst + x);
+                v_float32 v_dst1 = vx_load(dst + x + step);
+
+                v_mf0 = v_mf0 != zero;
+                v_mf1 = v_mf1 != zero;
+
+                v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src0)) * v_alpha), v_dst0);
+                v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src1)) * v_alpha), v_dst1);
+
+                v_store(dst + x       , v_dst0);
+                v_store(dst + x + step, v_dst1);
+            }
+        } else if ( cn == 3 )
+        {
+            for (; x*cn <= size - cVectorWidth*cn; x += cVectorWidth )
+            {
+                v_uint16 v_src0, v_src1, v_src2;
+                v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
+
+                v_uint16 v_mask = v_reinterpret_as_u16(vx_load_expand(mask + x));
+
+                v_uint32 v_m0, v_m1;
+                v_expand(v_mask, v_m0, v_m1);
+
+                v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
+                v_expand(v_src0, v_src00, v_src01);
+                v_expand(v_src1, v_src10, v_src11);
+                v_expand(v_src2, v_src20, v_src21);
+
+                v_float32 v_dst00, v_dst01, v_dst02, v_dst10, v_dst11, v_dst20, v_dst21;
+                v_load_deinterleave(dst + x * cn         , v_dst00, v_dst10, v_dst20);
+                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
+
+                v_float32 v_mf0, v_mf1;
+                v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0));
+                v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1));
+
+                v_mf0 = v_mf0 != zero;
+                v_mf1 = v_mf1 != zero;
+
+                v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00);
+                v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10);
+                v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src20)) * v_alpha), v_dst20);
+
+                v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01);
+                v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11);
+                v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src21)) * v_alpha), v_dst21);
+
+                v_store_interleave(dst + x * cn           , v_dst00, v_dst10, v_dst20);
+                v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21);
+            }
+        }
     }
 #endif // CV_SIMD
     accW_general_(src, dst, mask, len, cn, alpha, x);
--
2.7.4
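
For reference, a minimal standalone sketch of the v_select masking pattern this patch introduces: the accumulate-weighted update dst = dst*beta + src*alpha is computed for every lane and then blended into dst only where the mask lane is non-zero. The helper name accW_masked_block and the float-typed src/mask arrays are illustrative assumptions, not part of the patch; the patch itself widens uchar/ushort sources and the uchar mask with v_expand before reaching this step. All intrinsics used here (vx_load, vx_setall_f32, v_fma, v_select, v_store and the != / * operators) are the same universal-intrinsic calls that appear in the diff above.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

#if CV_SIMD
// Illustrative helper (not part of the patch): masked accumulateWeighted for
// one float vector. maskf holds the mask already converted to float lanes.
static void accW_masked_block(const float* src, float* dst, const float* maskf, float alpha)
{
    const v_float32 v_alpha = vx_setall_f32(alpha);
    const v_float32 v_beta  = vx_setall_f32(1.0f - alpha);
    const v_float32 zero    = vx_setall_f32(0.0f);

    v_float32 v_src  = vx_load(src);
    v_float32 v_dst  = vx_load(dst);
    // Lanes become all-ones where the mask is non-zero, all-zeros elsewhere.
    v_float32 v_mask = vx_load(maskf) != zero;

    // Blend: take the updated value where the mask is set, keep dst otherwise.
    v_dst = v_select(v_mask, v_fma(v_dst, v_beta, v_src * v_alpha), v_dst);
    v_store(dst, v_dst);
}
#endif // CV_SIMD

Because the unselected lanes keep their original dst value, the "mask was zeroing the dst" problem mentioned in the commit message cannot occur with this blend-based formulation.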