From: Ilya Lavrenov Date: Wed, 8 Oct 2014 19:50:29 +0000 (-0700) Subject: cv::resize (INTER_LINEAR && INTER_CUBIC) X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~2890^2~3 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=183e378bd069796f30e69c4d597b743b1f6cf003;p=platform%2Fupstream%2Fopencv.git cv::resize (INTER_LINEAR && INTER_CUBIC) --- diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index f4c2cf2..a0b19df 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -49,8 +49,6 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" -#include - #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) static IppStatus sts = ippInit(); #endif @@ -896,6 +894,183 @@ struct VResizeCubicVec_32f } }; +#elif CV_NEON + +typedef VResizeNoVec VResizeLinearVec_32s8u; /* in the CV_NEON branch this name is only an alias for VResizeNoVec, which is defined outside this hunk */ + +struct VResizeLinearVec_32f16u /* Functor: computes S0*beta[0] + S1*beta[1] over two float rows and stores the result as ushort. Returns x, the number of leading elements it wrote; elements from x up to width are not touched here. */ +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); /* each weight broadcast to all four lanes */ + + for( ; x <= width - 8; x += 8 ) /* runs only while at least 8 elements remain: two 4-lane vectors per iteration */ + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); /* vmlaq_f32(a, b, c) yields a + b*c */ + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), /* cv_vrndq_u32_f32 is declared outside this hunk (presumably float to rounded uint32, confirm); vqmovn_u32 narrows to 16 bits with saturation */ + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f16s /* Functor: same two-row weighted sum as the ushort variant, but the destination is short and the signed conversion and narrowing intrinsics are used. Returns the number of leading elements written. */ +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + short* dst = (short*)_dst; + int x 
= 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) /* 8 elements per iteration; a remainder shorter than 8 is left unprocessed */ + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), /* cv_vrndq_s32_f32 is declared outside this hunk (presumably float to rounded int32, confirm); vqmovn_s32 narrows to 16 bits with saturation */ + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f /* Functor: writes S0*beta[0] + S1*beta[1] straight to a float destination with no conversion step. Returns the number of leading elements written. */ +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); + } + + return x; + } +}; + +typedef VResizeNoVec VResizeCubicVec_32s8u; /* alias for VResizeNoVec (defined outside this hunk) in the CV_NEON branch */ + +struct VResizeCubicVec_32f16u /* Functor: computes beta[0]*S0 + beta[1]*S1 + beta[2]*S2 + beta[3]*S3 over four float rows and stores the result as ushort. Returns x, the number of leading elements it wrote. */ +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) /* 8 elements per iteration, as two independent 4-lane accumulations */ + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), /* innermost product is the S0 term; each enclosing vmlaq_f32 adds the next row times its weight */ + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), 
+ v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), /* cv_vrndq_u32_f32 is declared outside this hunk (presumably float to rounded uint32, confirm); vqmovn_u32 narrows to 16 bits with saturation */ + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f16s /* Functor: same four-row weighted sum as the ushort variant, but the destination is short and the signed conversion and narrowing intrinsics are used. Returns the number of leading elements written. */ +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + short* dst = (short*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), /* cv_vrndq_s32_f32 is declared outside this hunk (presumably float to rounded int32, confirm); vqmovn_s32 narrows to 16 bits with saturation */ + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f /* Functor: writes beta[0]*S0 + beta[1]*S1 + beta[2]*S2 + beta[3]*S3 straight to a float destination with no conversion step. Returns the number of leading elements written. */ +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), 
+ v_b3, vld1q_f32(S3 + x))); + vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4))); + } + + return x; /* count of elements written; the remaining width - x elements are not handled inside this functor */ + } +}; + #else typedef VResizeNoVec VResizeLinearVec_32s8u;