From 46faf592fcf406c5ddcd679a907473ec68ec51a7 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Sat, 11 Oct 2014 02:34:04 -0700
Subject: [PATCH] cv::warpAffine

Add a NEON path to the INTER_NEAREST branch of the coordinate
computation: four (x, y) pairs are produced per iteration and written
interleaved into xy, and the existing scalar loop handles the remaining
pixels. A NEON variant for the non-nearest branch is included but left
commented out.

---
Review note: the NEON stores were changed from vst1q_s16(vcombine_s16(X, Y))
to vst2_s16 so that xy receives interleaved pairs, matching the scalar loop.
The blob id after ".." in the index line below predates that change.

 modules/imgproc/src/imgwarp.cpp | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 95e0be4..b8833a9 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -4423,13 +4423,32 @@ public:
                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
 
                     if( interpolation == INTER_NEAREST )
-                        for( x1 = 0; x1 < bw; x1++ )
+                    {
+                        x1 = 0;
+                        #if CV_NEON
+                        // Four destination pixels per iteration; the scalar loop below handles the tail.
+                        int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
+                        for( ; x1 <= bw - 4; x1 += 4 )
+                        {
+                            int32x4_t v_X = vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
+                            int32x4_t v_Y = vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
+
+                            // xy holds interleaved (x, y) pairs (see the scalar loop below), so the
+                            // two narrowed vectors must be written with an interleaving store.
+                            int16x4x2_t v_dst;
+                            v_dst.val[0] = vqmovn_s32(vshrq_n_s32(v_X, INTER_BITS));
+                            v_dst.val[1] = vqmovn_s32(vshrq_n_s32(v_Y, INTER_BITS));
+                            vst2_s16(xy + (x1 << 1), v_dst);
+                        }
+                        #endif
+                        for( ; x1 < bw; x1++ )
                         {
                             int X = (X0 + adelta[x+x1]) >> AB_BITS;
                             int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
                             xy[x1*2] = saturate_cast<short>(X);
                             xy[x1*2+1] = saturate_cast<short>(Y);
                         }
+                    }
                     else
                     {
                         short* alpha = A + y1*bw;
@@ -4467,6 +4486,20 @@ public:
                                 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
                             }
                         }
+                    // #elif CV_NEON
+                    //     int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
+                    //     for( ; x1 <= bw - 4; x1 += 4 )
+                    //     {
+                    //         int32x4_t v_X = vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
+                    //         int32x4_t v_Y = vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
+
+                    //         int16x4x2_t v_dst;
+                    //         v_dst.val[0] = vqmovn_s32(vshrq_n_s32(v_X, INTER_BITS));
+                    //         v_dst.val[1] = vqmovn_s32(vshrq_n_s32(v_Y, INTER_BITS));
+                    //         vst2_s16(xy + (x1 << 1), v_dst);
+                    //         vst1_s16(alpha + x1, vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y, v_mask), INTER_BITS),
+                    //                                                  vandq_s32(v_X, v_mask))));
+                    //     }
                     #endif
                         for( ; x1 < bw; x1++ )
                         {
-- 
2.7.4