From: Yury Gorbachev
Date: Mon, 4 Aug 2014 12:44:51 +0000 (+0400)
Subject: ARM NEON accelerated implementation of cv::addWeighted, cv::inRange and cv::compare...
X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~206^2~1
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=61423a2d4774228f6c484569329cd64087e4acb8;p=profile%2Fivi%2Fopencv.git

ARM NEON accelerated implementation of cv::addWeighted, cv::inRange and cv::compare functions.

NOT verified on target platform, compilation passes with and without NEON.
---
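For context, the three functions this patch vectorizes are element-wise operations
on cv::Mat. A minimal usage sketch of the public API being accelerated (sizes and
constants are illustrative only, not part of the patch):

    #include <opencv2/core/core.hpp>   // OpenCV 2.x header layout

    int main()
    {
        cv::Mat a(480, 640, CV_8UC1, cv::Scalar(10));
        cv::Mat b(480, 640, CV_8UC1, cv::Scalar(20));
        cv::Mat dst, mask;

        // dst = saturate_cast<uchar>(0.5*a + 0.5*b + 2.0) per element;
        // for 8-bit input this dispatches to addWeighted8u below
        cv::addWeighted(a, 0.5, b, 0.5, 2.0, dst);

        // mask = 255 where 5 <= a <= 15, else 0 (inRange_ / InRange_SIMD below)
        cv::inRange(a, cv::Scalar(5), cv::Scalar(15), mask);

        // mask = 255 where a > b, else 0 (cmp8u below; cmp16s for 16-bit data)
        cv::compare(a, b, mask, cv::CMP_GT);
        return 0;
    }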
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 7ac3672..4c6ba1c 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -2440,6 +2440,34 @@ addWeighted8u( const uchar* src1, size_t step1,
             _mm_storel_epi64((__m128i*)(dst + x), u);
         }
     }
+#elif CV_NEON
+    float32x4_t g = vdupq_n_f32 (gamma);
+
+    for( ; x <= size.width - 8; x += 8 )
+    {
+        uint8x8_t in1 = vld1_u8(src1+x);
+        uint16x8_t in1_16 = vmovl_u8(in1);
+        float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
+        float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
+
+        uint8x8_t in2 = vld1_u8(src2+x);
+        uint16x8_t in2_16 = vmovl_u8(in2);
+        float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
+        float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
+
+        float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
+        float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
+        out_f_l = vaddq_f32(out_f_l, g);
+        out_f_h = vaddq_f32(out_f_h, g);
+
+        uint16x4_t out_16_l = vqmovn_u32(vcvtq_u32_f32(out_f_l));
+        uint16x4_t out_16_h = vqmovn_u32(vcvtq_u32_f32(out_f_h));
+
+        uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
+        uint8x8_t out = vqmovn_u16(out_16);
+
+        vst1_u8(dst+x, out);
+    }
 #endif
 #if CV_ENABLE_UNROLLED
     for( ; x <= size.width - 4; x += 4 )
@@ -2650,6 +2678,14 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
             }
         }

+    #elif CV_NEON
+        uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
+        }
+
    #endif
    for( ; x < size.width; x++ ){
@@ -2674,6 +2710,13 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
            _mm_storeu_si128((__m128i*)(dst + x), r00);
        }
    }
+    #elif CV_NEON
+        uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
+        }
    #endif
    for( ; x < size.width; x++ )
        dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
@@ -2759,6 +2802,22 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
            x += 8;
        }
    }
+    #elif CV_NEON
+        uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            int16x8_t in1 = vld1q_s16(src1 + x);
+            int16x8_t in2 = vld1q_s16(src2 + x);
+            uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
+
+            in1 = vld1q_s16(src1 + x + 8);
+            in2 = vld1q_s16(src2 + x + 8);
+            uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
+
+            vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
+        }
+
    #endif
    for( ; x < size.width; x++ ){
@@ -2797,6 +2856,21 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
            x += 8;
        }
    }
+    #elif CV_NEON
+        uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            int16x8_t in1 = vld1q_s16(src1 + x);
+            int16x8_t in2 = vld1q_s16(src2 + x);
+            uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
+
+            in1 = vld1q_s16(src1 + x + 8);
+            in2 = vld1q_s16(src2 + x + 8);
+            uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
+
+            vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
+        }
    #endif
    for( ; x < size.width; x++ )
        dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
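A note on the comparison hunks above: NEON predicates such as vcgtq_u8 and
vceqq_s16 already produce all-ones (0xFF) lanes where the condition holds and
zero lanes elsewhere, which is exactly the 0/255 output cv::compare requires.
The veorq_u8 with the precomputed mask then does double duty: XOR with 0 keeps
the result for CMP_GT / CMP_EQ, while XOR with 255 inverts it to get CMP_LE /
CMP_NE without a second comparison (the scalar tail uses the same trick with m).
Per lane, the idea is (scalar sketch, illustrative only):

    #include <cstdint>

    // mask is 0x00 for CMP_GT / CMP_EQ, 0xFF for the inverted codes CMP_LE / CMP_NE
    static uint8_t cmp_lane(uint8_t a, uint8_t b, uint8_t mask)
    {
        uint8_t gt = (a > b) ? 0xFF : 0x00; // what vcgtq_u8 yields per lane
        return gt ^ mask;                   // veorq_u8: keep or invert in one op
    }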
@@ -3085,7 +3159,7 @@ namespace cv
 {

 template <typename T>
-struct InRange_SSE
+struct InRange_SIMD
 {
     int operator () (const T *, const T *, const T *, uchar *, int) const
     {
@@ -3096,7 +3170,7 @@ struct InRange_SSE
 #if CV_SSE2

 template <>
-struct InRange_SSE<uchar>
+struct InRange_SIMD<uchar>
 {
     int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
                      uchar * dst, int len) const
@@ -3121,7 +3195,7 @@ struct InRange_SSE
 };

 template <>
-struct InRange_SSE<schar>
+struct InRange_SIMD<schar>
 {
     int operator () (const schar * src1, const schar * src2, const schar * src3,
                      uchar * dst, int len) const
@@ -3146,7 +3220,7 @@ struct InRange_SSE
 };

 template <>
-struct InRange_SSE<ushort>
+struct InRange_SIMD<ushort>
 {
     int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
                      uchar * dst, int len) const
@@ -3172,7 +3246,7 @@ struct InRange_SSE
 };

 template <>
-struct InRange_SSE<short>
+struct InRange_SIMD<short>
 {
     int operator () (const short * src1, const short * src2, const short * src3,
                      uchar * dst, int len) const
@@ -3198,7 +3272,7 @@ struct InRange_SSE
 };

 template <>
-struct InRange_SSE<int>
+struct InRange_SIMD<int>
 {
     int operator () (const int * src1, const int * src2, const int * src3,
                      uchar * dst, int len) const
@@ -3230,7 +3304,7 @@ struct InRange_SSE
 };

 template <>
-struct InRange_SSE<float>
+struct InRange_SIMD<float>
 {
     int operator () (const float * src1, const float * src2, const float * src3,
                      uchar * dst, int len) const
@@ -3261,6 +3335,160 @@ struct InRange_SSE
     }
 };

+#elif CV_NEON
+
+template <>
+struct InRange_SIMD<uchar>
+{
+    int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 16; x += 16 )
+        {
+            uint8x16_t values = vld1q_u8(src1 + x);
+            uint8x16_t low = vld1q_u8(src2 + x);
+            uint8x16_t high = vld1q_u8(src3 + x);
+
+            vst1q_u8(dst + x, vandq_u8(vcgeq_u8(values, low), vcgeq_u8(high, values)));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<schar>
+{
+    int operator () (const schar * src1, const schar * src2, const schar * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 16; x += 16 )
+        {
+            int8x16_t values = vld1q_s8(src1 + x);
+            int8x16_t low = vld1q_s8(src2 + x);
+            int8x16_t high = vld1q_s8(src3 + x);
+
+            vst1q_u8(dst + x, vandq_u8(vcgeq_s8(values, low), vcgeq_s8(high, values)));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<ushort>
+{
+    int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 16; x += 16 )
+        {
+            uint16x8_t values = vld1q_u16((const uint16_t*)(src1 + x));
+            uint16x8_t low = vld1q_u16((const uint16_t*)(src2 + x));
+            uint16x8_t high = vld1q_u16((const uint16_t*)(src3 + x));
+            uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
+
+            values = vld1q_u16((const uint16_t*)(src1 + x + 8));
+            low = vld1q_u16((const uint16_t*)(src2 + x + 8));
+            high = vld1q_u16((const uint16_t*)(src3 + x + 8));
+            uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
+
+            vst1q_u8(dst + x, vcombine_u8(r1, r2));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<short>
+{
+    int operator () (const short * src1, const short * src2, const short * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 16; x += 16 )
+        {
+            int16x8_t values = vld1q_s16((const int16_t*)(src1 + x));
+            int16x8_t low = vld1q_s16((const int16_t*)(src2 + x));
+            int16x8_t high = vld1q_s16((const int16_t*)(src3 + x));
+            uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
+
+            values = vld1q_s16((const int16_t*)(src1 + x + 8));
+            low = vld1q_s16((const int16_t*)(src2 + x + 8));
+            high = vld1q_s16((const int16_t*)(src3 + x + 8));
+            uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
+
+            vst1q_u8(dst + x, vcombine_u8(r1, r2));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<int>
+{
+    int operator () (const int * src1, const int * src2, const int * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 8; x += 8 )
+        {
+            int32x4_t values = vld1q_s32((const int32_t*)(src1 + x));
+            int32x4_t low = vld1q_s32((const int32_t*)(src2 + x));
+            int32x4_t high = vld1q_s32((const int32_t*)(src3 + x));
+
+            uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
+
+            values = vld1q_s32((const int32_t*)(src1 + x + 4));
+            low = vld1q_s32((const int32_t*)(src2 + x + 4));
+            high = vld1q_s32((const int32_t*)(src3 + x + 4));
+
+            uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
+
+            uint16x8_t res_16 = vcombine_u16(r1, r2);
+
+            vst1_u8(dst + x, vmovn_u16(res_16));
+        }
+        return x;
+    }
+};
+
+template <>
+struct InRange_SIMD<float>
+{
+    int operator () (const float * src1, const float * src2, const float * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        for ( ; x <= len - 8; x += 8 )
+        {
+            float32x4_t values = vld1q_f32((const float32_t*)(src1 + x));
+            float32x4_t low = vld1q_f32((const float32_t*)(src2 + x));
+            float32x4_t high = vld1q_f32((const float32_t*)(src3 + x));
+
+            uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
+
+            values = vld1q_f32((const float32_t*)(src1 + x + 4));
+            low = vld1q_f32((const float32_t*)(src2 + x + 4));
+            high = vld1q_f32((const float32_t*)(src3 + x + 4));
+
+            uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
+
+            uint16x8_t res_16 = vcombine_u16(r1, r2);
+
+            vst1_u8(dst + x, vmovn_u16(res_16));
+        }
+        return x;
+    }
+};
+
 #endif

 template <typename T>
@@ -3272,7 +3500,7 @@ static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
     step2 /= sizeof(src2[0]);
     step3 /= sizeof(src3[0]);

-    InRange_SSE<T> vop;
+    InRange_SIMD<T> vop;

     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
     {
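For readers new to the intrinsics, each iteration of the addWeighted8u block
processes eight pixels: widen u8 -> u16 -> f32, scale by alpha/beta, add gamma,
then narrow back with saturation. A scalar sketch of what one lane computes
(written for illustration to mirror the patch, not taken from it):

    #include <algorithm>
    #include <cstdint>

    static inline uint8_t add_weighted_lane(uint8_t s1, uint8_t s2,
                                            float alpha, float beta, float gamma)
    {
        float f = s1 * alpha + s2 * beta + gamma;     // vmulq_n_f32 + vaddq_f32
        // vcvtq_u32_f32 truncates toward zero and saturates negatives to 0;
        // the clamp keeps the cast below well-defined and gives the same u8 result
        f = std::min(std::max(f, 0.0f), 65535.0f);
        uint32_t u = (uint32_t)f;
        u = std::min<uint32_t>(u, 0xFFFF);            // vqmovn_u32: saturating narrow
        return (uint8_t)std::min<uint32_t>(u, 0xFF);  // vqmovn_u16: saturating narrow
    }

Note that this truncating conversion can differ by one LSB from the scalar
fallback, which uses saturate_cast<uchar> (round to nearest); that is consistent
with the commit message's caveat that the code is compile-tested only.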