From a54f6bb08bbd63c16449f685ce1e9cdf8f10ac13 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 5 Oct 2014 09:48:31 -0700 Subject: [PATCH] cv::cvtColor --- modules/imgproc/perf/perf_blur.cpp | 7 +- modules/imgproc/src/color.cpp | 1375 +++++++++++++++++++++++++++++++++++- modules/imgproc/src/imgwarp.cpp | 8 - 3 files changed, 1369 insertions(+), 21 deletions(-) diff --git a/modules/imgproc/perf/perf_blur.cpp b/modules/imgproc/perf/perf_blur.cpp index a6e31d3..3fc953e 100644 --- a/modules/imgproc/perf/perf_blur.cpp +++ b/modules/imgproc/perf/perf_blur.cpp @@ -98,6 +98,11 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16, Size size = get<0>(GetParam()); int type = get<1>(GetParam()); BorderType btype = get<2>(GetParam()); + double eps = 1e-3; + +#if CV_NEON + eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : eps; +#endif Mat src(size, type); Mat dst(size, type); @@ -106,7 +111,7 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16, TEST_CYCLE() blur(src, dst, Size(16,16), Point(-1,-1), btype); - SANITY_CHECK(dst, 1e-3); + SANITY_CHECK(dst, eps); } PERF_TEST_P(Size_MatType_BorderType3x3, box3x3, diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 78236e0..27da795 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -769,7 +769,6 @@ template struct RGB2Gray float coeffs[3]; }; - template<> struct RGB2Gray { typedef uchar channel_type; @@ -800,6 +799,166 @@ template<> struct RGB2Gray int tab[256*3]; }; +#if CV_NEON + +template <> +struct RGB2Gray +{ + typedef ushort channel_type; + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : + srccn(_srccn) + { + static const int coeffs0[] = { R2Y, G2Y, B2Y }; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0])); + if( blueIdx == 0 ) + std::swap(coeffs[0], coeffs[2]); + + v_cb = vdup_n_u16(coeffs[0]); + v_cg = vdup_n_u16(coeffs[1]); + v_cr = vdup_n_u16(coeffs[2]); + v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; + + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + uint16x8_t v_b, v_r, v_g; + if (scn == 3) + { + uint16x8x3_t v_src = vld3q_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + else + { + uint16x8x4_t v_src = vld4q_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + + uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16( + vmull_u16(vget_low_u16(v_b), v_cb), + vget_low_u16(v_g), v_cg), + vget_low_u16(v_r), v_cr); + uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16( + vmull_u16(vget_high_u16(v_b), v_cb), + vget_high_u16(v_g), v_cg), + vget_high_u16(v_r), v_cr); + + uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift)); + uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift)); + + vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + uint16x4_t v_b, v_r, v_g; + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + else + { + uint16x4x4_t v_src = vld4_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + + uint32x4_t v_dst = vmlal_u16(vmlal_u16( + vmull_u16(v_b, v_cb), + v_g, v_cg), + v_r, v_cr); + + vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift))); + } + + for( ; i < n; i++, src += scn) + dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift); + } + + int srccn, coeffs[3]; + uint16x4_t v_cb, v_cg, v_cr; + uint32x4_t v_delta; +}; + +template <> +struct RGB2Gray +{ + typedef float channel_type; + + RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const float coeffs0[] = { 0.299f, 0.587f, 0.114f }; + memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) ); + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + + v_cb = vdupq_n_f32(coeffs[0]); + v_cg = vdupq_n_f32(coeffs[1]); + v_cr = vdupq_n_f32(coeffs[2]); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, i = 0; + float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + + if (scn == 3) + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + float32x4x3_t v_src = vld3q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + + v_src = vld3q_f32(src + scn * 4); + vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + float32x4x3_t v_src = vld3q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + } + else + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + float32x4x4_t v_src = vld4q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + + v_src = vld4q_f32(src + scn * 4); + vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + float32x4x4_t v_src = vld4q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + } + + for ( ; i < n; i++, src += scn) + dst[i] = src[0]*cb + src[1]*cg + src[2]*cr; + } + + int srccn; + float coeffs[3]; + float32x4_t v_cb, v_cg, v_cr; +}; + +#else template<> struct RGB2Gray { @@ -823,6 +982,7 @@ template<> struct RGB2Gray int coeffs[3]; }; +#endif ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// @@ -855,6 +1015,72 @@ template struct RGB2YCrCb_f float coeffs[5]; }; +#if CV_NEON + +template <> +struct RGB2YCrCb_f +{ + typedef float channel_type; + + RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : + srccn(_srccn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if(blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_c4 = vdupq_n_f32(coeffs[4]); + v_delta = vdupq_n_f32(ColorChannel::half()); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + n *= 3; + + if (scn == 3) + for ( ; i <= n - 12; i += 12, src += 12) + { + float32x4x3_t v_src = vld3q_f32(src), v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); + v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); + + vst3q_f32(dst + i, v_dst); + } + else + for ( ; i <= n - 12; i += 12, src += 16) + { + float32x4x4_t v_src = vld4q_f32(src); + float32x4x3_t v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); + v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); + + vst3q_f32(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; + float Cr = (src[bidx^2] - Y)*C3 + delta; + float Cb = (src[bidx] - Y)*C4 + delta; + dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb; + } + } + int srccn, blueIdx; + float coeffs[5]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; +}; + +#endif template struct RGB2YCrCb_i { @@ -887,6 +1113,224 @@ template struct RGB2YCrCb_i int coeffs[5]; }; +#if CV_NEON + +template <> +struct RGB2YCrCb_i +{ + typedef uchar channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdup_n_s16(coeffs[0]); + v_c1 = vdup_n_s16(coeffs[1]); + v_c2 = vdup_n_s16(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_delta = vdupq_n_s32(ColorChannel::half()*(1 << yuv_shift)); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + } + + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint8x8x3_t v_dst; + int16x8x3_t v_src16; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + } + + int16x4x3_t v_src0; + v_src0.val[0] = vget_low_s16(v_src16.val[0]); + v_src0.val[1] = vget_low_s16(v_src16.val[1]); + v_src0.val[2] = vget_low_s16(v_src16.val[2]); + + int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); + int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3); + v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift); + int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4); + v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift); + + v_src0.val[0] = vget_high_s16(v_src16.val[0]); + v_src0.val[1] = vget_high_s16(v_src16.val[1]); + v_src0.val[2] = vget_high_s16(v_src16.val[2]); + + int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift); + int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3); + v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift); + int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4); + v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift); + + v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1))); + v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1))); + v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1))); + + vst3_u8(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + int srccn, blueIdx, coeffs[5]; + int16x4_t v_c0, v_c1, v_c2; + int32x4_t v_c3, v_c4, v_delta, v_delta2; +}; + +template <> +struct RGB2YCrCb_i +{ + typedef ushort channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_delta = vdupq_n_s32(ColorChannel::half()*(1 << yuv_shift)); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + } + + void operator()(const ushort * src, ushort * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint16x8x3_t v_src, v_dst; + int32x4x3_t v_src0; + + if (scn == 3) + v_src = vld3q_u16(src); + else + { + uint16x8x4_t v_src_ = vld4q_u16(src); + v_src.val[0] = v_src_.val[0]; + v_src.val[1] = v_src_.val[1]; + v_src.val[2] = v_src_.val[2]; + } + + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); + int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3); + v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift); + int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4); + v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift); + + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift); + int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3); + v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift); + int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4); + v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift); + + v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1)); + v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1)); + v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1)); + + vst3q_u16(dst + i, v_dst); + } + + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + uint16x4x3_t v_dst; + int32x4x3_t v_src0; + + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + } + else + { + uint16x4x4_t v_src = vld4_u16(src); + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + } + + int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift); + int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3); + v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift); + int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4); + v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift); + + v_dst.val[0] = vqmovun_s32(v_Y); + v_dst.val[1] = vqmovun_s32(v_Cr); + v_dst.val[2] = vqmovun_s32(v_Cb); + + vst3_u16(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + int srccn, blueIdx, coeffs[5]; + int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; +}; + +#endif template struct YCrCb2RGB_f { @@ -923,6 +1367,80 @@ template struct YCrCb2RGB_f float coeffs[4]; }; +#if CV_NEON + +template <> +struct YCrCb2RGB_f +{ + typedef float channel_type; + + YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_delta = vdupq_n_f32(ColorChannel::half()); + v_alpha = vdupq_n_f32(ColorChannel::max()); + } + + void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(), alpha = ColorChannel::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + if (dcn == 3) + for ( ; i <= n - 12; i += 12, dst += 12) + { + float32x4x3_t v_src = vld3q_f32(src + i), v_dst; + float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2]; + + v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); + v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); + v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); + + vst3q_f32(dst, v_dst); + } + else + for ( ; i <= n - 12; i += 12, dst += 16) + { + float32x4x3_t v_src = vld3q_f32(src + i); + float32x4x4_t v_dst; + float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2]; + + v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); + v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); + v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); + v_dst.val[3] = v_alpha; + + vst4q_f32(dst, v_dst); + } + + for ( ; i < n; i += 3, dst += dcn) + { + float Y = src[i], Cr = src[i+1], Cb = src[i+2]; + + float b = Y + (Cb - delta)*C3; + float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; + float r = Y + (Cr - delta)*C0; + + dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + float coeffs[4]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; +}; + +#endif template struct YCrCb2RGB_i { @@ -962,20 +1480,268 @@ template struct YCrCb2RGB_i int coeffs[4]; }; +#if CV_NEON -////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// - -static const float sRGB2XYZ_D65[] = +template <> +struct YCrCb2RGB_i { - 0.412453f, 0.357580f, 0.180423f, - 0.212671f, 0.715160f, 0.072169f, - 0.019334f, 0.119193f, 0.950227f -}; + typedef uchar channel_type; -static const float XYZ2sRGB_D65[] = -{ - 3.240479f, -1.53715f, -0.498535f, - -0.969256f, 1.875991f, 0.041556f, + YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {22987, -11698, -5636, 29049}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_delta = vdup_n_s16(ColorChannel::half()); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + v_alpha = vdup_n_u8(ColorChannel::max()); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint8x8x3_t v_src = vld3_u8(src + i); + int16x8x3_t v_src16; + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + + int16x4_t v_Y = vget_low_s16(v_src16.val[0]), + v_Cr = vget_low_s16(v_src16.val[1]), + v_Cb = vget_low_s16(v_src16.val[2]); + + int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); + v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); + int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); + v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y); + int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); + v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y); + + v_Y = vget_high_s16(v_src16.val[0]); + v_Cr = vget_high_s16(v_src16.val[1]); + v_Cb = vget_high_s16(v_src16.val[2]); + + int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); + v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y); + int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); + v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y); + int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); + v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y); + + uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1))); + uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1))); + uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1))); + + if (dcn == 3) + { + uint8x8x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3_u8(dst, v_dst); + } + else + { + uint8x8x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + uchar Y = src[i]; + uchar Cr = src[i+1]; + uchar Cb = src[i+2]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[4]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2; + int16x4_t v_delta; + uint8x8_t v_alpha; +}; + +template <> +struct YCrCb2RGB_i +{ + typedef ushort channel_type; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {22987, -11698, -5636, 29049}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_delta = vdupq_n_s32(ColorChannel::half()); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + v_alpha = vdupq_n_u16(ColorChannel::max()); + v_alpha2 = vget_low_u16(v_alpha); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint16x8x3_t v_src = vld3q_u16(src + i); + + int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); + int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y); + int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta)); + v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y); + + v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y); + int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y); + int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta)); + v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y); + + uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1)); + uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1)); + uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1)); + + if (dcn == 3) + { + uint16x8x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3q_u16(dst, v_dst); + } + else + { + uint16x8x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = v_alpha; + vst4q_u16(dst, v_dst); + } + } + + for ( ; i <= n - 12; i += 12, dst += dcn * 4) + { + uint16x4x3_t v_src = vld3_u16(src + i); + + int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + + int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y); + int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y); + int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0); + v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y); + + uint16x4_t v_bd = vqmovun_s32(v_b); + uint16x4_t v_gd = vqmovun_s32(v_g); + uint16x4_t v_rd = vqmovun_s32(v_r); + + if (dcn == 3) + { + uint16x4x3_t v_dst; + v_dst.val[bidx] = v_bd; + v_dst.val[1] = v_gd; + v_dst.val[bidx^2] = v_rd; + vst3_u16(dst, v_dst); + } + else + { + uint16x4x4_t v_dst; + v_dst.val[bidx] = v_bd; + v_dst.val[1] = v_gd; + v_dst.val[bidx^2] = v_rd; + v_dst.val[3] = v_alpha2; + vst4_u16(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + ushort Y = src[i]; + ushort Cr = src[i+1]; + ushort Cb = src[i+2]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[4]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta; + uint16x8_t v_alpha; + uint16x4_t v_alpha2; +}; + +#endif + +////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// + +static const float sRGB2XYZ_D65[] = +{ + 0.412453f, 0.357580f, 0.180423f, + 0.212671f, 0.715160f, 0.072169f, + 0.019334f, 0.119193f, 0.950227f +}; + +static const float XYZ2sRGB_D65[] = +{ + 3.240479f, -1.53715f, -0.498535f, + -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f }; @@ -1013,6 +1779,78 @@ template struct RGB2XYZ_f float coeffs[9]; }; +#if CV_NEON + +template <> +struct RGB2XYZ_f +{ + typedef float channel_type; + + RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0])); + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_c4 = vdupq_n_f32(coeffs[4]); + v_c5 = vdupq_n_f32(coeffs[5]); + v_c6 = vdupq_n_f32(coeffs[6]); + v_c7 = vdupq_n_f32(coeffs[7]); + v_c8 = vdupq_n_f32(coeffs[8]); + } + + void operator()(const float* src, float* dst, int n) const + { + int scn = srccn, i = 0; + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + + n *= 3; + + if (scn == 3) + for ( ; i <= n - 12; i += 12, src += 12) + { + float32x4x3_t v_src = vld3q_f32(src), v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5); + v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8); + vst3q_f32(dst + i, v_dst); + } + else + for ( ; i <= n - 12; i += 12, src += 16) + { + float32x4x4_t v_src = vld4q_f32(src); + float32x4x3_t v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5); + v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8); + vst3q_f32(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + float X = saturate_cast(src[0]*C0 + src[1]*C1 + src[2]*C2); + float Y = saturate_cast(src[0]*C3 + src[1]*C4 + src[2]*C5); + float Z = saturate_cast(src[0]*C6 + src[1]*C7 + src[2]*C8); + dst[i] = X; dst[i+1] = Y; dst[i+2] = Z; + } + } + + int srccn; + float coeffs[9]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; +}; + +#endif template struct RGB2XYZ_i { @@ -1055,6 +1893,247 @@ template struct RGB2XYZ_i int coeffs[9]; }; +#if CV_NEON + +template <> +struct RGB2XYZ_i +{ + typedef uchar channel_type; + + RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const int coeffs0[] = + { + 1689, 1465, 739, + 871, 2929, 296, + 79, 488, 3892 + }; + for( int i = 0; i < 9; i++ ) + coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdup_n_u16(coeffs[0]); + v_c1 = vdup_n_u16(coeffs[1]); + v_c2 = vdup_n_u16(coeffs[2]); + v_c3 = vdup_n_u16(coeffs[3]); + v_c4 = vdup_n_u16(coeffs[4]); + v_c5 = vdup_n_u16(coeffs[5]); + v_c6 = vdup_n_u16(coeffs[6]); + v_c7 = vdup_n_u16(coeffs[7]); + v_c8 = vdup_n_u16(coeffs[8]); + v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); + } + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint8x8x3_t v_dst; + uint16x8x3_t v_src16; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_src16.val[0] = vmovl_u8(v_src.val[0]); + v_src16.val[1] = vmovl_u8(v_src.val[1]); + v_src16.val[2] = vmovl_u8(v_src.val[2]); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_src16.val[0] = vmovl_u8(v_src.val[0]); + v_src16.val[1] = vmovl_u8(v_src.val[1]); + v_src16.val[2] = vmovl_u8(v_src.val[2]); + } + + uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]), + v_s1 = vget_low_u16(v_src16.val[1]), + v_s2 = vget_low_u16(v_src16.val[2]); + + uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift); + + v_s0 = vget_high_u16(v_src16.val[0]), + v_s1 = vget_high_u16(v_src16.val[1]), + v_s2 = vget_high_u16(v_src16.val[2]); + + uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift); + + v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1))); + v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1))); + v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1))); + + vst3_u8(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift); + int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift); + int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift); + dst[i] = saturate_cast(X); + dst[i+1] = saturate_cast(Y); + dst[i+2] = saturate_cast(Z); + } + } + + int srccn, coeffs[9]; + uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint32x4_t v_delta; +}; + +template <> +struct RGB2XYZ_i +{ + typedef ushort channel_type; + + RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const int coeffs0[] = + { + 1689, 1465, 739, + 871, 2929, 296, + 79, 488, 3892 + }; + for( int i = 0; i < 9; i++ ) + coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdup_n_u16(coeffs[0]); + v_c1 = vdup_n_u16(coeffs[1]); + v_c2 = vdup_n_u16(coeffs[2]); + v_c3 = vdup_n_u16(coeffs[3]); + v_c4 = vdup_n_u16(coeffs[4]); + v_c5 = vdup_n_u16(coeffs[5]); + v_c6 = vdup_n_u16(coeffs[6]); + v_c7 = vdup_n_u16(coeffs[7]); + v_c8 = vdup_n_u16(coeffs[8]); + v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); + } + + void operator()(const ushort * src, ushort * dst, int n) const + { + int scn = srccn, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint16x8x3_t v_src, v_dst; + + if (scn == 3) + v_src = vld3q_u16(src); + else + { + uint16x8x4_t v_src4 = vld4q_u16(src); + v_src.val[0] = v_src4.val[0]; + v_src.val[1] = v_src4.val[1]; + v_src.val[2] = v_src4.val[2]; + } + + uint16x4_t v_s0 = vget_low_u16(v_src.val[0]), + v_s1 = vget_low_u16(v_src.val[1]), + v_s2 = vget_low_u16(v_src.val[2]); + + uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift); + + v_s0 = vget_high_u16(v_src.val[0]), + v_s1 = vget_high_u16(v_src.val[1]), + v_s2 = vget_high_u16(v_src.val[2]); + + uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift); + + v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1)); + v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1)); + v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1)); + + vst3q_u16(dst + i, v_dst); + } + + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + uint16x4x3_t v_dst; + uint16x4_t v_s0, v_s1, v_s2; + + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_s0 = v_src.val[0]; + v_s1 = v_src.val[1]; + v_s2 = v_src.val[2]; + } + else + { + uint16x4x4_t v_src = vld4_u16(src); + v_s0 = v_src.val[0]; + v_s1 = v_src.val[1]; + v_s2 = v_src.val[2]; + } + + uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + + v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift)); + v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift)); + v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift)); + + vst3_u16(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift); + int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift); + int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift); + dst[i] = saturate_cast(X); + dst[i+1] = saturate_cast(Y); + dst[i+2] = saturate_cast(Z); + } + } + + int srccn, coeffs[9]; + uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint32x4_t v_delta; +}; + +#endif template struct XYZ2RGB_f { @@ -1141,6 +2220,278 @@ template struct XYZ2RGB_i int coeffs[9]; }; +#if CV_NEON + +template <> +struct XYZ2RGB_i +{ + typedef uchar channel_type; + + XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = + { + 13273, -6296, -2042, + -3970, 7684, 170, + 228, -836, 4331 + }; + for(int i = 0; i < 9; i++) + coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + v_c0 = vdup_n_s16(coeffs[0]); + v_c1 = vdup_n_s16(coeffs[1]); + v_c2 = vdup_n_s16(coeffs[2]); + v_c3 = vdup_n_s16(coeffs[3]); + v_c4 = vdup_n_s16(coeffs[4]); + v_c5 = vdup_n_s16(coeffs[5]); + v_c6 = vdup_n_s16(coeffs[6]); + v_c7 = vdup_n_s16(coeffs[7]); + v_c8 = vdup_n_s16(coeffs[8]); + v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); + v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel::max())); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, i = 0; + uchar alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint8x8x3_t v_src = vld3_u8(src + i); + int16x8x3_t v_src16; + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + + int16x4_t v_s0 = vget_low_s16(v_src16.val[0]), + v_s1 = vget_low_s16(v_src16.val[1]), + v_s2 = vget_low_s16(v_src16.val[2]); + + int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift); + + v_s0 = vget_high_s16(v_src16.val[0]), + v_s1 = vget_high_s16(v_src16.val[1]), + v_s2 = vget_high_s16(v_src16.val[2]); + + int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift); + + uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1))); + uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1))); + uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1))); + + if (dcn == 3) + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3_u8(dst, v_dst); + } + else + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift); + int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift); + int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift); + dst[0] = saturate_cast(B); dst[1] = saturate_cast(G); + dst[2] = saturate_cast(R); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[9]; + + int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint8x8_t v_alpha; + int32x4_t v_delta; +}; + +template <> +struct XYZ2RGB_i +{ + typedef ushort channel_type; + + XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = + { + 13273, -6296, -2042, + -3970, 7684, 170, + 228, -836, 4331 + }; + for(int i = 0; i < 9; i++) + coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_c5 = vdupq_n_s32(coeffs[5]); + v_c6 = vdupq_n_s32(coeffs[6]); + v_c7 = vdupq_n_s32(coeffs[7]); + v_c8 = vdupq_n_s32(coeffs[8]); + v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); + v_alpha = vdupq_n_u16(ColorChannel::max()); + v_alpha2 = vget_low_u16(v_alpha); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int dcn = dstcn, i = 0; + ushort alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint16x8x3_t v_src = vld3q_u16(src + i); + int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))), + v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))), + v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift); + + v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))); + v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))); + v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift); + + uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1)); + uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1)); + uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1)); + + if (dcn == 3) + { + uint16x8x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3q_u16(dst, v_dst); + } + else + { + uint16x8x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha; + vst4q_u16(dst, v_dst); + } + } + + for ( ; i <= n - 12; i += 12, dst += dcn * 4) + { + uint16x4x3_t v_src = vld3_u16(src + i); + int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])), + v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])), + v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + + int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift); + v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift); + v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift); + + uint16x4_t v_b = vqmovun_s32(v_X); + uint16x4_t v_g = vqmovun_s32(v_Y); + uint16x4_t v_r = vqmovun_s32(v_Z); + + if (dcn == 3) + { + uint16x4x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3_u16(dst, v_dst); + } + else + { + uint16x4x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha2; + vst4_u16(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift); + int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift); + int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift); + dst[0] = saturate_cast(B); dst[1] = saturate_cast(G); + dst[2] = saturate_cast(R); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[9]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta; + uint16x4_t v_alpha2; + uint16x8_t v_alpha; +}; + +#endif ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 861b7ae..f4c2cf2 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3580,14 +3580,6 @@ public: _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); } } - #elif CV_NEON - for( ; x1 <= bcols - 4; x1 += 4 ) - { - int32x4_t v_sx = cv_vrndq_s32_f32(vld1q_f32(sX + x1)), - v_sy = cv_vrndq_s32_f32(vld1q_f32(sY + x1)); - int16x4x2_t v_dst = vzip_s16(vqmovn_s32(v_sx), vqmovn_s32(v_sy)); - vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); - } #endif for( ; x1 < bcols; x1++ ) -- 2.7.4