From 3812ae7949555a37df578c54e619f8e38f171b6a Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Fri, 18 Jan 2019 19:06:29 +0300 Subject: [PATCH] Merge pull request #13649 from savuor:yuv_wide YUV/YCrCb conversions rewritten to wide intrinsics (#13649) * YUV: minors * YUV42x conversions template-merged * more template-merged YUV42x conversions; some NEON code removed * rgb2yuv vectorized * yuv2rgb vectorized * memcpy removed * Yuv2RGB vectorized * unused code removed * rgb2yuv vectorized * rgb2yuv vectorized * v_pack_u used (up to +30% perf) * yuv2rgb vectorized * fixed compilation --- modules/imgproc/src/color_yuv.cpp | 2317 ++++++++++++------------------------- 1 file changed, 709 insertions(+), 1608 deletions(-) diff --git a/modules/imgproc/src/color_yuv.cpp b/modules/imgproc/src/color_yuv.cpp index f12a92e..acc290a 100644 --- a/modules/imgproc/src/color_yuv.cpp +++ b/modules/imgproc/src/color_yuv.cpp @@ -11,33 +11,33 @@ namespace cv //constants for conversion from/to RGB and YUV, YCrCb according to BT.601 //to YCbCr -const float YCBF = 0.564f; // == 1/2/(1-B2YF) -const float YCRF = 0.713f; // == 1/2/(1-R2YF) -const int YCBI = 9241; // == YCBF*16384 -const int YCRI = 11682; // == YCRF*16384 +static const float YCBF = 0.564f; // == 1/2/(1-B2YF) +static const float YCRF = 0.713f; // == 1/2/(1-R2YF) +static const int YCBI = 9241; // == YCBF*16384 +static const int YCRI = 11682; // == YCRF*16384 //to YUV -const float B2UF = 0.492f; -const float R2VF = 0.877f; -const int B2UI = 8061; // == B2UF*16384 -const int R2VI = 14369; // == R2VF*16384 +static const float B2UF = 0.492f; +static const float R2VF = 0.877f; +static const int B2UI = 8061; // == B2UF*16384 +static const int R2VI = 14369; // == R2VF*16384 //from YUV -const float U2BF = 2.032f; -const float U2GF = -0.395f; -const float V2GF = -0.581f; -const float V2RF = 1.140f; -const int U2BI = 33292; -const int U2GI = -6472; -const int V2GI = -9519; -const int V2RI = 18678; +static const float U2BF = 2.032f; +static const float U2GF = -0.395f; +static const float V2GF = -0.581f; +static const float V2RF = 1.140f; +static const int U2BI = 33292; +static const int U2GI = -6472; +static const int V2GI = -9519; +static const int V2RI = 18678; //from YCrCb -const float CB2BF = 1.773f; -const float CB2GF = -0.344f; -const float CR2GF = -0.714f; -const float CR2RF = 1.403f; -const int CB2BI = 29049; -const int CB2GI = -5636; -const int CR2GI = -11698; -const int CR2RI = 22987; +static const float CB2BF = 1.773f; +static const float CB2GF = -0.344f; +static const float CR2GF = -0.714f; +static const float CR2RF = 1.403f; +static const int CB2BI = 29049; +static const int CB2GI = -5636; +static const int CR2GI = -11698; +static const int CR2RI = 22987; ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// @@ -45,12 +45,17 @@ template struct RGB2YCrCb_f { typedef _Tp channel_type; - RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) + RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : + srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) { static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0])); - if(blueIdx==0) std::swap(coeffs[0], coeffs[2]); + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); } void operator()(const _Tp* src, _Tp* dst, int n) const @@ -73,8 +78,6 @@ template struct RGB2YCrCb_f float coeffs[5]; }; -#if CV_NEON - template <> struct RGB2YCrCb_f { @@ -85,179 +88,92 @@ struct RGB2YCrCb_f { static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0])); - if(blueIdx==0) + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } + if(blueIdx == 0) std::swap(coeffs[0], coeffs[2]); - - v_c0 = vdupq_n_f32(coeffs[0]); - v_c1 = vdupq_n_f32(coeffs[1]); - v_c2 = vdupq_n_f32(coeffs[2]); - v_c3 = vdupq_n_f32(coeffs[3]); - v_c4 = vdupq_n_f32(coeffs[4]); - v_delta = vdupq_n_f32(ColorChannel::half()); } void operator()(const float * src, float * dst, int n) const { - int scn = srccn, bidx = blueIdx, i = 0; + int scn = srccn, bidx = blueIdx; int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb const float delta = ColorChannel::half(); float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - n *= 3; - if (scn == 3) - for ( ; i <= n - 12; i += 12, src += 12) + int i = 0; +#if CV_SIMD + v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); + v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4); + v_float32 vdelta = vx_setall_f32(delta); + const int vsize = v_float32::nlanes; + for( ; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize*3) + { + v_float32 b, g, r, dummy; + if(scn == 3) { - float32x4x3_t v_src = vld3q_f32(src), v_dst; - v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); - v_dst.val[1+yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); - v_dst.val[2-yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); - - vst3q_f32(dst + i, v_dst); + v_load_deinterleave(src, b, g, r); } - else - for ( ; i <= n - 12; i += 12, src += 16) + else { - float32x4x4_t v_src = vld4q_f32(src); - float32x4x3_t v_dst; - v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); - v_dst.val[1+yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); - v_dst.val[2-yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); - - vst3q_f32(dst + i, v_dst); + v_load_deinterleave(src, b, g, r, dummy); } - for ( ; i < n; i += 3, src += scn) - { - float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; - float Cr = (src[bidx^2] - Y)*C3 + delta; - float Cb = (src[bidx] - Y)*C4 + delta; - dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb; - } - } - int srccn, blueIdx; - bool isCrCb; - float coeffs[5]; - float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; -}; + v_float32 y, cr, cb; + y = b*vc0 + g*vc1 + r*vc2; -#elif CV_SSE2 + if(bidx) + std::swap(r, b); -template <> -struct RGB2YCrCb_f -{ - typedef float channel_type; + cr = v_fma(r - y, vc3, vdelta); + cb = v_fma(b - y, vc4, vdelta); - RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : - srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; - static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0])); - if (blueIdx==0) - std::swap(coeffs[0], coeffs[2]); - - v_c0 = _mm_set1_ps(coeffs[0]); - v_c1 = _mm_set1_ps(coeffs[1]); - v_c2 = _mm_set1_ps(coeffs[2]); - v_c3 = _mm_set1_ps(coeffs[3]); - v_c4 = _mm_set1_ps(coeffs[4]); - v_delta = _mm_set1_ps(ColorChannel::half()); - - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - } - - void process(__m128 v_r, __m128 v_g, __m128 v_b, - __m128 & v_y, __m128 & v_cr, __m128 & v_cb) const - { - v_y = _mm_mul_ps(v_r, v_c0); - v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c1)); - v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c2)); - - v_cr = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 0 ? v_b : v_r, v_y), v_c3), v_delta); - v_cb = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 2 ? v_b : v_r, v_y), v_c4), v_delta); - } - - void operator()(const float * src, float * dst, int n) const - { - int scn = srccn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const float delta = ColorChannel::half(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - n *= 3; - - if (haveSIMD) - { - for ( ; i <= n - 24; i += 24, src += 8 * scn) + if(yuvOrder) { - __m128 v_r0 = _mm_loadu_ps(src); - __m128 v_r1 = _mm_loadu_ps(src + 4); - __m128 v_g0 = _mm_loadu_ps(src + 8); - __m128 v_g1 = _mm_loadu_ps(src + 12); - __m128 v_b0 = _mm_loadu_ps(src + 16); - __m128 v_b1 = _mm_loadu_ps(src + 20); - - if (scn == 4) - { - __m128 v_a0 = _mm_loadu_ps(src + 24); - __m128 v_a1 = _mm_loadu_ps(src + 28); - _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, - v_b0, v_b1, v_a0, v_a1); - } - else - _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - - __m128 v_y0, v_cr0, v_cb0; - process(v_r0, v_g0, v_b0, - v_y0, v_cr0, v_cb0); - - __m128 v_y1, v_cr1, v_cb1; - process(v_r1, v_g1, v_b1, - v_y1, v_cr1, v_cb1); - - if(isCrCb) - _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); - else //YUV - { - _mm_interleave_ps(v_y0, v_y1, v_cb0, v_cb1, v_cr0, v_cr1); - } - - _mm_storeu_ps(dst + i, v_y0); - _mm_storeu_ps(dst + i + 4, v_y1); - _mm_storeu_ps(dst + i + 8 + yuvOrder*8, v_cr0); - _mm_storeu_ps(dst + i + 12 + yuvOrder*8, v_cr1); - _mm_storeu_ps(dst + i + 16 - yuvOrder*8, v_cb0); - _mm_storeu_ps(dst + i + 20 - yuvOrder*8, v_cb1); + v_store_interleave(dst, y, cb, cr); + } + else + { + v_store_interleave(dst, y, cr, cb); } } - - for ( ; i < n; i += 3, src += scn) + vx_cleanup(); +#endif + for ( ; i < n; i ++, src += scn, dst += 3) { float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; float Cr = (src[bidx^2] - Y)*C3 + delta; float Cb = (src[bidx] - Y)*C4 + delta; - dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb; + dst[0 ] = Y; + dst[1+yuvOrder] = Cr; + dst[2-yuvOrder] = Cb; } } + int srccn, blueIdx; bool isCrCb; float coeffs[5]; - __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; - bool haveSIMD; }; -#endif template struct RGB2YCrCb_i { typedef _Tp channel_type; + static const int shift = yuv_shift; RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) { static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0])); + + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } if(blueIdx==0) std::swap(coeffs[0], coeffs[2]); } void operator()(const _Tp* src, _Tp* dst, int n) const @@ -265,13 +181,13 @@ template struct RGB2YCrCb_i int scn = srccn, bidx = blueIdx; int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel<_Tp>::half()*(1 << yuv_shift); + int delta = ColorChannel<_Tp>::half()*(1 << shift); n *= 3; for(int i = 0; i < n; i += 3, src += scn) { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift); dst[i] = saturate_cast<_Tp>(Y); dst[i+1+yuvOrder] = saturate_cast<_Tp>(Cr); dst[i+2-yuvOrder] = saturate_cast<_Tp>(Cb); @@ -282,302 +198,167 @@ template struct RGB2YCrCb_i int coeffs[5]; }; -#if CV_NEON -template <> -struct RGB2YCrCb_i +template<> +struct RGB2YCrCb_i { - typedef uchar channel_type; + typedef ushort channel_type; + static const int shift = yuv_shift; + static const int fix_shift = (int)(sizeof(short)*8 - shift); RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) { static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0])); - if (blueIdx==0) - std::swap(coeffs[0], coeffs[2]); - v_c0 = vdup_n_s16(coeffs[0]); - v_c1 = vdup_n_s16(coeffs[1]); - v_c2 = vdup_n_s16(coeffs[2]); - v_c3 = vdupq_n_s32(coeffs[3]); - v_c4 = vdupq_n_s32(coeffs[4]); - v_delta = vdupq_n_s32(ColorChannel::half()*(1 << yuv_shift)); - v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } + if(blueIdx==0) + std::swap(coeffs[0], coeffs[2]); } - void operator()(const uchar * src, uchar * dst, int n) const + void operator()(const ushort* src, ushort* dst, int n) const { - int scn = srccn, bidx = blueIdx, i = 0; + int scn = srccn, bidx = blueIdx; int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel::half()*(1 << yuv_shift); - n *= 3; - - for ( ; i <= n - 24; i += 24, src += scn * 8) + int sdelta = ColorChannel::half()*(1 << shift); + int i = 0; +#if CV_SIMD + const int vsize = v_uint16::nlanes; + const int descale = 1 << (shift-1); + + v_int16 b2y = vx_setall_s16((short)C0); + v_int16 g2y = vx_setall_s16((short)C1); + v_int16 r2y = vx_setall_s16((short)C2); + v_int16 one = vx_setall_s16(1); + v_int16 z = vx_setzero_s16(); + + v_int16 bg2y, r12y; + v_int16 dummy; + v_zip(b2y, g2y, bg2y, dummy); + v_zip(r2y, one, r12y, dummy); + + v_int16 vdescale = vx_setall_s16(1 << (shift-1)); + v_int32 vc3 = vx_setall_s32(C3); + v_int32 vc4 = vx_setall_s32(C4); + v_int32 vdd = vx_setall_s32(sdelta + descale); + + for(; i <= n-vsize; + i += vsize, src += vsize*scn, dst += vsize*3) { - uint8x8x3_t v_dst; - int16x8x3_t v_src16; - - if (scn == 3) + v_uint16 r, g, b, a; + if(scn == 3) { - uint8x8x3_t v_src = vld3_u8(src); - v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); - v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); - v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + v_load_deinterleave(src, b, g, r); } else { - uint8x8x4_t v_src = vld4_u8(src); - v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); - v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); - v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + v_load_deinterleave(src, b, g, r, a); } - int16x4x3_t v_src0; - v_src0.val[0] = vget_low_s16(v_src16.val[0]); - v_src0.val[1] = vget_low_s16(v_src16.val[1]); - v_src0.val[2] = vget_low_s16(v_src16.val[2]); - - int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); - v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); - int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3); - v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift); - int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4); - v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift); - - v_src0.val[0] = vget_high_s16(v_src16.val[0]); - v_src0.val[1] = vget_high_s16(v_src16.val[1]); - v_src0.val[2] = vget_high_s16(v_src16.val[2]); - - int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); - v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift); - int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3); - v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift); - int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4); - v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift); - - v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1))); - v_dst.val[1+yuvOrder] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1))); - v_dst.val[2-yuvOrder] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1))); - - vst3_u8(dst + i, v_dst); - } + v_uint16 y, cr, cb; - for ( ; i < n; i += 3, src += scn) - { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); - dst[i] = saturate_cast(Y); - dst[i+1+yuvOrder] = saturate_cast(Cr); - dst[i+2-yuvOrder] = saturate_cast(Cb); - } - } - int srccn, blueIdx, coeffs[5]; - bool isCrCb; - int16x4_t v_c0, v_c1, v_c2; - int32x4_t v_c3, v_c4, v_delta, v_delta2; -}; + v_int16 sb = v_reinterpret_as_s16(b); + v_int16 sr = v_reinterpret_as_s16(r); + v_int16 sg = v_reinterpret_as_s16(g); -template <> -struct RGB2YCrCb_i -{ - typedef ushort channel_type; + v_int16 bg0, bg1; + v_int16 rd0, rd1; + v_zip(sb, sg, bg0, bg1); + v_zip(sr, vdescale, rd0, rd1); - RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) - : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; - static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0])); - if (blueIdx==0) - std::swap(coeffs[0], coeffs[2]); + // fixing 16bit signed multiplication + v_int16 mr, mg, mb; + mr = (sr < z) & r2y; + mg = (sg < z) & g2y; + mb = (sb < z) & b2y; + v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; - v_c0 = vdupq_n_s32(coeffs[0]); - v_c1 = vdupq_n_s32(coeffs[1]); - v_c2 = vdupq_n_s32(coeffs[2]); - v_c3 = vdupq_n_s32(coeffs[3]); - v_c4 = vdupq_n_s32(coeffs[4]); - v_delta = vdupq_n_s32(ColorChannel::half()*(1 << yuv_shift)); - v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); - } + v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; + v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; - void operator()(const ushort * src, ushort * dst, int n) const - { - int scn = srccn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel::half()*(1 << yuv_shift); - n *= 3; + y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul)); - for ( ; i <= n - 24; i += 24, src += scn * 8) - { - uint16x8x3_t v_src, v_dst; - int32x4x3_t v_src0; + if(bidx) + swap(r, b); - if (scn == 3) - v_src = vld3q_u16(src); - else - { - uint16x8x4_t v_src_ = vld4q_u16(src); - v_src.val[0] = v_src_.val[0]; - v_src.val[1] = v_src_.val[1]; - v_src.val[2] = v_src_.val[2]; - } + // (r-Y) and (b-Y) don't fit into int16 or uint16 range + v_uint32 r0, r1, b0, b1; + v_expand(r, r0, r1); + v_expand(b, b0, b1); - v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))); - v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))); - v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); - - int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); - v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); - int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3); - v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift); - int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4); - v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift); - - v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))); - v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))); - v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); - - int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); - v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift); - int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3); - v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift); - int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4); - v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift); - - v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1)); - v_dst.val[1+yuvOrder] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1)); - v_dst.val[2-yuvOrder] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1)); - - vst3q_u16(dst + i, v_dst); - } + v_uint32 uy0, uy1; + v_expand(y, uy0, uy1); - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - uint16x4x3_t v_dst; - int32x4x3_t v_src0; + v_int32 sr0 = v_reinterpret_as_s32(r0); + v_int32 sr1 = v_reinterpret_as_s32(r1); + v_int32 sb0 = v_reinterpret_as_s32(b0); + v_int32 sb1 = v_reinterpret_as_s32(b1); + v_int32 sy0 = v_reinterpret_as_s32(uy0); + v_int32 sy1 = v_reinterpret_as_s32(uy1); + + sr0 = sr0 - sy0; sr1 = sr1 - sy1; + sb0 = sb0 - sy0; sb1 = sb1 - sy1; - if (scn == 3) + v_int32 scr0, scr1, scb0, scb1; + + scr0 = (sr0*vc3 + vdd) >> shift; + scr1 = (sr1*vc3 + vdd) >> shift; + scb0 = (sb0*vc4 + vdd) >> shift; + scb1 = (sb1*vc4 + vdd) >> shift; + + // saturate and pack + cr = v_pack_u(scr0, scr1); + cb = v_pack_u(scb0, scb1); + + if(yuvOrder) { - uint16x4x3_t v_src = vld3_u16(src); - v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])); - v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])); - v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + v_store_interleave(dst, y, cb, cr); } else { - uint16x4x4_t v_src = vld4_u16(src); - v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])); - v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])); - v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + v_store_interleave(dst, y, cr, cb); } - - int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); - v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift); - int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3); - v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift); - int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4); - v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift); - - v_dst.val[0] = vqmovun_s32(v_Y); - v_dst.val[1+yuvOrder] = vqmovun_s32(v_Cr); - v_dst.val[2-yuvOrder] = vqmovun_s32(v_Cb); - - vst3_u16(dst + i, v_dst); } - - for ( ; i < n; i += 3, src += scn) + vx_cleanup(); +#endif + for( ; i < n; i++, src += scn, dst += 3) { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); - dst[i] = saturate_cast(Y); - dst[i+1+yuvOrder] = saturate_cast(Cr); - dst[i+2-yuvOrder] = saturate_cast(Cb); + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + sdelta, shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + sdelta, shift); + dst[0] = saturate_cast(Y); + dst[1+yuvOrder] = saturate_cast(Cr); + dst[2-yuvOrder] = saturate_cast(Cb); } } - int srccn, blueIdx, coeffs[5]; + int srccn, blueIdx; bool isCrCb; - int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; + int coeffs[5]; }; -#elif CV_SSE4_1 template <> struct RGB2YCrCb_i { typedef uchar channel_type; + static const int shift = yuv_shift; RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) { static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0])); + for(int i = 0; i < 5; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } if (blueIdx==0) std::swap(coeffs[0], coeffs[2]); - - short delta = 1 << (yuv_shift - 1); - v_delta_16 = _mm_set1_epi16(delta); - v_delta_32 = _mm_set1_epi32(delta); - short delta2 = 1 + ColorChannel::half() * 2; - v_coeff = _mm_set_epi16(delta2, (short)coeffs[4], delta2, (short)coeffs[3], delta2, (short)coeffs[4], delta2, (short)coeffs[3]); - if(isCrCb) - v_shuffle2 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xf, 0xe, 0xc, 0xb, 0xa, 0x8, 0x7, 0x6, 0x4, 0x3, 0x2, 0x0); - else //if YUV - v_shuffle2 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xe, 0xf, 0xc, 0xa, 0xb, 0x8, 0x6, 0x7, 0x4, 0x2, 0x3, 0x0); - haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); - } - - // 16u x 8 - void process(__m128i* v_rgb, __m128i & v_crgb, - __m128i* v_rb, uchar * dst) const - { - v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_crgb); - v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_crgb); - v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_crgb); - v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_crgb); - v_rgb[0] = _mm_hadd_epi32(v_rgb[0], v_rgb[1]); - v_rgb[2] = _mm_hadd_epi32(v_rgb[2], v_rgb[3]); - v_rgb[0] = _mm_add_epi32(v_rgb[0], v_delta_32); - v_rgb[2] = _mm_add_epi32(v_rgb[2], v_delta_32); - v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift); - v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift); - __m128i v_y = _mm_packs_epi32(v_rgb[0], v_rgb[2]); - - v_rb[0] = _mm_cvtepu8_epi16(v_rb[0]); - v_rb[1] = _mm_cvtepu8_epi16(v_rb[1]); - v_rb[0] = _mm_sub_epi16(v_rb[0], _mm_unpacklo_epi16(v_y, v_y)); - v_rb[1] = _mm_sub_epi16(v_rb[1], _mm_unpackhi_epi16(v_y, v_y)); - v_rgb[0] = _mm_unpacklo_epi16(v_rb[0], v_delta_16); - v_rgb[1] = _mm_unpackhi_epi16(v_rb[0], v_delta_16); - v_rgb[2] = _mm_unpacklo_epi16(v_rb[1], v_delta_16); - v_rgb[3] = _mm_unpackhi_epi16(v_rb[1], v_delta_16); - v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_coeff); - v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_coeff); - v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_coeff); - v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_coeff); - v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift); - v_rgb[1] = _mm_srai_epi32(v_rgb[1], yuv_shift); - v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift); - v_rgb[3] = _mm_srai_epi32(v_rgb[3], yuv_shift); - v_rgb[0] = _mm_packs_epi32(v_rgb[0], v_rgb[1]); - v_rgb[2] = _mm_packs_epi32(v_rgb[2], v_rgb[3]); - v_rgb[0] = _mm_packus_epi16(v_rgb[0], v_rgb[2]); - - v_rb[0] = _mm_unpacklo_epi16(v_y, v_rgb[0]); - v_rb[1] = _mm_unpackhi_epi16(v_y, v_rgb[0]); - - v_rb[0] = _mm_shuffle_epi8(v_rb[0], v_shuffle2); - v_rb[1] = _mm_shuffle_epi8(v_rb[1], v_shuffle2); - v_rb[1] = _mm_alignr_epi8(v_rb[1], _mm_slli_si128(v_rb[0], 4), 12); - - _mm_storel_epi64((__m128i *)(dst), v_rb[0]); - _mm_storeu_si128((__m128i *)(dst + 8), v_rb[1]); } void operator()(const uchar * src, uchar * dst, int n) const @@ -585,230 +366,157 @@ struct RGB2YCrCb_i int scn = srccn, bidx = blueIdx, i = 0; int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel::half()*(1 << yuv_shift); - n *= 3; - - if (haveSIMD) + int delta = ColorChannel::half()*(1 << shift); + +#if CV_SIMD + const int vsize = v_uint8::nlanes; + const int descaleShift = 1 << (shift-1); + v_int16 bg2y; + v_int16 r12y; + v_int16 dummy; + v_zip(vx_setall_s16((short)C0), vx_setall_s16((short)C1), bg2y, dummy); + v_zip(vx_setall_s16((short)C2), vx_setall_s16( 1), r12y, dummy); + + // delta + descaleShift == descaleShift*(half*2+1) + v_int16 c3h, c4h; + const short h21 = (short)(ColorChannel::half()*2+1); + v_zip(vx_setall_s16((short)C3), vx_setall_s16(h21), c3h, dummy); + v_zip(vx_setall_s16((short)C4), vx_setall_s16(h21), c4h, dummy); + + v_int16 vdescale = vx_setall_s16(descaleShift); + + for( ; i <= n-vsize; + i += vsize, src += scn*vsize, dst += 3*vsize) { - __m128i v_shuffle; - __m128i v_crgb; - if (scn == 4) + v_uint8 r, g, b, a; + if(scn == 3) { - if (bidx == 0) - { - v_shuffle = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc, 0xe, 0x8, 0xa, 0x4, 0x6, 0x0, 0x2); - } - else - { - v_shuffle = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xe, 0xc, 0xa, 0x8, 0x6, 0x4, 0x2, 0x0); - } - v_crgb = _mm_set_epi16(0, (short)C2, (short)C1, (short)C0, 0, (short)C2, (short)C1, (short)C0); - for ( ; i <= n - 24; i += 24, src += scn * 8) - { - __m128i v_src[2]; - v_src[0] = _mm_loadu_si128((__m128i const *)(src)); - v_src[1] = _mm_loadu_si128((__m128i const *)(src + 16)); + v_load_deinterleave(src, b, g, r); + } + else + { + v_load_deinterleave(src, b, g, r, a); + } - __m128i v_rgb[4]; - v_rgb[0] = _mm_cvtepu8_epi16(v_src[0]); - v_rgb[1] = _mm_cvtepu8_epi16(_mm_srli_si128(v_src[0], 8)); - v_rgb[2] = _mm_cvtepu8_epi16(v_src[1]); - v_rgb[3] = _mm_cvtepu8_epi16(_mm_srli_si128(v_src[1], 8)); + v_uint8 y; - __m128i v_rb[2]; - v_rb[0] = _mm_shuffle_epi8(v_src[0], v_shuffle); - v_rb[1] = _mm_shuffle_epi8(v_src[1], v_shuffle); + v_uint16 r0, r1, g0, g1, b0, b1; + v_expand(r, r0, r1); + v_expand(g, g0, g1); + v_expand(b, b0, b1); - process(v_rgb, v_crgb, v_rb, dst + i); - } - } - else + v_int16 sr0, sr1, sg0, sg1, sb0, sb1; + sr0 = v_reinterpret_as_s16(r0); sr1 = v_reinterpret_as_s16(r1); + sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1); + sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1); + + v_uint32 y00, y01, y10, y11; { - if (bidx == 0) - { - v_shuffle = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xb, 0x6, 0x8, 0x3, 0x5, 0x0, 0x2); - } - else - { - v_shuffle = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xb, 0x9, 0x8, 0x6, 0x5, 0x3, 0x2, 0x0); - } - v_crgb = _mm_set_epi16(0, (short)C2, (short)C1, (short)C0, (short)C2, (short)C1, (short)C0, 0); - for ( ; i <= n - 24; i += 24, src += scn * 8) - { - __m128i v_src[2]; - v_src[0] = _mm_loadu_si128((__m128i const *)(src)); - v_src[1] = _mm_loadl_epi64((__m128i const *)(src + 16)); + v_int16 bg00, bg01, bg10, bg11; + v_int16 rd00, rd01, rd10, rd11; + v_zip(sb0, sg0, bg00, bg01); + v_zip(sb1, sg1, bg10, bg11); + v_zip(sr0, vdescale, rd00, rd01); + v_zip(sr1, vdescale, rd10, rd11); + + y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; + y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; + y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; + y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; + } - __m128i v_rgb[4]; - v_rgb[0] = _mm_cvtepu8_epi16(_mm_slli_si128(v_src[0], 1)); - v_rgb[1] = _mm_cvtepu8_epi16(_mm_srli_si128(v_src[0], 5)); - v_rgb[2] = _mm_cvtepu8_epi16(_mm_alignr_epi8(v_src[1], v_src[0], 11)); - v_rgb[3] = _mm_cvtepu8_epi16(_mm_srli_si128(v_src[1], 1)); + v_uint16 y0, y1; + y0 = v_pack(y00, y01); + y1 = v_pack(y10, y11); - __m128i v_rb[2]; - v_rb[0] = _mm_shuffle_epi8(v_src[0], v_shuffle); - v_rb[1] = _mm_shuffle_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 12), v_shuffle); + y = v_pack(y0, y1); - process(v_rgb, v_crgb, v_rb, dst + i); - } + v_int16 sy0, sy1; + sy0 = v_reinterpret_as_s16(y0); + sy1 = v_reinterpret_as_s16(y1); + + // (r-Y) and (b-Y) don't fit into 8 bit, use 16 bits instead + sr0 = v_sub_wrap(sr0, sy0); + sr1 = v_sub_wrap(sr1, sy1); + sb0 = v_sub_wrap(sb0, sy0); + sb1 = v_sub_wrap(sb1, sy1); + + if(bidx) + { + swap(sr0, sb0); swap(sr1, sb1); } - } - for ( ; i < n; i += 3, src += scn) - { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); - dst[i] = saturate_cast(Y); - dst[i+1+yuvOrder] = saturate_cast(Cr); - dst[i+2-yuvOrder] = saturate_cast(Cb); - } - } + v_uint32 cr00, cr01, cr10, cr11; + v_uint32 cb00, cb01, cb10, cb11; - __m128i v_delta_16, v_delta_32; - __m128i v_coeff; - __m128i v_shuffle2; - int srccn, blueIdx, coeffs[5]; - bool isCrCb; - bool haveSIMD; -}; + // delta + descaleShift == descaleShift*(half*2+1) + { + v_int16 rd00, rd01, rd10, rd11; + v_int16 bd00, bd01, bd10, bd11; -template <> -struct RGB2YCrCb_i -{ - typedef ushort channel_type; + v_zip(sr0, vdescale, rd00, rd01); + v_zip(sr1, vdescale, rd10, rd11); - RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) - : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; - static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0])); - if (blueIdx==0) - std::swap(coeffs[0], coeffs[2]); + v_zip(sb0, vdescale, bd00, bd01); + v_zip(sb1, vdescale, bd10, bd11); - v_c0 = _mm_set1_epi32(coeffs[0]); - v_c1 = _mm_set1_epi32(coeffs[1]); - v_c2 = _mm_set1_epi32(coeffs[2]); - v_c3 = _mm_set1_epi32(coeffs[3]); - v_c4 = _mm_set1_epi32(coeffs[4]); - v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); - v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); - v_delta = _mm_add_epi32(v_delta, v_delta2); - v_zero = _mm_setzero_si128(); - - haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); - } + cr00 = v_reinterpret_as_u32(v_dotprod(rd00, c3h)); + cr01 = v_reinterpret_as_u32(v_dotprod(rd01, c3h)); + cr10 = v_reinterpret_as_u32(v_dotprod(rd10, c3h)); + cr11 = v_reinterpret_as_u32(v_dotprod(rd11, c3h)); - // 16u x 8 - void process(__m128i v_r, __m128i v_g, __m128i v_b, - __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const - { - __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero); - __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero); - __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero); - - __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), - _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), - _mm_mullo_epi32(v_b_p, v_c2))); - v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift); - - __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3); - __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4); - v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift); - v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift); - - v_r_p = _mm_unpackhi_epi16(v_r, v_zero); - v_g_p = _mm_unpackhi_epi16(v_g, v_zero); - v_b_p = _mm_unpackhi_epi16(v_b, v_zero); - - __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), - _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), - _mm_mullo_epi32(v_b_p, v_c2))); - v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift); - - __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3); - __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4); - v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift); - v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift); - - v_y = _mm_packus_epi32(v_y0, v_y1); - v_cr = _mm_packus_epi32(v_cr0, v_cr1); - v_cb = _mm_packus_epi32(v_cb0, v_cb1); - } + cb00 = v_reinterpret_as_u32(v_dotprod(bd00, c4h)); + cb01 = v_reinterpret_as_u32(v_dotprod(bd01, c4h)); + cb10 = v_reinterpret_as_u32(v_dotprod(bd10, c4h)); + cb11 = v_reinterpret_as_u32(v_dotprod(bd11, c4h)); + } - void operator()(const ushort * src, ushort * dst, int n) const - { - int scn = srccn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel::half()*(1 << yuv_shift); - n *= 3; + v_uint8 cr, cb; - if (haveSIMD) - { - for ( ; i <= n - 48; i += 48, src += scn * 16) - { - __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); - __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); - __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); - __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); - __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); - __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); - - if (scn == 4) - { - __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48)); - __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56)); + cr00 = cr00 >> shift; + cr01 = cr01 >> shift; + cr10 = cr10 >> shift; + cr11 = cr11 >> shift; - _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, - v_b0, v_b1, v_a0, v_a1); - } - else - _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - - __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; - process(v_r0, v_g0, v_b0, - v_y0, v_cr0, v_cb0); - - __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero; - process(v_r1, v_g1, v_b1, - v_y1, v_cr1, v_cb1); - - if(isCrCb) - _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); - else //YUV - _mm_interleave_epi16(v_y0, v_y1, v_cb0, v_cb1, v_cr0, v_cr1); - - _mm_storeu_si128((__m128i *)(dst + i), v_y0); - _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1); - _mm_storeu_si128((__m128i *)(dst + i + 16 + yuvOrder*16), v_cr0); - _mm_storeu_si128((__m128i *)(dst + i + 24 + yuvOrder*16), v_cr1); - _mm_storeu_si128((__m128i *)(dst + i + 32 - yuvOrder*16), v_cb0); - _mm_storeu_si128((__m128i *)(dst + i + 40 - yuvOrder*16), v_cb1); + cb00 = cb00 >> shift; + cb01 = cb01 >> shift; + cb10 = cb10 >> shift; + cb11 = cb11 >> shift; + + v_uint16 cr0, cr1, cb0, cb1; + cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11); + cb0 = v_pack(cb00, cb01); cb1 = v_pack(cb10, cb11); + + cr = v_pack(cr0, cr1); + cb = v_pack(cb0, cb1); + + if(yuvOrder) + { + v_store_interleave(dst, y, cb, cr); + } + else + { + v_store_interleave(dst, y, cr, cb); } } + vx_cleanup(); +#endif - for ( ; i < n; i += 3, src += scn) + for ( ; i < n; i++, src += scn, dst += 3) { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); - dst[i] = saturate_cast(Y); - dst[i+1+yuvOrder] = saturate_cast(Cr); - dst[i+2-yuvOrder] = saturate_cast(Cb); + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift); + dst[0] = saturate_cast(Y); + dst[1+yuvOrder] = saturate_cast(Cr); + dst[2-yuvOrder] = saturate_cast(Cb); } } int srccn, blueIdx, coeffs[5]; bool isCrCb; - __m128i v_c0, v_c1, v_c2; - __m128i v_c3, v_c4, v_delta, v_delta2; - __m128i v_zero; - bool haveSIMD; }; -#endif // CV_SSE4_1 template struct YCrCb2RGB_f { @@ -819,7 +527,10 @@ template struct YCrCb2RGB_f { static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; - memcpy(coeffs, isCrCb ? coeffs_cbr : coeffs_yuv, 4*sizeof(coeffs[0])); + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? coeffs_cbr[i] : coeffs_yuv[i]; + } } void operator()(const _Tp* src, _Tp* dst, int n) const { @@ -848,9 +559,8 @@ template struct YCrCb2RGB_f float coeffs[4]; }; -#if CV_NEON -template <> +template<> struct YCrCb2RGB_f { typedef float channel_type; @@ -860,170 +570,57 @@ struct YCrCb2RGB_f { static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; - memcpy(coeffs, isCrCb ? coeffs_cbr : coeffs_yuv, 4*sizeof(coeffs[0])); - - v_c0 = vdupq_n_f32(coeffs[0]); - v_c1 = vdupq_n_f32(coeffs[1]); - v_c2 = vdupq_n_f32(coeffs[2]); - v_c3 = vdupq_n_f32(coeffs[3]); - v_delta = vdupq_n_f32(ColorChannel::half()); - v_alpha = vdupq_n_f32(ColorChannel::max()); - } - - void operator()(const float* src, float* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const float delta = ColorChannel::half(), alpha = ColorChannel::max(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; - - if (dcn == 3) - for ( ; i <= n - 12; i += 12, dst += 12) - { - float32x4x3_t v_src = vld3q_f32(src + i), v_dst; - float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1+yuvOrder], v_Cb = v_src.val[2-yuvOrder]; - - v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); - v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); - v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); - - vst3q_f32(dst, v_dst); - } - else - for ( ; i <= n - 12; i += 12, dst += 16) - { - float32x4x3_t v_src = vld3q_f32(src + i); - float32x4x4_t v_dst; - float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1+yuvOrder], v_Cb = v_src.val[2-yuvOrder]; - - v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); - v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); - v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); - v_dst.val[3] = v_alpha; - - vst4q_f32(dst, v_dst); - } - - for ( ; i < n; i += 3, dst += dcn) + for(int i = 0; i < 4; i++) { - float Y = src[i], Cr = src[i+1+yuvOrder], Cb = src[i+2-yuvOrder]; - - float b = Y + (Cb - delta)*C3; - float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; - float r = Y + (Cr - delta)*C0; - - dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; - if( dcn == 4 ) - dst[3] = alpha; + coeffs[i] = isCrCb ? coeffs_cbr[i] : coeffs_yuv[i]; } } - int dstcn, blueIdx; - bool isCrCb; - float coeffs[4]; - float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; -}; - -#elif CV_SSE2 - -template <> -struct YCrCb2RGB_f -{ - typedef float channel_type; - - YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; - static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; - memcpy(coeffs, isCrCb ? coeffs_cbr : coeffs_yuv, 4*sizeof(coeffs[0])); - - v_c0 = _mm_set1_ps(coeffs[0]); - v_c1 = _mm_set1_ps(coeffs[1]); - v_c2 = _mm_set1_ps(coeffs[2]); - v_c3 = _mm_set1_ps(coeffs[3]); - v_delta = _mm_set1_ps(ColorChannel::half()); - v_alpha = _mm_set1_ps(ColorChannel::max()); - - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - } - - void process(__m128 v_y, __m128 v_cr, __m128 v_cb, - __m128 & v_r, __m128 & v_g, __m128 & v_b) const - { - v_cb = _mm_sub_ps(v_cb, v_delta); - v_cr = _mm_sub_ps(v_cr, v_delta); - - if (!isCrCb) - std::swap(v_cb, v_cr); - - v_b = _mm_mul_ps(v_cb, v_c3); - v_g = _mm_add_ps(_mm_mul_ps(v_cb, v_c2), _mm_mul_ps(v_cr, v_c1)); - v_r = _mm_mul_ps(v_cr, v_c0); - - v_b = _mm_add_ps(v_b, v_y); - v_g = _mm_add_ps(v_g, v_y); - v_r = _mm_add_ps(v_r, v_y); - - if (blueIdx == 0) - std::swap(v_b, v_r); - } void operator()(const float* src, float* dst, int n) const { - int dcn = dstcn, bidx = blueIdx, i = 0; + int dcn = dstcn, bidx = blueIdx; int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb const float delta = ColorChannel::half(), alpha = ColorChannel::max(); float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; - - if (haveSIMD) - { - for ( ; i <= n - 24; i += 24, dst += 8 * dcn) - { - __m128 v_y0 = _mm_loadu_ps(src + i); - __m128 v_y1 = _mm_loadu_ps(src + i + 4); - __m128 v_cr0 = _mm_loadu_ps(src + i + 8); - __m128 v_cr1 = _mm_loadu_ps(src + i + 12); - __m128 v_cb0 = _mm_loadu_ps(src + i + 16); - __m128 v_cb1 = _mm_loadu_ps(src + i + 20); - - _mm_deinterleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); - - __m128 v_r0, v_g0, v_b0; - process(v_y0, v_cr0, v_cb0, - v_r0, v_g0, v_b0); - __m128 v_r1, v_g1, v_b1; - process(v_y1, v_cr1, v_cb1, - v_r1, v_g1, v_b1); + int i = 0; +#if CV_SIMD + v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1); + v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3); + v_float32 vdelta = vx_setall_f32(delta); + v_float32 valpha = vx_setall_f32(alpha); + const int vsize = v_float32::nlanes; + for( ; i <= n-vsize; + i += vsize, src += vsize*3, dst += vsize*dcn) + { + v_float32 y, cr, cb; + if(yuvOrder) + v_load_deinterleave(src, y, cb, cr); + else + v_load_deinterleave(src, y, cr, cb); - __m128 v_a0 = v_alpha, v_a1 = v_alpha; + v_float32 b, g, r; - if (dcn == 3) - _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - else - _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, - v_b0, v_b1, v_a0, v_a1); + cb -= vdelta; cr -= vdelta; + b = v_fma(cb, vc3, y); + g = v_fma(cr, vc1, v_fma(cb, vc2, y)); + r = v_fma(cr, vc0, y); - _mm_storeu_ps(dst, v_r0); - _mm_storeu_ps(dst + 4, v_r1); - _mm_storeu_ps(dst + 8, v_g0); - _mm_storeu_ps(dst + 12, v_g1); - _mm_storeu_ps(dst + 16, v_b0); - _mm_storeu_ps(dst + 20, v_b1); + if(bidx) + swap(r, b); - if (dcn == 4) - { - _mm_storeu_ps(dst + 24, v_a0); - _mm_storeu_ps(dst + 28, v_a1); - } - } + if(dcn == 3) + v_store_interleave(dst, b, g, r); + else + v_store_interleave(dst, b, g, r, valpha); } - - for ( ; i < n; i += 3, dst += dcn) + vx_cleanup(); +#endif + for(; i < n; i++, src += 3, dst += dcn) { - float Y = src[i], Cr = src[i+1+yuvOrder], Cb = src[i+2-yuvOrder]; + float Y = src[0]; + float Cr = src[1+yuvOrder]; + float Cb = src[2-yuvOrder]; float b = Y + (Cb - delta)*C3; float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; @@ -1037,23 +634,23 @@ struct YCrCb2RGB_f int dstcn, blueIdx; bool isCrCb; float coeffs[4]; - - __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; - bool haveSIMD; }; -#endif template struct YCrCb2RGB_i { typedef _Tp channel_type; + static const int shift = yuv_shift; YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) { static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 4*sizeof(coeffs[0])); + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } } void operator()(const _Tp* src, _Tp* dst, int n) const @@ -1069,9 +666,9 @@ template struct YCrCb2RGB_i _Tp Cr = src[i+1+yuvOrder]; _Tp Cb = src[i+2-yuvOrder]; - int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + int b = Y + CV_DESCALE((Cb - delta)*C3, shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, shift); dst[bidx] = saturate_cast<_Tp>(b); dst[1] = saturate_cast<_Tp>(g); @@ -1085,27 +682,22 @@ template struct YCrCb2RGB_i int coeffs[4]; }; -#if CV_NEON template <> struct YCrCb2RGB_i { typedef uchar channel_type; + static const int shift = yuv_shift; YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) { static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 4*sizeof(coeffs[0])); - - v_c0 = vdupq_n_s32(coeffs[0]); - v_c1 = vdupq_n_s32(coeffs[1]); - v_c2 = vdupq_n_s32(coeffs[2]); - v_c3 = vdupq_n_s32(coeffs[3]); - v_delta = vdup_n_s16(ColorChannel::half()); - v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); - v_alpha = vdup_n_u8(ColorChannel::max()); + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } } void operator()(const uchar* src, uchar* dst, int n) const @@ -1114,217 +706,124 @@ struct YCrCb2RGB_i int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; - for ( ; i <= n - 24; i += 24, dst += dcn * 8) +#if CV_SIMD + const int vsize = v_uint8::nlanes; + v_uint8 valpha = vx_setall_u8(alpha); + v_uint8 vdelta = vx_setall_u8(delta); + const int descaleShift = 1 << (shift - 1); + v_int32 vdescale = vx_setall_s32(descaleShift); + + v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); + // if YUV then C3 > 2^15, need to subtract it + // to fit in short by short multiplication + v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3); + + for( ; i <= n-vsize; + i += vsize, src += 3*vsize, dst += dcn*vsize) { - uint8x8x3_t v_src = vld3_u8(src + i); - int16x8x3_t v_src16; - v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); - v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); - v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); - - int16x4_t v_Y = vget_low_s16(v_src16.val[0]), - v_Cr = vget_low_s16(v_src16.val[1+yuvOrder]), - v_Cb = vget_low_s16(v_src16.val[2-yuvOrder]); - - int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); - v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); - int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); - v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y); - int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); - v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y); - - v_Y = vget_high_s16(v_src16.val[0]); - v_Cr = vget_high_s16(v_src16.val[1+yuvOrder]); - v_Cb = vget_high_s16(v_src16.val[2-yuvOrder]); - - int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); - v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y); - int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); - v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y); - int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); - v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y); - - uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1))); - uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1))); - uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1))); - - if (dcn == 3) + v_uint8 y, cr, cb; + if(yuvOrder) { - uint8x8x3_t v_dst; - v_dst.val[bidx] = v_b; - v_dst.val[1] = v_g; - v_dst.val[bidx^2] = v_r; - vst3_u8(dst, v_dst); + v_load_deinterleave(src, y, cb, cr); } else { - uint8x8x4_t v_dst; - v_dst.val[bidx] = v_b; - v_dst.val[1] = v_g; - v_dst.val[bidx^2] = v_r; - v_dst.val[3] = v_alpha; - vst4_u8(dst, v_dst); + v_load_deinterleave(src, y, cr, cb); } - } - - for ( ; i < n; i += 3, dst += dcn) - { - uchar Y = src[i]; - uchar Cr = src[i+1+yuvOrder]; - uchar Cb = src[i+2-yuvOrder]; - - int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); - - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - int coeffs[4]; - int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2; - int16x4_t v_delta; - uint8x8_t v_alpha; -}; + cr = v_sub_wrap(cr, vdelta); + cb = v_sub_wrap(cb, vdelta); -template <> -struct YCrCb2RGB_i -{ - typedef ushort channel_type; + v_int8 scr = v_reinterpret_as_s8(cr); + v_int8 scb = v_reinterpret_as_s8(cb); - YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; - static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 4*sizeof(coeffs[0])); - - v_c0 = vdupq_n_s32(coeffs[0]); - v_c1 = vdupq_n_s32(coeffs[1]); - v_c2 = vdupq_n_s32(coeffs[2]); - v_c3 = vdupq_n_s32(coeffs[3]); - v_delta = vdupq_n_s32(ColorChannel::half()); - v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); - v_alpha = vdupq_n_u16(ColorChannel::max()); - v_alpha2 = vget_low_u16(v_alpha); - } + v_int16 scr0, scr1, scb0, scb1; + v_expand(scr, scr0, scr1); + v_expand(scb, scb0, scb1); - void operator()(const ushort* src, ushort* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; + v_int32 b00, b01, b10, b11; + v_int32 g00, g01, g10, g11; + v_int32 r00, r01, r10, r11; - for ( ; i <= n - 24; i += 24, dst += dcn * 8) - { - uint16x8x3_t v_src = vld3q_u16(src + i); - - int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))), - v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1+yuvOrder]))), - v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2-yuvOrder]))); - - int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); - v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); - int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); - v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y); - int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta)); - v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y); - - v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))), - v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1+yuvOrder]))), - v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2-yuvOrder]))); - - int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); - v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y); - int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); - v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y); - int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta)); - v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y); - - uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1)); - uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1)); - uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1)); - - if (dcn == 3) + v_mul_expand(scb0, vc3, b00, b01); + v_mul_expand(scb1, vc3, b10, b11); + if(yuvOrder) { - uint16x8x3_t v_dst; - v_dst.val[bidx] = v_b; - v_dst.val[1] = v_g; - v_dst.val[bidx^2] = v_r; - vst3q_u16(dst, v_dst); + // if YUV then C3 > 2^15 + // so we fix the multiplication + v_int32 cb00, cb01, cb10, cb11; + v_expand(scb0, cb00, cb01); + v_expand(scb1, cb10, cb11); + b00 += cb00 << 15; b01 += cb01 << 15; + b10 += cb10 << 15; b11 += cb11 << 15; } - else - { - uint16x8x4_t v_dst; - v_dst.val[bidx] = v_b; - v_dst.val[1] = v_g; - v_dst.val[bidx^2] = v_r; - v_dst.val[3] = v_alpha; - vst4q_u16(dst, v_dst); - } - } - for ( ; i <= n - 12; i += 12, dst += dcn * 4) - { - uint16x4x3_t v_src = vld3_u16(src + i); - - int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])), - v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1+yuvOrder])), - v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2-yuvOrder])); - - int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); - v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y); - int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); - v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y); - int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0); - v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y); - - uint16x4_t v_bd = vqmovun_s32(v_b); - uint16x4_t v_gd = vqmovun_s32(v_g); - uint16x4_t v_rd = vqmovun_s32(v_r); - - if (dcn == 3) + v_int32 t00, t01, t10, t11; + v_mul_expand(scb0, vc2, t00, t01); + v_mul_expand(scb1, vc2, t10, t11); + v_mul_expand(scr0, vc1, g00, g01); + v_mul_expand(scr1, vc1, g10, g11); + g00 += t00; g01 += t01; + g10 += t10; g11 += t11; + v_mul_expand(scr0, vc0, r00, r01); + v_mul_expand(scr1, vc0, r10, r11); + + b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift; + b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift; + g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift; + g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift; + r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift; + r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift; + + v_int16 b0, b1, g0, g1, r0, r1; + b0 = v_pack(b00, b01); b1 = v_pack(b10, b11); + g0 = v_pack(g00, g01); g1 = v_pack(g10, g11); + r0 = v_pack(r00, r01); r1 = v_pack(r10, r11); + + v_uint16 y0, y1; + v_expand(y, y0, y1); + v_int16 sy0, sy1; + sy0 = v_reinterpret_as_s16(y0); + sy1 = v_reinterpret_as_s16(y1); + + b0 = v_add_wrap(b0, sy0); b1 = v_add_wrap(b1, sy1); + g0 = v_add_wrap(g0, sy0); g1 = v_add_wrap(g1, sy1); + r0 = v_add_wrap(r0, sy0); r1 = v_add_wrap(r1, sy1); + + v_uint8 b, g, r; + b = v_pack_u(b0, b1); + g = v_pack_u(g0, g1); + r = v_pack_u(r0, r1); + + if(bidx) + swap(r, b); + + if(dcn == 3) { - uint16x4x3_t v_dst; - v_dst.val[bidx] = v_bd; - v_dst.val[1] = v_gd; - v_dst.val[bidx^2] = v_rd; - vst3_u16(dst, v_dst); + v_store_interleave(dst, b, g, r); } else { - uint16x4x4_t v_dst; - v_dst.val[bidx] = v_bd; - v_dst.val[1] = v_gd; - v_dst.val[bidx^2] = v_rd; - v_dst.val[3] = v_alpha2; - vst4_u16(dst, v_dst); + v_store_interleave(dst, b, g, r, valpha); } } + vx_cleanup(); +#endif - for ( ; i < n; i += 3, dst += dcn) + for ( ; i < n; i++, src += 3, dst += dcn) { - ushort Y = src[i]; - ushort Cr = src[i+1+yuvOrder]; - ushort Cb = src[i+2-yuvOrder]; + uchar Y = src[0]; + uchar Cr = src[1+yuvOrder]; + uchar Cb = src[2-yuvOrder]; - int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + int b = Y + CV_DESCALE((Cb - delta)*C3, shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, shift); - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); if( dcn == 4 ) dst[3] = alpha; } @@ -1332,348 +831,135 @@ struct YCrCb2RGB_i int dstcn, blueIdx; bool isCrCb; int coeffs[4]; - - int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta; - uint16x8_t v_alpha; - uint16x4_t v_alpha2; }; -#elif CV_SSE2 template <> -struct YCrCb2RGB_i +struct YCrCb2RGB_i { - typedef uchar channel_type; + typedef ushort channel_type; + static const int shift = yuv_shift; YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) { static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 4*sizeof(coeffs[0])); - - v_c0 = _mm_set1_epi16((short)coeffs[0]); - v_c1 = _mm_set1_epi16((short)coeffs[1]); - v_c2 = _mm_set1_epi16((short)coeffs[2]); - v_c3 = _mm_set1_epi16((short)coeffs[3]); - v_delta = _mm_set1_epi16(ColorChannel::half()); - v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); - v_zero = _mm_setzero_si128(); - - uchar alpha = ColorChannel::max(); - v_alpha = _mm_set1_epi8(*(char *)&alpha); - - // when using YUV, one of coefficients is bigger than std::numeric_limits::max(), - //which is not appropriate for SSE - useSSE = isCrCb; - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - } - -#if CV_SSE4_1 - // 16s x 8 - void process(__m128i* v_src, __m128i* v_shuffle, - __m128i* v_coeffs) const - { - __m128i v_ycrcb[3]; - v_ycrcb[0] = _mm_shuffle_epi8(v_src[0], v_shuffle[0]); - v_ycrcb[1] = _mm_shuffle_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 8), v_shuffle[0]); - v_ycrcb[2] = _mm_shuffle_epi8(v_src[1], v_shuffle[0]); - - __m128i v_y[3]; - v_y[1] = _mm_shuffle_epi8(v_src[0], v_shuffle[1]); - v_y[2] = _mm_srli_si128(_mm_shuffle_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 15), v_shuffle[1]), 1); - v_y[0] = _mm_unpacklo_epi8(v_y[1], v_zero); - v_y[1] = _mm_unpackhi_epi8(v_y[1], v_zero); - v_y[2] = _mm_unpacklo_epi8(v_y[2], v_zero); - - __m128i v_rgb[6]; - v_rgb[0] = _mm_unpacklo_epi8(v_ycrcb[0], v_zero); - v_rgb[1] = _mm_unpackhi_epi8(v_ycrcb[0], v_zero); - v_rgb[2] = _mm_unpacklo_epi8(v_ycrcb[1], v_zero); - v_rgb[3] = _mm_unpackhi_epi8(v_ycrcb[1], v_zero); - v_rgb[4] = _mm_unpacklo_epi8(v_ycrcb[2], v_zero); - v_rgb[5] = _mm_unpackhi_epi8(v_ycrcb[2], v_zero); - - v_rgb[0] = _mm_sub_epi16(v_rgb[0], v_delta); - v_rgb[1] = _mm_sub_epi16(v_rgb[1], v_delta); - v_rgb[2] = _mm_sub_epi16(v_rgb[2], v_delta); - v_rgb[3] = _mm_sub_epi16(v_rgb[3], v_delta); - v_rgb[4] = _mm_sub_epi16(v_rgb[4], v_delta); - v_rgb[5] = _mm_sub_epi16(v_rgb[5], v_delta); - - v_rgb[0] = _mm_madd_epi16(v_rgb[0], v_coeffs[0]); - v_rgb[1] = _mm_madd_epi16(v_rgb[1], v_coeffs[1]); - v_rgb[2] = _mm_madd_epi16(v_rgb[2], v_coeffs[2]); - v_rgb[3] = _mm_madd_epi16(v_rgb[3], v_coeffs[0]); - v_rgb[4] = _mm_madd_epi16(v_rgb[4], v_coeffs[1]); - v_rgb[5] = _mm_madd_epi16(v_rgb[5], v_coeffs[2]); - - v_rgb[0] = _mm_add_epi32(v_rgb[0], v_delta2); - v_rgb[1] = _mm_add_epi32(v_rgb[1], v_delta2); - v_rgb[2] = _mm_add_epi32(v_rgb[2], v_delta2); - v_rgb[3] = _mm_add_epi32(v_rgb[3], v_delta2); - v_rgb[4] = _mm_add_epi32(v_rgb[4], v_delta2); - v_rgb[5] = _mm_add_epi32(v_rgb[5], v_delta2); - - v_rgb[0] = _mm_srai_epi32(v_rgb[0], yuv_shift); - v_rgb[1] = _mm_srai_epi32(v_rgb[1], yuv_shift); - v_rgb[2] = _mm_srai_epi32(v_rgb[2], yuv_shift); - v_rgb[3] = _mm_srai_epi32(v_rgb[3], yuv_shift); - v_rgb[4] = _mm_srai_epi32(v_rgb[4], yuv_shift); - v_rgb[5] = _mm_srai_epi32(v_rgb[5], yuv_shift); - - v_rgb[0] = _mm_packs_epi32(v_rgb[0], v_rgb[1]); - v_rgb[2] = _mm_packs_epi32(v_rgb[2], v_rgb[3]); - v_rgb[4] = _mm_packs_epi32(v_rgb[4], v_rgb[5]); - - v_rgb[0] = _mm_add_epi16(v_rgb[0], v_y[0]); - v_rgb[2] = _mm_add_epi16(v_rgb[2], v_y[1]); - v_rgb[4] = _mm_add_epi16(v_rgb[4], v_y[2]); - - v_src[0] = _mm_packus_epi16(v_rgb[0], v_rgb[2]); - v_src[1] = _mm_packus_epi16(v_rgb[4], v_rgb[4]); - } -#endif // CV_SSE4_1 - - // 16s x 8 - void process(__m128i v_y, __m128i v_cr, __m128i v_cb, - __m128i & v_r, __m128i & v_g, __m128i & v_b) const - { - v_cr = _mm_sub_epi16(v_cr, v_delta); - v_cb = _mm_sub_epi16(v_cb, v_delta); - - __m128i v_y_p = _mm_unpacklo_epi16(v_y, v_zero); - - __m128i v_mullo_3 = _mm_mullo_epi16(v_cb, v_c3); - __m128i v_mullo_2 = _mm_mullo_epi16(v_cb, v_c2); - __m128i v_mullo_1 = _mm_mullo_epi16(v_cr, v_c1); - __m128i v_mullo_0 = _mm_mullo_epi16(v_cr, v_c0); - - __m128i v_mulhi_3 = _mm_mulhi_epi16(v_cb, v_c3); - __m128i v_mulhi_2 = _mm_mulhi_epi16(v_cb, v_c2); - __m128i v_mulhi_1 = _mm_mulhi_epi16(v_cr, v_c1); - __m128i v_mulhi_0 = _mm_mulhi_epi16(v_cr, v_c0); - - __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift); - __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2), - _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2), - yuv_shift); - __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift); - - v_r0 = _mm_add_epi32(v_r0, v_y_p); - v_g0 = _mm_add_epi32(v_g0, v_y_p); - v_b0 = _mm_add_epi32(v_b0, v_y_p); - - v_y_p = _mm_unpackhi_epi16(v_y, v_zero); - - __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift); - __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2), - _mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2), - yuv_shift); - __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift); - - v_r1 = _mm_add_epi32(v_r1, v_y_p); - v_g1 = _mm_add_epi32(v_g1, v_y_p); - v_b1 = _mm_add_epi32(v_b1, v_y_p); - - v_r = _mm_packs_epi32(v_r0, v_r1); - v_g = _mm_packs_epi32(v_g0, v_g1); - v_b = _mm_packs_epi32(v_b0, v_b1); + for(int i = 0; i < 4; i++) + { + coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; + } } - void operator()(const uchar* src, uchar* dst, int n) const + void operator()(const ushort* src, ushort* dst, int n) const { int dcn = dstcn, bidx = blueIdx, i = 0; int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); + const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; -#if CV_SSE4_1 - if (checkHardwareSupport(CV_CPU_SSE4_1) && useSSE) +#if CV_SIMD + const int vsize = v_uint16::nlanes; + const int descaleShift = 1 << (shift-1); + v_uint16 valpha = vx_setall_u16(alpha); + v_uint16 vdelta = vx_setall_u16(delta); + v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); + // if YUV then C3 > 2^15, need to subtract it + // to fit in short by short multiplication + v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3); + v_int32 vdescale = vx_setall_s32(descaleShift); + for(; i <= n-vsize; + i += vsize, src += vsize*3, dst += vsize*dcn) { - __m128i v_shuffle[2]; - v_shuffle[0] = _mm_set_epi8(0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x0); - v_shuffle[1] = _mm_set_epi8(0xf, 0xc, 0xc, 0xc, 0x9, 0x9, 0x9, 0x6, 0x6, 0x6, 0x3, 0x3, 0x3, 0x0, 0x0, 0x0); - __m128i v_coeffs[3]; - v_coeffs[0] = _mm_set_epi16((short)C0, 0, 0, (short)C3, (short)C2, (short)C1, (short)C0, 0); - v_coeffs[1] = _mm_set_epi16((short)C2, (short)C1, (short)C0, 0, 0, (short)C3, (short)C2, (short)C1); - v_coeffs[2] = _mm_set_epi16(0, (short)C3, (short)C2, (short)C1, (short)C0, 0, 0, (short)C3); - - if (dcn == 3) + v_uint16 y, cr, cb; + if(yuvOrder) { - if (bidx == 0) - { - __m128i v_shuffle_dst = _mm_set_epi8(0xf, 0xc, 0xd, 0xe, 0x9, 0xa, 0xb, 0x6, 0x7, 0x8, 0x3, 0x4, 0x5, 0x0, 0x1, 0x2); - for ( ; i <= n - 24; i += 24, dst += dcn * 8) - { - __m128i v_src[2]; - v_src[0] = _mm_loadu_si128((__m128i const *)(src + i)); - v_src[1] = _mm_loadl_epi64((__m128i const *)(src + i + 16)); - - process(v_src, v_shuffle, v_coeffs); - - __m128i v_dst[2]; - v_dst[0] = _mm_shuffle_epi8(v_src[0], v_shuffle_dst); - v_dst[1] = _mm_shuffle_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 15), v_shuffle_dst); - - _mm_storeu_si128((__m128i *)(dst), _mm_alignr_epi8(v_dst[1], _mm_slli_si128(v_dst[0], 1), 1)); - _mm_storel_epi64((__m128i *)(dst + 16), _mm_srli_si128(v_dst[1], 1)); - } - } - else - { - for ( ; i <= n - 24; i += 24, dst += dcn * 8) - { - __m128i v_src[2]; - v_src[0] = _mm_loadu_si128((__m128i const *)(src + i)); - v_src[1] = _mm_loadl_epi64((__m128i const *)(src + i + 16)); - - process(v_src, v_shuffle, v_coeffs); - - _mm_storeu_si128((__m128i *)(dst), v_src[0]); - _mm_storel_epi64((__m128i *)(dst + 16), v_src[1]); - } - } + v_load_deinterleave(src, y, cb, cr); } else { - if (bidx == 0) - { - __m128i v_shuffle_dst = _mm_set_epi8(0x0, 0xa, 0xb, 0xc, 0x0, 0x7, 0x8, 0x9, 0x0, 0x4, 0x5, 0x6, 0x0, 0x1, 0x2, 0x3); - - for ( ; i <= n - 24; i += 24, dst += dcn * 8) - { - __m128i v_src[2]; - v_src[0] = _mm_loadu_si128((__m128i const *)(src + i)); - v_src[1] = _mm_loadl_epi64((__m128i const *)(src + i + 16)); - - process(v_src, v_shuffle, v_coeffs); + v_load_deinterleave(src, y, cr, cb); + } - _mm_storeu_si128((__m128i *)(dst), _mm_shuffle_epi8(_mm_alignr_epi8(v_src[0], v_alpha, 15), v_shuffle_dst)); - _mm_storeu_si128((__m128i *)(dst + 16), _mm_shuffle_epi8(_mm_alignr_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 12), v_alpha, 15), v_shuffle_dst)); - } - } - else - { - __m128i v_shuffle_dst = _mm_set_epi8(0x0, 0xc, 0xb, 0xa, 0x0, 0x9, 0x8, 0x7, 0x0, 0x6, 0x5, 0x4, 0x0, 0x3, 0x2, 0x1); + v_uint32 uy0, uy1; + v_expand(y, uy0, uy1); + v_int32 y0 = v_reinterpret_as_s32(uy0); + v_int32 y1 = v_reinterpret_as_s32(uy1); - for ( ; i <= n - 24; i += 24, dst += dcn * 8) - { - __m128i v_src[2]; - v_src[0] = _mm_loadu_si128((__m128i const *)(src + i)); - v_src[1] = _mm_loadl_epi64((__m128i const *)(src + i + 16)); + cr = v_sub_wrap(cr, vdelta); + cb = v_sub_wrap(cb, vdelta); - process(v_src, v_shuffle, v_coeffs); + v_int32 b0, b1, g0, g1, r0, r1; - _mm_storeu_si128((__m128i *)(dst), _mm_shuffle_epi8(_mm_alignr_epi8(v_src[0], v_alpha, 15), v_shuffle_dst)); - _mm_storeu_si128((__m128i *)(dst + 16), _mm_shuffle_epi8(_mm_alignr_epi8(_mm_alignr_epi8(v_src[1], v_src[0], 12), v_alpha, 15), v_shuffle_dst)); - } - } + v_int16 scb = v_reinterpret_as_s16(cb); + v_int16 scr = v_reinterpret_as_s16(cr); + v_mul_expand(scb, vc3, b0, b1); + if(yuvOrder) + { + // if YUV then C3 > 2^15 + // so we fix the multiplication + v_int32 cb0, cb1; + v_expand(scb, cb0, cb1); + b0 += cb0 << 15; + b1 += cb1 << 15; } - } - else -#endif // CV_SSE4_1 - if (haveSIMD && useSSE) - { - for ( ; i <= n - 96; i += 96, dst += dcn * 32) + v_int32 t0, t1; + v_mul_expand(scb, vc2, t0, t1); + v_mul_expand(scr, vc1, g0, g1); + g0 += t0; g1 += t1; + v_mul_expand(scr, vc0, r0, r1); + + // shifted term doesn't fit into 16 bits, addition is to be done in 32 bits + b0 = ((b0 + vdescale) >> shift) + y0; + b1 = ((b1 + vdescale) >> shift) + y1; + g0 = ((g0 + vdescale) >> shift) + y0; + g1 = ((g1 + vdescale) >> shift) + y1; + r0 = ((r0 + vdescale) >> shift) + y0; + r1 = ((r1 + vdescale) >> shift) + y1; + + // saturate and pack + v_uint16 b, g, r; + b = v_pack_u(b0, b1); + g = v_pack_u(g0, g1); + r = v_pack_u(r0, r1); + + if(bidx) + swap(r, b); + + if(dcn == 3) { - __m128i v_y0 = _mm_loadu_si128((__m128i const *)(src + i)); - __m128i v_y1 = _mm_loadu_si128((__m128i const *)(src + i + 16)); - __m128i v_cr0 = _mm_loadu_si128((__m128i const *)(src + i + 32)); - __m128i v_cr1 = _mm_loadu_si128((__m128i const *)(src + i + 48)); - __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64)); - __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80)); - - _mm_deinterleave_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); - - __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero; - process(_mm_unpacklo_epi8(v_y0, v_zero), - _mm_unpacklo_epi8(v_cr0, v_zero), - _mm_unpacklo_epi8(v_cb0, v_zero), - v_r_0, v_g_0, v_b_0); - - __m128i v_r_1 = v_zero, v_g_1 = v_zero, v_b_1 = v_zero; - process(_mm_unpackhi_epi8(v_y0, v_zero), - _mm_unpackhi_epi8(v_cr0, v_zero), - _mm_unpackhi_epi8(v_cb0, v_zero), - v_r_1, v_g_1, v_b_1); - - __m128i v_r0 = _mm_packus_epi16(v_r_0, v_r_1); - __m128i v_g0 = _mm_packus_epi16(v_g_0, v_g_1); - __m128i v_b0 = _mm_packus_epi16(v_b_0, v_b_1); - - process(_mm_unpacklo_epi8(v_y1, v_zero), - _mm_unpacklo_epi8(v_cr1, v_zero), - _mm_unpacklo_epi8(v_cb1, v_zero), - v_r_0, v_g_0, v_b_0); - - process(_mm_unpackhi_epi8(v_y1, v_zero), - _mm_unpackhi_epi8(v_cr1, v_zero), - _mm_unpackhi_epi8(v_cb1, v_zero), - v_r_1, v_g_1, v_b_1); - - __m128i v_r1 = _mm_packus_epi16(v_r_0, v_r_1); - __m128i v_g1 = _mm_packus_epi16(v_g_0, v_g_1); - __m128i v_b1 = _mm_packus_epi16(v_b_0, v_b_1); - - if (bidx == 0) - { - std::swap(v_r0, v_b0); - std::swap(v_r1, v_b1); - } - - __m128i v_a0 = v_alpha, v_a1 = v_alpha; - - if (dcn == 3) - _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); - else - _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, - v_b0, v_b1, v_a0, v_a1); - - _mm_storeu_si128((__m128i *)(dst), v_r0); - _mm_storeu_si128((__m128i *)(dst + 16), v_r1); - _mm_storeu_si128((__m128i *)(dst + 32), v_g0); - _mm_storeu_si128((__m128i *)(dst + 48), v_g1); - _mm_storeu_si128((__m128i *)(dst + 64), v_b0); - _mm_storeu_si128((__m128i *)(dst + 80), v_b1); - - if (dcn == 4) - { - _mm_storeu_si128((__m128i *)(dst + 96), v_a0); - _mm_storeu_si128((__m128i *)(dst + 112), v_a1); - } + v_store_interleave(dst, b, g, r); + } + else + { + v_store_interleave(dst, b, g, r, valpha); } } + vx_cleanup(); +#endif - for ( ; i < n; i += 3, dst += dcn) + for ( ; i < n; i++, src += 3, dst += dcn) { - uchar Y = src[i]; - uchar Cr = src[i+1+yuvOrder]; - uchar Cb = src[i+2-yuvOrder]; + ushort Y = src[0]; + ushort Cr = src[1+yuvOrder]; + ushort Cb = src[2-yuvOrder]; - int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + int b = Y + CV_DESCALE((Cb - delta)*C3, shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, shift); - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); if( dcn == 4 ) dst[3] = alpha; } } int dstcn, blueIdx; - int coeffs[4]; bool isCrCb; - bool useSSE, haveSIMD; - - __m128i v_c0, v_c1, v_c2, v_c3, v_delta2; - __m128i v_delta, v_alpha, v_zero; + int coeffs[4]; }; -#endif // CV_SSE2 - ///////////////////////////////////// YUV420 -> RGB ///////////////////////////////////// @@ -1694,74 +980,59 @@ const int ITUR_BT_601_CBU = 460324; const int ITUR_BT_601_CGV = -385875; const int ITUR_BT_601_CBV = -74448; -template -struct YUV420sp2RGB888Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - int width; - const uchar* my1, *muv; - size_t stride; +//R = 1.164(Y - 16) + 1.596(V - 128) +//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) +//B = 1.164(Y - 16) + 2.018(U - 128) - YUV420sp2RGB888Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _uv) - : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), muv(_uv), stride(_stride) {} +//R = (1220542(Y - 16) + 1673527(V - 128) + (1 << 19)) >> 20 +//G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20 +//B = (1220542(Y - 16) + 2116026(U - 128) + (1 << 19)) >> 20 - void operator()(const Range& range) const CV_OVERRIDE +template +static inline void cvtYuv42xxp2RGB8(int u, int v, int vy01, int vy11, int vy02, int vy12, + uchar* row1, uchar* row2) +{ + u = u - 128; + v = v - 128; + + int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v; + int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u; + int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u; + + int y00 = std::max(0, vy01 - 16) * ITUR_BT_601_CY; + row1[2-bIdx] = saturate_cast((y00 + ruv) >> ITUR_BT_601_SHIFT); + row1[1] = saturate_cast((y00 + guv) >> ITUR_BT_601_SHIFT); + row1[bIdx] = saturate_cast((y00 + buv) >> ITUR_BT_601_SHIFT); + if(dcn == 4) + row1[3] = uchar(0xff); + + int y01 = std::max(0, vy11 - 16) * ITUR_BT_601_CY; + row1[dcn+2-bIdx] = saturate_cast((y01 + ruv) >> ITUR_BT_601_SHIFT); + row1[dcn+1] = saturate_cast((y01 + guv) >> ITUR_BT_601_SHIFT); + row1[dcn+0+bIdx] = saturate_cast((y01 + buv) >> ITUR_BT_601_SHIFT); + if(dcn == 4) + row1[7] = uchar(0xff); + + if(is420) { - int rangeBegin = range.start * 2; - int rangeEnd = range.end * 2; - - //R = 1.164(Y - 16) + 1.596(V - 128) - //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) - //B = 1.164(Y - 16) + 2.018(U - 128) - - //R = (1220542(Y - 16) + 1673527(V - 128) + (1 << 19)) >> 20 - //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20 - //B = (1220542(Y - 16) + 2116026(U - 128) + (1 << 19)) >> 20 - - const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2; - - for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride) - { - uchar* row1 = dst_data + dst_step * j; - uchar* row2 = dst_data + dst_step * (j + 1); - const uchar* y2 = y1 + stride; - - for (int i = 0; i < width; i += 2, row1 += 6, row2 += 6) - { - int u = int(uv[i + 0 + uIdx]) - 128; - int v = int(uv[i + 1 - uIdx]) - 128; - - int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v; - int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u; - int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u; - - int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY; - row1[2-bIdx] = saturate_cast((y00 + ruv) >> ITUR_BT_601_SHIFT); - row1[1] = saturate_cast((y00 + guv) >> ITUR_BT_601_SHIFT); - row1[bIdx] = saturate_cast((y00 + buv) >> ITUR_BT_601_SHIFT); - - int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY; - row1[5-bIdx] = saturate_cast((y01 + ruv) >> ITUR_BT_601_SHIFT); - row1[4] = saturate_cast((y01 + guv) >> ITUR_BT_601_SHIFT); - row1[3+bIdx] = saturate_cast((y01 + buv) >> ITUR_BT_601_SHIFT); - - int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY; - row2[2-bIdx] = saturate_cast((y10 + ruv) >> ITUR_BT_601_SHIFT); - row2[1] = saturate_cast((y10 + guv) >> ITUR_BT_601_SHIFT); - row2[bIdx] = saturate_cast((y10 + buv) >> ITUR_BT_601_SHIFT); - - int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY; - row2[5-bIdx] = saturate_cast((y11 + ruv) >> ITUR_BT_601_SHIFT); - row2[4] = saturate_cast((y11 + guv) >> ITUR_BT_601_SHIFT); - row2[3+bIdx] = saturate_cast((y11 + buv) >> ITUR_BT_601_SHIFT); - } - } + int y10 = std::max(0, vy02 - 16) * ITUR_BT_601_CY; + row2[2-bIdx] = saturate_cast((y10 + ruv) >> ITUR_BT_601_SHIFT); + row2[1] = saturate_cast((y10 + guv) >> ITUR_BT_601_SHIFT); + row2[bIdx] = saturate_cast((y10 + buv) >> ITUR_BT_601_SHIFT); + if(dcn == 4) + row2[3] = uchar(0xff); + + int y11 = std::max(0, vy12 - 16) * ITUR_BT_601_CY; + row2[dcn+2-bIdx] = saturate_cast((y11 + ruv) >> ITUR_BT_601_SHIFT); + row2[dcn+1] = saturate_cast((y11 + guv) >> ITUR_BT_601_SHIFT); + row2[dcn+0+bIdx] = saturate_cast((y11 + buv) >> ITUR_BT_601_SHIFT); + if(dcn == 4) + row2[7] = uchar(0xff); } -}; +} -template -struct YUV420sp2RGBA8888Invoker : ParallelLoopBody +template +struct YUV420sp2RGB8Invoker : ParallelLoopBody { uchar * dst_data; size_t dst_step; @@ -1769,21 +1040,13 @@ struct YUV420sp2RGBA8888Invoker : ParallelLoopBody const uchar* my1, *muv; size_t stride; - YUV420sp2RGBA8888Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _uv) + YUV420sp2RGB8Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _uv) : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), muv(_uv), stride(_stride) {} void operator()(const Range& range) const CV_OVERRIDE { - int rangeBegin = range.start * 2; - int rangeEnd = range.end * 2; - - //R = 1.164(Y - 16) + 1.596(V - 128) - //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) - //B = 1.164(Y - 16) + 2.018(U - 128) - - //R = (1220542(Y - 16) + 1673527(V - 128) + (1 << 19)) >> 20 - //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20 - //B = (1220542(Y - 16) + 2116026(U - 128) + (1 << 19)) >> 20 + const int rangeBegin = range.start * 2; + const int rangeEnd = range.end * 2; const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2; @@ -1793,45 +1056,24 @@ struct YUV420sp2RGBA8888Invoker : ParallelLoopBody uchar* row2 = dst_data + dst_step * (j + 1); const uchar* y2 = y1 + stride; - for (int i = 0; i < width; i += 2, row1 += 8, row2 += 8) + for (int i = 0; i < width; i += 2, row1 += dcn*2, row2 += dcn*2) { - int u = int(uv[i + 0 + uIdx]) - 128; - int v = int(uv[i + 1 - uIdx]) - 128; - - int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v; - int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u; - int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u; - - int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY; - row1[2-bIdx] = saturate_cast((y00 + ruv) >> ITUR_BT_601_SHIFT); - row1[1] = saturate_cast((y00 + guv) >> ITUR_BT_601_SHIFT); - row1[bIdx] = saturate_cast((y00 + buv) >> ITUR_BT_601_SHIFT); - row1[3] = uchar(0xff); - - int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY; - row1[6-bIdx] = saturate_cast((y01 + ruv) >> ITUR_BT_601_SHIFT); - row1[5] = saturate_cast((y01 + guv) >> ITUR_BT_601_SHIFT); - row1[4+bIdx] = saturate_cast((y01 + buv) >> ITUR_BT_601_SHIFT); - row1[7] = uchar(0xff); - - int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY; - row2[2-bIdx] = saturate_cast((y10 + ruv) >> ITUR_BT_601_SHIFT); - row2[1] = saturate_cast((y10 + guv) >> ITUR_BT_601_SHIFT); - row2[bIdx] = saturate_cast((y10 + buv) >> ITUR_BT_601_SHIFT); - row2[3] = uchar(0xff); - - int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY; - row2[6-bIdx] = saturate_cast((y11 + ruv) >> ITUR_BT_601_SHIFT); - row2[5] = saturate_cast((y11 + guv) >> ITUR_BT_601_SHIFT); - row2[4+bIdx] = saturate_cast((y11 + buv) >> ITUR_BT_601_SHIFT); - row2[7] = uchar(0xff); + int u = int(uv[i + 0 + uIdx]); + int v = int(uv[i + 1 - uIdx]); + + int vy01 = int(y1[i]); + int vy11 = int(y1[i + 1]); + int vy02 = int(y2[i]); + int vy12 = int(y2[i + 1]); + + cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); } } } }; -template -struct YUV420p2RGB888Invoker : ParallelLoopBody +template +struct YUV420p2RGB8Invoker : ParallelLoopBody { uchar * dst_data; size_t dst_step; @@ -1840,7 +1082,7 @@ struct YUV420p2RGB888Invoker : ParallelLoopBody size_t stride; int ustepIdx, vstepIdx; - YUV420p2RGB888Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx) + YUV420p2RGB8Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx) : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), mu(_u), mv(_v), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {} void operator()(const Range& range) const CV_OVERRIDE @@ -1867,149 +1109,39 @@ struct YUV420p2RGB888Invoker : ParallelLoopBody uchar* row2 = dst_data + dst_step * (j + 1); const uchar* y2 = y1 + stride; - for (int i = 0; i < width / 2; i += 1, row1 += 6, row2 += 6) + for (int i = 0; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2) { - int u = int(u1[i]) - 128; - int v = int(v1[i]) - 128; - - int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v; - int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u; - int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u; - - int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY; - row1[2-bIdx] = saturate_cast((y00 + ruv) >> ITUR_BT_601_SHIFT); - row1[1] = saturate_cast((y00 + guv) >> ITUR_BT_601_SHIFT); - row1[bIdx] = saturate_cast((y00 + buv) >> ITUR_BT_601_SHIFT); - - int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY; - row1[5-bIdx] = saturate_cast((y01 + ruv) >> ITUR_BT_601_SHIFT); - row1[4] = saturate_cast((y01 + guv) >> ITUR_BT_601_SHIFT); - row1[3+bIdx] = saturate_cast((y01 + buv) >> ITUR_BT_601_SHIFT); - - int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY; - row2[2-bIdx] = saturate_cast((y10 + ruv) >> ITUR_BT_601_SHIFT); - row2[1] = saturate_cast((y10 + guv) >> ITUR_BT_601_SHIFT); - row2[bIdx] = saturate_cast((y10 + buv) >> ITUR_BT_601_SHIFT); - - int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY; - row2[5-bIdx] = saturate_cast((y11 + ruv) >> ITUR_BT_601_SHIFT); - row2[4] = saturate_cast((y11 + guv) >> ITUR_BT_601_SHIFT); - row2[3+bIdx] = saturate_cast((y11 + buv) >> ITUR_BT_601_SHIFT); - } - } - } -}; - -template -struct YUV420p2RGBA8888Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - int width; - const uchar* my1, *mu, *mv; - size_t stride; - int ustepIdx, vstepIdx; - - YUV420p2RGBA8888Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx) - : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), mu(_u), mv(_v), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {} - - void operator()(const Range& range) const CV_OVERRIDE - { - int rangeBegin = range.start * 2; - int rangeEnd = range.end * 2; - - int uvsteps[2] = {width/2, static_cast(stride) - width/2}; - int usIdx = ustepIdx, vsIdx = vstepIdx; - - const uchar* y1 = my1 + rangeBegin * stride; - const uchar* u1 = mu + (range.start / 2) * stride; - const uchar* v1 = mv + (range.start / 2) * stride; + int u = int(u1[i]); + int v = int(v1[i]); - if(range.start % 2 == 1) - { - u1 += uvsteps[(usIdx++) & 1]; - v1 += uvsteps[(vsIdx++) & 1]; - } - - for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1]) - { - uchar* row1 = dst_data + dst_step * j; - uchar* row2 = dst_data + dst_step * (j + 1); - const uchar* y2 = y1 + stride; + int vy01 = int(y1[2 * i]); + int vy11 = int(y1[2 * i + 1]); + int vy02 = int(y2[2 * i]); + int vy12 = int(y2[2 * i + 1]); - for (int i = 0; i < width / 2; i += 1, row1 += 8, row2 += 8) - { - int u = int(u1[i]) - 128; - int v = int(v1[i]) - 128; - - int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v; - int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u; - int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u; - - int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY; - row1[2-bIdx] = saturate_cast((y00 + ruv) >> ITUR_BT_601_SHIFT); - row1[1] = saturate_cast((y00 + guv) >> ITUR_BT_601_SHIFT); - row1[bIdx] = saturate_cast((y00 + buv) >> ITUR_BT_601_SHIFT); - row1[3] = uchar(0xff); - - int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY; - row1[6-bIdx] = saturate_cast((y01 + ruv) >> ITUR_BT_601_SHIFT); - row1[5] = saturate_cast((y01 + guv) >> ITUR_BT_601_SHIFT); - row1[4+bIdx] = saturate_cast((y01 + buv) >> ITUR_BT_601_SHIFT); - row1[7] = uchar(0xff); - - int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY; - row2[2-bIdx] = saturate_cast((y10 + ruv) >> ITUR_BT_601_SHIFT); - row2[1] = saturate_cast((y10 + guv) >> ITUR_BT_601_SHIFT); - row2[bIdx] = saturate_cast((y10 + buv) >> ITUR_BT_601_SHIFT); - row2[3] = uchar(0xff); - - int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY; - row2[6-bIdx] = saturate_cast((y11 + ruv) >> ITUR_BT_601_SHIFT); - row2[5] = saturate_cast((y11 + guv) >> ITUR_BT_601_SHIFT); - row2[4+bIdx] = saturate_cast((y11 + buv) >> ITUR_BT_601_SHIFT); - row2[7] = uchar(0xff); + cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); } } } }; + #define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240) -template +template inline void cvtYUV420sp2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _uv) { - YUV420sp2RGB888Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _uv); - if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) - parallel_for_(Range(0, dst_height/2), converter); - else - converter(Range(0, dst_height/2)); -} - -template -inline void cvtYUV420sp2RGBA(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _uv) -{ - YUV420sp2RGBA8888Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _uv); + YUV420sp2RGB8Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _uv); if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) parallel_for_(Range(0, dst_height/2), converter); else converter(Range(0, dst_height/2)); } -template +template inline void cvtYUV420p2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx) { - YUV420p2RGB888Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _u, _v, ustepIdx, vstepIdx); - if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) - parallel_for_(Range(0, dst_height/2), converter); - else - converter(Range(0, dst_height/2)); -} - -template -inline void cvtYUV420p2RGBA(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx) -{ - YUV420p2RGBA8888Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _u, _v, ustepIdx, vstepIdx); + YUV420p2RGB8Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _u, _v, ustepIdx, vstepIdx); if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) parallel_for_(Range(0, dst_height/2), converter); else @@ -2018,9 +1150,9 @@ inline void cvtYUV420p2RGBA(uchar * dst_data, size_t dst_step, int dst_width, in ///////////////////////////////////// RGB -> YUV420p ///////////////////////////////////// -struct RGB888toYUV420pInvoker: public ParallelLoopBody +struct RGB8toYUV420pInvoker: public ParallelLoopBody { - RGB888toYUV420pInvoker(const uchar * _src_data, size_t _src_step, + RGB8toYUV420pInvoker(const uchar * _src_data, size_t _src_step, uchar * _y_data, uchar * _uv_data, size_t _dst_step, int _src_width, int _src_height, int _scn, bool swapBlue_, bool swapUV_, bool interleaved_) : src_data(_src_data), src_step(_src_step), @@ -2103,17 +1235,6 @@ struct RGB888toYUV420pInvoker: public ParallelLoopBody } } - void convert() const - { - if( src_width * src_height >= 320*240 ) - parallel_for_(Range(0, src_height/2), *this); - else - operator()(Range(0, src_height/2)); - } - -private: - RGB888toYUV420pInvoker& operator=(const RGB888toYUV420pInvoker&); - const uchar * src_data; size_t src_step; uchar *y_data, *uv_data; @@ -2129,8 +1250,8 @@ private: ///////////////////////////////////// YUV422 -> RGB ///////////////////////////////////// -template -struct YUV422toRGB888Invoker : ParallelLoopBody +template +struct YUV422toRGB8Invoker : ParallelLoopBody { uchar * dst_data; size_t dst_step; @@ -2138,9 +1259,9 @@ struct YUV422toRGB888Invoker : ParallelLoopBody size_t src_step; int width; - YUV422toRGB888Invoker(uchar * _dst_data, size_t _dst_step, - const uchar * _src_data, size_t _src_step, - int _width) + YUV422toRGB8Invoker(uchar * _dst_data, size_t _dst_step, + const uchar * _src_data, size_t _src_step, + int _width) : dst_data(_dst_data), dst_step(_dst_step), src_data(_src_data), src_step(_src_step), width(_width) {} void operator()(const Range& range) const CV_OVERRIDE @@ -2156,76 +1277,15 @@ struct YUV422toRGB888Invoker : ParallelLoopBody { uchar* row = dst_data + dst_step * j; - for (int i = 0; i < 2 * width; i += 4, row += 6) + for (int i = 0; i < 2 * width; i += 4, row += dcn*2) { - int u = int(yuv_src[i + uidx]) - 128; - int v = int(yuv_src[i + vidx]) - 128; - - int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v; - int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u; - int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u; - - int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY; - row[2-bIdx] = saturate_cast((y00 + ruv) >> ITUR_BT_601_SHIFT); - row[1] = saturate_cast((y00 + guv) >> ITUR_BT_601_SHIFT); - row[bIdx] = saturate_cast((y00 + buv) >> ITUR_BT_601_SHIFT); - - int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY; - row[5-bIdx] = saturate_cast((y01 + ruv) >> ITUR_BT_601_SHIFT); - row[4] = saturate_cast((y01 + guv) >> ITUR_BT_601_SHIFT); - row[3+bIdx] = saturate_cast((y01 + buv) >> ITUR_BT_601_SHIFT); - } - } - } -}; - -template -struct YUV422toRGBA8888Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - const uchar * src_data; - size_t src_step; - int width; - - YUV422toRGBA8888Invoker(uchar * _dst_data, size_t _dst_step, - const uchar * _src_data, size_t _src_step, - int _width) - : dst_data(_dst_data), dst_step(_dst_step), src_data(_src_data), src_step(_src_step), width(_width) {} - - void operator()(const Range& range) const CV_OVERRIDE - { - int rangeBegin = range.start; - int rangeEnd = range.end; - - const int uidx = 1 - yIdx + uIdx * 2; - const int vidx = (2 + uidx) % 4; - const uchar* yuv_src = src_data + rangeBegin * src_step; + int u = int(yuv_src[i + uidx]); + int v = int(yuv_src[i + vidx]); - for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += src_step) - { - uchar* row = dst_data + dst_step * j; + int vy0 = int(yuv_src[i + yIdx]); + int vy1 = int(yuv_src[i + yIdx + 2]); - for (int i = 0; i < 2 * width; i += 4, row += 8) - { - int u = int(yuv_src[i + uidx]) - 128; - int v = int(yuv_src[i + vidx]) - 128; - - int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v; - int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u; - int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u; - - int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY; - row[2-bIdx] = saturate_cast((y00 + ruv) >> ITUR_BT_601_SHIFT); - row[1] = saturate_cast((y00 + guv) >> ITUR_BT_601_SHIFT); - row[bIdx] = saturate_cast((y00 + buv) >> ITUR_BT_601_SHIFT); - row[3] = uchar(0xff); - - int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY; - row[6-bIdx] = saturate_cast((y01 + ruv) >> ITUR_BT_601_SHIFT); - row[5] = saturate_cast((y01 + guv) >> ITUR_BT_601_SHIFT); - row[4+bIdx] = saturate_cast((y01 + buv) >> ITUR_BT_601_SHIFT); - row[7] = uchar(0xff); + cvtYuv42xxp2RGB8(u, v, vy0, vy1, 0, 0, row, (uchar*)(0)); } } } @@ -2233,22 +1293,11 @@ struct YUV422toRGBA8888Invoker : ParallelLoopBody #define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240) -template +template inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_data, size_t src_step, int width, int height) { - YUV422toRGB888Invoker converter(dst_data, dst_step, src_data, src_step, width); - if (width * height >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION) - parallel_for_(Range(0, height), converter); - else - converter(Range(0, height)); -} - -template -inline void cvtYUV422toRGBA(uchar * dst_data, size_t dst_step, const uchar * src_data, size_t src_step, - int width, int height) -{ - YUV422toRGBA8888Invoker converter(dst_data, dst_step, src_data, src_step, width); + YUV422toRGB8Invoker converter(dst_data, dst_step, src_data, src_step, width); if (width * height >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION) parallel_for_(Range(0, height), converter); else @@ -2382,6 +1431,14 @@ void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, cvtTwoPlaneYUVtoBGR(src_data, uv, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); } +typedef void (*cvt_2plane_yuv_ptr_t)(uchar * /* dst_data*/, + size_t /* dst_step */, + int /* dst_width */, + int /* dst_height */, + size_t /* _stride */, + const uchar* /* _y1 */, + const uchar* /* _uv */); + void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, @@ -2390,21 +1447,37 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src CV_INSTRUMENT_REGION(); // TODO: add hal replacement method + int blueIdx = swapBlue ? 2 : 0; + + cvt_2plane_yuv_ptr_t cvtPtr; switch(dcn*100 + blueIdx * 10 + uIdx) { - case 300: cvtYUV420sp2RGB<0, 0> (dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); break; - case 301: cvtYUV420sp2RGB<0, 1> (dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); break; - case 320: cvtYUV420sp2RGB<2, 0> (dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); break; - case 321: cvtYUV420sp2RGB<2, 1> (dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); break; - case 400: cvtYUV420sp2RGBA<0, 0>(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); break; - case 401: cvtYUV420sp2RGBA<0, 1>(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); break; - case 420: cvtYUV420sp2RGBA<2, 0>(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); break; - case 421: cvtYUV420sp2RGBA<2, 1>(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); break; + case 300: cvtPtr = cvtYUV420sp2RGB<0, 0, 3>; break; + case 301: cvtPtr = cvtYUV420sp2RGB<0, 1, 3>; break; + case 320: cvtPtr = cvtYUV420sp2RGB<2, 0, 3>; break; + case 321: cvtPtr = cvtYUV420sp2RGB<2, 1, 3>; break; + case 400: cvtPtr = cvtYUV420sp2RGB<0, 0, 4>; break; + case 401: cvtPtr = cvtYUV420sp2RGB<0, 1, 4>; break; + case 420: cvtPtr = cvtYUV420sp2RGB<2, 0, 4>; break; + case 421: cvtPtr = cvtYUV420sp2RGB<2, 1, 4>; break; default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; }; + + cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); } +typedef void (*cvt_3plane_yuv_ptr_t)(uchar * /* dst_data */, + size_t /* dst_step */, + int /* dst_width */, + int /* dst_height */, + size_t /* _stride */, + const uchar* /* _y1 */, + const uchar* /* _u */, + const uchar* /* _v */, + int /* ustepIdx */, + int /* vstepIdx */); + void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, @@ -2422,14 +1495,17 @@ void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); } int blueIdx = swapBlue ? 2 : 0; + cvt_3plane_yuv_ptr_t cvtPtr; switch(dcn*10 + blueIdx) { - case 30: cvtYUV420p2RGB<0>(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx); break; - case 32: cvtYUV420p2RGB<2>(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx); break; - case 40: cvtYUV420p2RGBA<0>(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx); break; - case 42: cvtYUV420p2RGBA<2>(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx); break; + case 30: cvtPtr = cvtYUV420p2RGB<0, 3>; break; + case 32: cvtPtr = cvtYUV420p2RGB<2, 3>; break; + case 40: cvtPtr = cvtYUV420p2RGB<0, 4>; break; + case 42: cvtPtr = cvtYUV420p2RGB<2, 4>; break; default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; }; + + cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx); } void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, @@ -2441,7 +1517,14 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx); uchar * uv_data = dst_data + dst_step * height; - RGB888toYUV420pInvoker(src_data, src_step, dst_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx == 2, false).convert(); + + RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height, + scn, swapBlue, uIdx == 2, false); + + if( width * height >= 320*240 ) + parallel_for_(Range(0, height/2), cvt); + else + cvt(Range(0, height/2)); } void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, @@ -2452,9 +1535,23 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); // TODO: add hal replacement method - RGB888toYUV420pInvoker(src_data, src_step, y_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx == 2, true).convert(); + + RGB8toYUV420pInvoker cvt(src_data, src_step, y_data, uv_data, dst_step, width, height, + scn, swapBlue, uIdx == 2, true); + + if( width * height >= 320*240 ) + parallel_for_(Range(0, height/2), cvt); + else + cvt(Range(0, height/2)); } +typedef void (*cvt_1plane_yuv_ptr_t)(uchar * /* dst_data */, + size_t /* dst_step */, + const uchar * /* src_data */, + size_t /* src_step */, + int /* width */, + int /* height */); + void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, @@ -2463,23 +1560,27 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn); + + cvt_1plane_yuv_ptr_t cvtPtr; int blueIdx = swapBlue ? 2 : 0; switch(dcn*1000 + blueIdx*100 + uIdx*10 + ycn) { - case 3000: cvtYUV422toRGB<0,0,0>(dst_data, dst_step, src_data, src_step, width, height); break; - case 3001: cvtYUV422toRGB<0,0,1>(dst_data, dst_step, src_data, src_step, width, height); break; - case 3010: cvtYUV422toRGB<0,1,0>(dst_data, dst_step, src_data, src_step, width, height); break; - case 3200: cvtYUV422toRGB<2,0,0>(dst_data, dst_step, src_data, src_step, width, height); break; - case 3201: cvtYUV422toRGB<2,0,1>(dst_data, dst_step, src_data, src_step, width, height); break; - case 3210: cvtYUV422toRGB<2,1,0>(dst_data, dst_step, src_data, src_step, width, height); break; - case 4000: cvtYUV422toRGBA<0,0,0>(dst_data, dst_step, src_data, src_step, width, height); break; - case 4001: cvtYUV422toRGBA<0,0,1>(dst_data, dst_step, src_data, src_step, width, height); break; - case 4010: cvtYUV422toRGBA<0,1,0>(dst_data, dst_step, src_data, src_step, width, height); break; - case 4200: cvtYUV422toRGBA<2,0,0>(dst_data, dst_step, src_data, src_step, width, height); break; - case 4201: cvtYUV422toRGBA<2,0,1>(dst_data, dst_step, src_data, src_step, width, height); break; - case 4210: cvtYUV422toRGBA<2,1,0>(dst_data, dst_step, src_data, src_step, width, height); break; + case 3000: cvtPtr = cvtYUV422toRGB<0,0,0,3>; break; + case 3001: cvtPtr = cvtYUV422toRGB<0,0,1,3>; break; + case 3010: cvtPtr = cvtYUV422toRGB<0,1,0,3>; break; + case 3200: cvtPtr = cvtYUV422toRGB<2,0,0,3>; break; + case 3201: cvtPtr = cvtYUV422toRGB<2,0,1,3>; break; + case 3210: cvtPtr = cvtYUV422toRGB<2,1,0,3>; break; + case 4000: cvtPtr = cvtYUV422toRGB<0,0,0,4>; break; + case 4001: cvtPtr = cvtYUV422toRGB<0,0,1,4>; break; + case 4010: cvtPtr = cvtYUV422toRGB<0,1,0,4>; break; + case 4200: cvtPtr = cvtYUV422toRGB<2,0,0,4>; break; + case 4201: cvtPtr = cvtYUV422toRGB<2,0,1,4>; break; + case 4210: cvtPtr = cvtYUV422toRGB<2,1,0,4>; break; default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; }; + + cvtPtr(dst_data, dst_step, src_data, src_step, width, height); } } // namespace hal -- 2.7.4