float coeffs[3];
};
-
template<> struct RGB2Gray<uchar>
{
typedef uchar channel_type;
int tab[256*3];
};
+#if CV_NEON
+
+// NEON specialization: 3-/4-channel 16-bit RGB(A) -> gray.
+// Processes 8 pixels, then 4 pixels per iteration, then a scalar tail,
+// using the same fixed-point descale (round then >> yuv_shift) in all paths.
+template <>
+struct RGB2Gray<ushort>
+{
+ typedef ushort channel_type;
+
+ RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
+ srccn(_srccn)
+ {
+ static const int coeffs0[] = { R2Y, G2Y, B2Y };
+ memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
+ if( blueIdx == 0 )
+ std::swap(coeffs[0], coeffs[2]);
+
+ // NOTE(review): broadcast into u16 lanes assumes each coefficient fits
+ // in 16 bits (vdup_n_u16 would truncate otherwise) — TODO confirm for
+ // caller-supplied _coeffs.
+ v_cb = vdup_n_u16(coeffs[0]);
+ v_cg = vdup_n_u16(coeffs[1]);
+ v_cr = vdup_n_u16(coeffs[2]);
+ v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); // rounding bias for the shift
+ }
+
+ void operator()(const ushort* src, ushort* dst, int n) const
+ {
+ int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
+
+ // main loop: 8 pixels per iteration (two 4-lane u32 accumulators)
+ for ( ; i <= n - 8; i += 8, src += scn * 8)
+ {
+ uint16x8_t v_b, v_r, v_g;
+ if (scn == 3)
+ {
+ uint16x8x3_t v_src = vld3q_u16(src);
+ v_b = v_src.val[0];
+ v_g = v_src.val[1];
+ v_r = v_src.val[2];
+ }
+ else
+ {
+ // 4-channel input: the 4th (alpha) plane is simply ignored
+ uint16x8x4_t v_src = vld4q_u16(src);
+ v_b = v_src.val[0];
+ v_g = v_src.val[1];
+ v_r = v_src.val[2];
+ }
+
+ // widening multiply-accumulate: u16 * u16 -> u32, no overflow of lanes
+ uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
+ vmull_u16(vget_low_u16(v_b), v_cb),
+ vget_low_u16(v_g), v_cg),
+ vget_low_u16(v_r), v_cr);
+ uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
+ vmull_u16(vget_high_u16(v_b), v_cb),
+ vget_high_u16(v_g), v_cg),
+ vget_high_u16(v_r), v_cr);
+
+ uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
+ uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));
+
+ vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
+ }
+
+ // secondary loop: 4 pixels per iteration
+ for ( ; i <= n - 4; i += 4, src += scn * 4)
+ {
+ uint16x4_t v_b, v_r, v_g;
+ if (scn == 3)
+ {
+ uint16x4x3_t v_src = vld3_u16(src);
+ v_b = v_src.val[0];
+ v_g = v_src.val[1];
+ v_r = v_src.val[2];
+ }
+ else
+ {
+ uint16x4x4_t v_src = vld4_u16(src);
+ v_b = v_src.val[0];
+ v_g = v_src.val[1];
+ v_r = v_src.val[2];
+ }
+
+ uint32x4_t v_dst = vmlal_u16(vmlal_u16(
+ vmull_u16(v_b, v_cb),
+ v_g, v_cg),
+ v_r, v_cr);
+
+ vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
+ }
+
+ // scalar tail — same rounding as the vector paths via CV_DESCALE
+ for( ; i < n; i++, src += scn)
+ dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
+ }
+
+ int srccn, coeffs[3];
+ uint16x4_t v_cb, v_cg, v_cr;
+ uint32x4_t v_delta;
+};
+
+// NEON specialization: 3-/4-channel float RGB(A) -> gray.
+// Vector loops handle 8 then 4 pixels; the scalar tail finishes the row.
+template <>
+struct RGB2Gray<float>
+{
+ typedef float channel_type;
+
+ RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+ {
+ static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
+ memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
+ if(blueIdx == 0)
+ std::swap(coeffs[0], coeffs[2]);
+
+ v_cb = vdupq_n_f32(coeffs[0]);
+ v_cg = vdupq_n_f32(coeffs[1]);
+ v_cr = vdupq_n_f32(coeffs[2]);
+ }
+
+ void operator()(const float * src, float * dst, int n) const
+ {
+ int scn = srccn, i = 0;
+ float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
+
+ if (scn == 3)
+ {
+ // 8-pixel iterations unrolled as two vld3q/vst1q pairs
+ for ( ; i <= n - 8; i += 8, src += scn * 8)
+ {
+ float32x4x3_t v_src = vld3q_f32(src);
+ vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+
+ v_src = vld3q_f32(src + scn * 4);
+ vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+ }
+
+ for ( ; i <= n - 4; i += 4, src += scn * 4)
+ {
+ float32x4x3_t v_src = vld3q_f32(src);
+ vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+ }
+ }
+ else
+ {
+ // 4-channel path: vld4q deinterleaves, alpha plane (val[3]) is unused
+ for ( ; i <= n - 8; i += 8, src += scn * 8)
+ {
+ float32x4x4_t v_src = vld4q_f32(src);
+ vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+
+ v_src = vld4q_f32(src + scn * 4);
+ vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+ }
+
+ for ( ; i <= n - 4; i += 4, src += scn * 4)
+ {
+ float32x4x4_t v_src = vld4q_f32(src);
+ vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+ }
+ }
+
+ // scalar tail
+ for ( ; i < n; i++, src += scn)
+ dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
+ }
+
+ int srccn;
+ float coeffs[3];
+ float32x4_t v_cb, v_cg, v_cr;
+};
+
+#else
template<> struct RGB2Gray<ushort>
{
int coeffs[3];
};
+#endif
///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
float coeffs[5];
};
+#if CV_NEON
+
+// NEON specialization: float RGB(A) -> YCrCb.
+// After `n *= 3`, i indexes interleaved output floats (3 per pixel);
+// bidx/bidx^2 select the blue/red planes for either channel order.
+template <>
+struct RGB2YCrCb_f<float>
+{
+ typedef float channel_type;
+
+ RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
+ srccn(_srccn), blueIdx(_blueIdx)
+ {
+ static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
+ memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+ if(blueIdx==0)
+ std::swap(coeffs[0], coeffs[2]);
+
+ v_c0 = vdupq_n_f32(coeffs[0]);
+ v_c1 = vdupq_n_f32(coeffs[1]);
+ v_c2 = vdupq_n_f32(coeffs[2]);
+ v_c3 = vdupq_n_f32(coeffs[3]);
+ v_c4 = vdupq_n_f32(coeffs[4]);
+ v_delta = vdupq_n_f32(ColorChannel<float>::half());
+ }
+
+ void operator()(const float * src, float * dst, int n) const
+ {
+ int scn = srccn, bidx = blueIdx, i = 0;
+ const float delta = ColorChannel<float>::half();
+ float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+ n *= 3;
+
+ if (scn == 3)
+ for ( ; i <= n - 12; i += 12, src += 12)
+ {
+ float32x4x3_t v_src = vld3q_f32(src), v_dst;
+ // Y first, then Cr/Cb as scaled differences against Y plus the half offset
+ v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
+ v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
+ v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
+
+ vst3q_f32(dst + i, v_dst);
+ }
+ else
+ for ( ; i <= n - 12; i += 12, src += 16)
+ {
+ // 4-channel source: step 16 floats per 4 pixels, alpha ignored
+ float32x4x4_t v_src = vld4q_f32(src);
+ float32x4x3_t v_dst;
+ v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
+ v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
+ v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
+
+ vst3q_f32(dst + i, v_dst);
+ }
+
+ // scalar tail, identical math to the vector body
+ for ( ; i < n; i += 3, src += scn)
+ {
+ float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
+ float Cr = (src[bidx^2] - Y)*C3 + delta;
+ float Cb = (src[bidx] - Y)*C4 + delta;
+ dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
+ }
+ }
+ int srccn, blueIdx;
+ float coeffs[5];
+ float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
+};
+
+#endif
template<typename _Tp> struct RGB2YCrCb_i
{
int coeffs[5];
};
+#if CV_NEON
+
+// NEON specialization: 8-bit RGB(A) -> YCrCb, fixed-point.
+// v_delta is the chroma offset pre-scaled by 1<<yuv_shift; v_delta2 is the
+// rounding bias for CV_DESCALE-equivalent shifting.
+template <>
+struct RGB2YCrCb_i<uchar>
+{
+ typedef uchar channel_type;
+
+ RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
+ : srccn(_srccn), blueIdx(_blueIdx)
+ {
+ static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
+ memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+ if (blueIdx==0)
+ std::swap(coeffs[0], coeffs[2]);
+
+ v_c0 = vdup_n_s16(coeffs[0]);
+ v_c1 = vdup_n_s16(coeffs[1]);
+ v_c2 = vdup_n_s16(coeffs[2]);
+ v_c3 = vdupq_n_s32(coeffs[3]);
+ v_c4 = vdupq_n_s32(coeffs[4]);
+ v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift));
+ v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
+ }
+
+ void operator()(const uchar * src, uchar * dst, int n) const
+ {
+ int scn = srccn, bidx = blueIdx, i = 0;
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+ int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
+ n *= 3;
+
+ // 8 pixels (24 interleaved outputs) per iteration
+ for ( ; i <= n - 24; i += 24, src += scn * 8)
+ {
+ uint8x8x3_t v_dst;
+ int16x8x3_t v_src16;
+
+ if (scn == 3)
+ {
+ uint8x8x3_t v_src = vld3_u8(src);
+ // widen u8 -> s16; values stay <= 255, so the sign reinterpret is safe
+ v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
+ v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
+ v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+ }
+ else
+ {
+ uint8x8x4_t v_src = vld4_u8(src);
+ v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
+ v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
+ v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+ }
+
+ int16x4x3_t v_src0;
+ v_src0.val[0] = vget_low_s16(v_src16.val[0]);
+ v_src0.val[1] = vget_low_s16(v_src16.val[1]);
+ v_src0.val[2] = vget_low_s16(v_src16.val[2]);
+
+ // low half: Y, then Cr/Cb from (channel - Y) with pre-scaled offset
+ int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+ v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
+ int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3);
+ v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
+ int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4);
+ v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
+
+ v_src0.val[0] = vget_high_s16(v_src16.val[0]);
+ v_src0.val[1] = vget_high_s16(v_src16.val[1]);
+ v_src0.val[2] = vget_high_s16(v_src16.val[2]);
+
+ // high half: same pipeline on the upper 4 pixels
+ int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+ v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
+ int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3);
+ v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
+ int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4);
+ v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
+
+ // saturating narrow back to u8 mirrors saturate_cast<uchar> in the tail
+ v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
+ v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
+ v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
+
+ vst3_u8(dst + i, v_dst);
+ }
+
+ for ( ; i < n; i += 3, src += scn)
+ {
+ int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
+ int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
+ int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
+ dst[i] = saturate_cast<uchar>(Y);
+ dst[i+1] = saturate_cast<uchar>(Cr);
+ dst[i+2] = saturate_cast<uchar>(Cb);
+ }
+ }
+ int srccn, blueIdx, coeffs[5];
+ int16x4_t v_c0, v_c1, v_c2;
+ int32x4_t v_c3, v_c4, v_delta, v_delta2;
+};
+
+// NEON specialization: 16-bit RGB(A) -> YCrCb, fixed-point.
+// Works in s32 lanes throughout; 8-pixel then 4-pixel loops, scalar tail.
+template <>
+struct RGB2YCrCb_i<ushort>
+{
+ typedef ushort channel_type;
+
+ RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
+ : srccn(_srccn), blueIdx(_blueIdx)
+ {
+ static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
+ memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+ if (blueIdx==0)
+ std::swap(coeffs[0], coeffs[2]);
+
+ v_c0 = vdupq_n_s32(coeffs[0]);
+ v_c1 = vdupq_n_s32(coeffs[1]);
+ v_c2 = vdupq_n_s32(coeffs[2]);
+ v_c3 = vdupq_n_s32(coeffs[3]);
+ v_c4 = vdupq_n_s32(coeffs[4]);
+ v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift));
+ v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
+ }
+
+ void operator()(const ushort * src, ushort * dst, int n) const
+ {
+ int scn = srccn, bidx = blueIdx, i = 0;
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+ int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
+ n *= 3;
+
+ // 8 pixels per iteration
+ for ( ; i <= n - 24; i += 24, src += scn * 8)
+ {
+ uint16x8x3_t v_src, v_dst;
+ int32x4x3_t v_src0;
+
+ if (scn == 3)
+ v_src = vld3q_u16(src);
+ else
+ {
+ // drop the alpha plane from a 4-channel load
+ uint16x8x4_t v_src_ = vld4q_u16(src);
+ v_src.val[0] = v_src_.val[0];
+ v_src.val[1] = v_src_.val[1];
+ v_src.val[2] = v_src_.val[2];
+ }
+
+ // widen u16 -> s32 (values <= 65535, so reinterpret keeps them non-negative)
+ v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0])));
+ v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1])));
+ v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
+
+ int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+ v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
+ int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3);
+ v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
+ int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4);
+ v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
+
+ v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
+ v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
+ v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
+
+ int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+ v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
+ int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3);
+ v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
+ int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4);
+ v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
+
+ // saturating s32 -> u16 narrow, matching saturate_cast<ushort>
+ v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
+ v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
+ v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
+
+ vst3q_u16(dst + i, v_dst);
+ }
+
+ // 4 pixels per iteration
+ for ( ; i <= n - 12; i += 12, src += scn * 4)
+ {
+ uint16x4x3_t v_dst;
+ int32x4x3_t v_src0;
+
+ if (scn == 3)
+ {
+ uint16x4x3_t v_src = vld3_u16(src);
+ v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
+ v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
+ v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+ }
+ else
+ {
+ uint16x4x4_t v_src = vld4_u16(src);
+ v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
+ v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
+ v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+ }
+
+ int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+ v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift);
+ int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3);
+ v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift);
+ int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4);
+ v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);
+
+ v_dst.val[0] = vqmovun_s32(v_Y);
+ v_dst.val[1] = vqmovun_s32(v_Cr);
+ v_dst.val[2] = vqmovun_s32(v_Cb);
+
+ vst3_u16(dst + i, v_dst);
+ }
+
+ // scalar tail
+ for ( ; i < n; i += 3, src += scn)
+ {
+ int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
+ int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
+ int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
+ dst[i] = saturate_cast<ushort>(Y);
+ dst[i+1] = saturate_cast<ushort>(Cr);
+ dst[i+2] = saturate_cast<ushort>(Cb);
+ }
+ }
+ int srccn, blueIdx, coeffs[5];
+ int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
+};
+
+#endif
template<typename _Tp> struct YCrCb2RGB_f
{
float coeffs[4];
};
+#if CV_NEON
+
+// NEON specialization: float YCrCb -> RGB(A).
+// Input is always 3-channel; dcn selects 3- or 4-channel output, with the
+// alpha plane filled from v_alpha in the 4-channel path.
+template <>
+struct YCrCb2RGB_f<float>
+{
+ typedef float channel_type;
+
+ YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
+ : dstcn(_dstcn), blueIdx(_blueIdx)
+ {
+ static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
+ memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+ v_c0 = vdupq_n_f32(coeffs[0]);
+ v_c1 = vdupq_n_f32(coeffs[1]);
+ v_c2 = vdupq_n_f32(coeffs[2]);
+ v_c3 = vdupq_n_f32(coeffs[3]);
+ v_delta = vdupq_n_f32(ColorChannel<float>::half());
+ v_alpha = vdupq_n_f32(ColorChannel<float>::max());
+ }
+
+ void operator()(const float* src, float* dst, int n) const
+ {
+ int dcn = dstcn, bidx = blueIdx, i = 0;
+ const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
+ float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+ n *= 3;
+
+ if (dcn == 3)
+ for ( ; i <= n - 12; i += 12, dst += 12)
+ {
+ float32x4x3_t v_src = vld3q_f32(src + i), v_dst;
+ float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
+
+ // b/g/r written through bidx so BGR and RGB orders share one body
+ v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
+ v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
+ v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
+
+ vst3q_f32(dst, v_dst);
+ }
+ else
+ for ( ; i <= n - 12; i += 12, dst += 16)
+ {
+ float32x4x3_t v_src = vld3q_f32(src + i);
+ float32x4x4_t v_dst;
+ float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
+
+ v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
+ v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
+ v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
+ v_dst.val[3] = v_alpha;
+
+ vst4q_f32(dst, v_dst);
+ }
+
+ // scalar tail
+ for ( ; i < n; i += 3, dst += dcn)
+ {
+ float Y = src[i], Cr = src[i+1], Cb = src[i+2];
+
+ float b = Y + (Cb - delta)*C3;
+ float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
+ float r = Y + (Cr - delta)*C0;
+
+ dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
+ if( dcn == 4 )
+ dst[3] = alpha;
+ }
+ }
+ int dstcn, blueIdx;
+ float coeffs[4];
+ float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
+};
+
+#endif
template<typename _Tp> struct YCrCb2RGB_i
{
int coeffs[4];
};
+#if CV_NEON
-////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
-
-static const float sRGB2XYZ_D65[] =
+// NEON specialization: 8-bit YCrCb -> RGB(A), fixed-point.
+// (The RGB<->XYZ matrices formerly here are moved below the NEON code.)
+template <>
+struct YCrCb2RGB_i<uchar>
{
- 0.412453f, 0.357580f, 0.180423f,
- 0.212671f, 0.715160f, 0.072169f,
- 0.019334f, 0.119193f, 0.950227f
-};
+ typedef uchar channel_type;
-static const float XYZ2sRGB_D65[] =
-{
- 3.240479f, -1.53715f, -0.498535f,
- -0.969256f, 1.875991f, 0.041556f,
+ YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+ : dstcn(_dstcn), blueIdx(_blueIdx)
+ {
+ static const int coeffs0[] = {22987, -11698, -5636, 29049};
+ memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+ v_c0 = vdupq_n_s32(coeffs[0]);
+ v_c1 = vdupq_n_s32(coeffs[1]);
+ v_c2 = vdupq_n_s32(coeffs[2]);
+ v_c3 = vdupq_n_s32(coeffs[3]);
+ v_delta = vdup_n_s16(ColorChannel<uchar>::half());
+ v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); // rounding bias
+ v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
+ }
+
+ void operator()(const uchar* src, uchar* dst, int n) const
+ {
+ int dcn = dstcn, bidx = blueIdx, i = 0;
+ const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+ n *= 3;
+
+ // 8 pixels per iteration; dst advances by dcn*8 (24 or 32 bytes)
+ for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+ {
+ uint8x8x3_t v_src = vld3_u8(src + i);
+ int16x8x3_t v_src16;
+ v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
+ v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
+ v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+
+ int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
+ v_Cr = vget_low_s16(v_src16.val[1]),
+ v_Cb = vget_low_s16(v_src16.val[2]);
+
+ // descale the chroma products first, then widen-add Y (matches the tail)
+ int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
+ v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
+ v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
+ v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
+
+ v_Y = vget_high_s16(v_src16.val[0]);
+ v_Cr = vget_high_s16(v_src16.val[1]);
+ v_Cb = vget_high_s16(v_src16.val[2]);
+
+ int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
+ v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
+ v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
+ v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
+
+ // NOTE(review): vmovn_s32 truncates; the s32 results are presumed to fit
+ // s16 for 8-bit inputs before vqmovun_s16 saturates to u8 — confirm range.
+ uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1)));
+ uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1)));
+ uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1)));
+
+ if (dcn == 3)
+ {
+ uint8x8x3_t v_dst;
+ v_dst.val[bidx] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[bidx^2] = v_r;
+ vst3_u8(dst, v_dst);
+ }
+ else
+ {
+ uint8x8x4_t v_dst;
+ v_dst.val[bidx] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[bidx^2] = v_r;
+ v_dst.val[3] = v_alpha;
+ vst4_u8(dst, v_dst);
+ }
+ }
+
+ // scalar tail
+ for ( ; i < n; i += 3, dst += dcn)
+ {
+ uchar Y = src[i];
+ uchar Cr = src[i+1];
+ uchar Cb = src[i+2];
+
+ int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
+ int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
+ int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
+
+ dst[bidx] = saturate_cast<uchar>(b);
+ dst[1] = saturate_cast<uchar>(g);
+ dst[bidx^2] = saturate_cast<uchar>(r);
+ if( dcn == 4 )
+ dst[3] = alpha;
+ }
+ }
+ int dstcn, blueIdx;
+ int coeffs[4];
+
+ int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2;
+ int16x4_t v_delta;
+ uint8x8_t v_alpha;
+};
+
+// NEON specialization: 16-bit YCrCb -> RGB(A), fixed-point in s32 lanes.
+// 8-pixel and 4-pixel vector loops plus a scalar tail; v_alpha/v_alpha2
+// supply the alpha plane for 4-channel output.
+template <>
+struct YCrCb2RGB_i<ushort>
+{
+ typedef ushort channel_type;
+
+ YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+ : dstcn(_dstcn), blueIdx(_blueIdx)
+ {
+ static const int coeffs0[] = {22987, -11698, -5636, 29049};
+ memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+ v_c0 = vdupq_n_s32(coeffs[0]);
+ v_c1 = vdupq_n_s32(coeffs[1]);
+ v_c2 = vdupq_n_s32(coeffs[2]);
+ v_c3 = vdupq_n_s32(coeffs[3]);
+ v_delta = vdupq_n_s32(ColorChannel<ushort>::half());
+ v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
+ v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
+ v_alpha2 = vget_low_u16(v_alpha); // 64-bit alpha for the 4-pixel loop
+ }
+
+ void operator()(const ushort* src, ushort* dst, int n) const
+ {
+ int dcn = dstcn, bidx = blueIdx, i = 0;
+ const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+ n *= 3;
+
+ // 8 pixels per iteration
+ for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+ {
+ uint16x8x3_t v_src = vld3q_u16(src + i);
+
+ int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
+ v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
+ v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
+
+ int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
+ v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
+ v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
+ v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
+
+ // comma-operator chain: re-load Y/Cr/Cb from the high halves
+ v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))),
+ v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))),
+ v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
+
+ int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
+ v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
+ v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
+ v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
+
+ // saturating s32 -> u16 narrow mirrors saturate_cast<ushort>
+ uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1));
+ uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1));
+ uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1));
+
+ if (dcn == 3)
+ {
+ uint16x8x3_t v_dst;
+ v_dst.val[bidx] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[bidx^2] = v_r;
+ vst3q_u16(dst, v_dst);
+ }
+ else
+ {
+ uint16x8x4_t v_dst;
+ v_dst.val[bidx] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[bidx^2] = v_r;
+ v_dst.val[3] = v_alpha;
+ vst4q_u16(dst, v_dst);
+ }
+ }
+
+ // 4 pixels per iteration
+ for ( ; i <= n - 12; i += 12, dst += dcn * 4)
+ {
+ uint16x4x3_t v_src = vld3_u16(src + i);
+
+ int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
+ v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
+ v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+
+ int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
+ v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
+ v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y);
+ int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0);
+ v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y);
+
+ uint16x4_t v_bd = vqmovun_s32(v_b);
+ uint16x4_t v_gd = vqmovun_s32(v_g);
+ uint16x4_t v_rd = vqmovun_s32(v_r);
+
+ if (dcn == 3)
+ {
+ uint16x4x3_t v_dst;
+ v_dst.val[bidx] = v_bd;
+ v_dst.val[1] = v_gd;
+ v_dst.val[bidx^2] = v_rd;
+ vst3_u16(dst, v_dst);
+ }
+ else
+ {
+ uint16x4x4_t v_dst;
+ v_dst.val[bidx] = v_bd;
+ v_dst.val[1] = v_gd;
+ v_dst.val[bidx^2] = v_rd;
+ v_dst.val[3] = v_alpha2;
+ vst4_u16(dst, v_dst);
+ }
+ }
+
+ // scalar tail
+ for ( ; i < n; i += 3, dst += dcn)
+ {
+ ushort Y = src[i];
+ ushort Cr = src[i+1];
+ ushort Cb = src[i+2];
+
+ int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
+ int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
+ int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
+
+ dst[bidx] = saturate_cast<ushort>(b);
+ dst[1] = saturate_cast<ushort>(g);
+ dst[bidx^2] = saturate_cast<ushort>(r);
+ if( dcn == 4 )
+ dst[3] = alpha;
+ }
+ }
+ int dstcn, blueIdx;
+ int coeffs[4];
+
+ int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta;
+ uint16x8_t v_alpha;
+ uint16x4_t v_alpha2;
+};
+
+#endif
+
+////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
+
+// D65 white-point conversion matrices (re-added here after being moved
+// below the NEON YCrCb specializations).
+static const float sRGB2XYZ_D65[] =
+{
+ 0.412453f, 0.357580f, 0.180423f,
+ 0.212671f, 0.715160f, 0.072169f,
+ 0.019334f, 0.119193f, 0.950227f
+};
+
+static const float XYZ2sRGB_D65[] =
+{
+ 3.240479f, -1.53715f, -0.498535f,
+ -0.969256f, 1.875991f, 0.041556f,
 0.055648f, -0.204043f, 1.057311f
};
float coeffs[9];
};
+#if CV_NEON
+
+// NEON specialization: float RGB(A) -> XYZ via a 3x3 matrix multiply,
+// 4 pixels per iteration with a scalar tail.
+template <>
+struct RGB2XYZ_f<float>
+{
+ typedef float channel_type;
+
+ RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+ {
+ memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
+ if(blueIdx == 0)
+ {
+ // swap the R and B columns of the matrix for BGR input
+ std::swap(coeffs[0], coeffs[2]);
+ std::swap(coeffs[3], coeffs[5]);
+ std::swap(coeffs[6], coeffs[8]);
+ }
+
+ v_c0 = vdupq_n_f32(coeffs[0]);
+ v_c1 = vdupq_n_f32(coeffs[1]);
+ v_c2 = vdupq_n_f32(coeffs[2]);
+ v_c3 = vdupq_n_f32(coeffs[3]);
+ v_c4 = vdupq_n_f32(coeffs[4]);
+ v_c5 = vdupq_n_f32(coeffs[5]);
+ v_c6 = vdupq_n_f32(coeffs[6]);
+ v_c7 = vdupq_n_f32(coeffs[7]);
+ v_c8 = vdupq_n_f32(coeffs[8]);
+ }
+
+ void operator()(const float* src, float* dst, int n) const
+ {
+ int scn = srccn, i = 0;
+ float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+ C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+ C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+ n *= 3;
+
+ if (scn == 3)
+ for ( ; i <= n - 12; i += 12, src += 12)
+ {
+ float32x4x3_t v_src = vld3q_f32(src), v_dst;
+ v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
+ v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
+ v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
+ vst3q_f32(dst + i, v_dst);
+ }
+ else
+ for ( ; i <= n - 12; i += 12, src += 16)
+ {
+ // 4-channel source: alpha plane ignored
+ float32x4x4_t v_src = vld4q_f32(src);
+ float32x4x3_t v_dst;
+ v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
+ v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
+ v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
+ vst3q_f32(dst + i, v_dst);
+ }
+
+ // scalar tail
+ for ( ; i < n; i += 3, src += scn)
+ {
+ float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
+ float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
+ float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
+ dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
+ }
+ }
+
+ int srccn;
+ float coeffs[9];
+ float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+};
+
+#endif
// Generic fixed-point RGB -> XYZ functor; the NEON specializations below
// replace it for uchar/ushort.
// NOTE(review): only the coefficient table is visible here — this looks like
// collapsed diff context; confirm the full primary template in the file.
template<typename _Tp> struct RGB2XYZ_i
{
int coeffs[9]; // 3x3 conversion matrix, scaled by 1 << xyz_shift
};
+#if CV_NEON
+
+// NEON specialization of RGB2XYZ_i for 8-bit channels: converts interleaved
+// 3- or 4-channel pixels to XYZ triples using a fixed-point 3x3 matrix
+// scaled by 1 << xyz_shift.
+template <>
+struct RGB2XYZ_i<uchar>
+{
+ typedef uchar channel_type;
+
+ // _srccn: number of source channels (3 or 4).
+ // blueIdx == 0 swaps the first and last matrix columns (reversed channel
+ // order, i.e. BGR-style input — confirm against callers).
+ // _coeffs: optional float matrix, rounded to fixed point; NULL selects
+ // the built-in constants below.
+ RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+ {
+ // Default matrix; each row sums to ~4096, which suggests
+ // xyz_shift == 12 (defined elsewhere in this file) — TODO confirm.
+ static const int coeffs0[] =
+ {
+ 1689, 1465, 739,
+ 871, 2929, 296,
+ 79, 488, 3892
+ };
+ for( int i = 0; i < 9; i++ )
+ coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
+ if(blueIdx == 0)
+ {
+ // Swap columns 0 and 2 of every row.
+ std::swap(coeffs[0], coeffs[2]);
+ std::swap(coeffs[3], coeffs[5]);
+ std::swap(coeffs[6], coeffs[8]);
+ }
+
+ // Broadcast each coefficient into a 4-lane u16 vector for vmull/vmlal.
+ v_c0 = vdup_n_u16(coeffs[0]);
+ v_c1 = vdup_n_u16(coeffs[1]);
+ v_c2 = vdup_n_u16(coeffs[2]);
+ v_c3 = vdup_n_u16(coeffs[3]);
+ v_c4 = vdup_n_u16(coeffs[4]);
+ v_c5 = vdup_n_u16(coeffs[5]);
+ v_c6 = vdup_n_u16(coeffs[6]);
+ v_c7 = vdup_n_u16(coeffs[7]);
+ v_c8 = vdup_n_u16(coeffs[8]);
+ v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); // rounding bias (half ulp)
+ }
+ // src: n pixels with scn interleaved channels; dst: n XYZ triples.
+ void operator()(const uchar * src, uchar * dst, int n) const
+ {
+ int scn = srccn, i = 0;
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+ C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+ C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+ n *= 3; // i now counts output values (3 per pixel)
+
+ // Vector path: 8 pixels (24 output values) per iteration.
+ for ( ; i <= n - 24; i += 24, src += scn * 8)
+ {
+ uint8x8x3_t v_dst;
+ uint16x8x3_t v_src16;
+
+ if (scn == 3)
+ {
+ // De-interleave 8 pixels and widen u8 -> u16.
+ uint8x8x3_t v_src = vld3_u8(src);
+ v_src16.val[0] = vmovl_u8(v_src.val[0]);
+ v_src16.val[1] = vmovl_u8(v_src.val[1]);
+ v_src16.val[2] = vmovl_u8(v_src.val[2]);
+ }
+ else
+ {
+ // 4-channel input: the 4th plane (val[3]) is ignored.
+ uint8x8x4_t v_src = vld4_u8(src);
+ v_src16.val[0] = vmovl_u8(v_src.val[0]);
+ v_src16.val[1] = vmovl_u8(v_src.val[1]);
+ v_src16.val[2] = vmovl_u8(v_src.val[2]);
+ }
+
+ // Low 4 pixels: widening multiply-accumulate into u32 lanes.
+ uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]),
+ v_s1 = vget_low_u16(v_src16.val[1]),
+ v_s2 = vget_low_u16(v_src16.val[2]);
+
+ uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ // Round (add half) then descale.
+ v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
+ v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
+ v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
+
+ // High 4 pixels, same computation.
+ v_s0 = vget_high_u16(v_src16.val[0]),
+ v_s1 = vget_high_u16(v_src16.val[1]),
+ v_s2 = vget_high_u16(v_src16.val[2]);
+
+ uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
+ v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
+ v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
+
+ // Narrow u32 -> u16, then saturating-narrow to the uchar output.
+ v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
+ v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
+ v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));
+
+ vst3_u8(dst + i, v_dst);
+ }
+
+ // Scalar tail for the remaining (< 8) pixels.
+ for ( ; i < n; i += 3, src += scn)
+ {
+ int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
+ int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
+ int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
+ dst[i] = saturate_cast<uchar>(X);
+ dst[i+1] = saturate_cast<uchar>(Y);
+ dst[i+2] = saturate_cast<uchar>(Z);
+ }
+ }
+
+ int srccn, coeffs[9]; // source channel count; fixed-point 3x3 matrix
+ uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+ uint32x4_t v_delta; // 1 << (xyz_shift - 1), rounding bias
+};
+
+// NEON specialization of RGB2XYZ_i for 16-bit channels. Same fixed-point
+// matrix as the uchar path, but outputs are narrowed with saturation
+// (vqmovn_u32) since a user-supplied matrix could exceed the u16 range.
+template <>
+struct RGB2XYZ_i<ushort>
+{
+ typedef ushort channel_type;
+
+ // _srccn: 3 or 4 source channels; blueIdx == 0 swaps the first and last
+ // matrix columns (reversed channel order); _coeffs: optional float
+ // matrix rounded to fixed point, NULL selects the defaults.
+ RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+ {
+ static const int coeffs0[] =
+ {
+ 1689, 1465, 739,
+ 871, 2929, 296,
+ 79, 488, 3892
+ };
+ for( int i = 0; i < 9; i++ )
+ coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
+ if(blueIdx == 0)
+ {
+ // Swap columns 0 and 2 of every row.
+ std::swap(coeffs[0], coeffs[2]);
+ std::swap(coeffs[3], coeffs[5]);
+ std::swap(coeffs[6], coeffs[8]);
+ }
+
+ // Broadcast coefficients for widening u16 multiply-accumulates.
+ v_c0 = vdup_n_u16(coeffs[0]);
+ v_c1 = vdup_n_u16(coeffs[1]);
+ v_c2 = vdup_n_u16(coeffs[2]);
+ v_c3 = vdup_n_u16(coeffs[3]);
+ v_c4 = vdup_n_u16(coeffs[4]);
+ v_c5 = vdup_n_u16(coeffs[5]);
+ v_c6 = vdup_n_u16(coeffs[6]);
+ v_c7 = vdup_n_u16(coeffs[7]);
+ v_c8 = vdup_n_u16(coeffs[8]);
+ v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); // rounding bias
+ }
+
+ // src: n pixels with scn interleaved channels; dst: n XYZ triples.
+ void operator()(const ushort * src, ushort * dst, int n) const
+ {
+ int scn = srccn, i = 0;
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+ C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+ C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+ n *= 3; // i counts output values (3 per pixel)
+
+ // Main vector path: 8 pixels per iteration.
+ for ( ; i <= n - 24; i += 24, src += scn * 8)
+ {
+ uint16x8x3_t v_src, v_dst;
+
+ if (scn == 3)
+ v_src = vld3q_u16(src);
+ else
+ {
+ // 4-channel input: the 4th plane is dropped.
+ uint16x8x4_t v_src4 = vld4q_u16(src);
+ v_src.val[0] = v_src4.val[0];
+ v_src.val[1] = v_src4.val[1];
+ v_src.val[2] = v_src4.val[2];
+ }
+
+ // Low 4 pixels: widening multiply-accumulate into u32 lanes.
+ uint16x4_t v_s0 = vget_low_u16(v_src.val[0]),
+ v_s1 = vget_low_u16(v_src.val[1]),
+ v_s2 = vget_low_u16(v_src.val[2]);
+
+ uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ // Round then descale.
+ v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
+ v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
+ v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
+
+ // High 4 pixels, same computation.
+ v_s0 = vget_high_u16(v_src.val[0]),
+ v_s1 = vget_high_u16(v_src.val[1]),
+ v_s2 = vget_high_u16(v_src.val[2]);
+
+ uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
+ v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
+ v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
+
+ // Saturating narrow u32 -> u16.
+ v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
+ v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
+ v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));
+
+ vst3q_u16(dst + i, v_dst);
+ }
+
+ // Half-width vector path: 4 pixels per iteration.
+ for ( ; i <= n - 12; i += 12, src += scn * 4)
+ {
+ uint16x4x3_t v_dst;
+ uint16x4_t v_s0, v_s1, v_s2;
+
+ if (scn == 3)
+ {
+ uint16x4x3_t v_src = vld3_u16(src);
+ v_s0 = v_src.val[0];
+ v_s1 = v_src.val[1];
+ v_s2 = v_src.val[2];
+ }
+ else
+ {
+ uint16x4x4_t v_src = vld4_u16(src);
+ v_s0 = v_src.val[0];
+ v_s1 = v_src.val[1];
+ v_s2 = v_src.val[2];
+ }
+
+ uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+
+ // Round, descale, saturating narrow to u16.
+ v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift));
+ v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift));
+ v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift));
+
+ vst3_u16(dst + i, v_dst);
+ }
+
+ // Scalar tail for the remaining (< 4) pixels.
+ for ( ; i < n; i += 3, src += scn)
+ {
+ int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
+ int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
+ int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
+ dst[i] = saturate_cast<ushort>(X);
+ dst[i+1] = saturate_cast<ushort>(Y);
+ dst[i+2] = saturate_cast<ushort>(Z);
+ }
+ }
+
+ int srccn, coeffs[9]; // source channel count; fixed-point 3x3 matrix
+ uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+ uint32x4_t v_delta; // 1 << (xyz_shift - 1), rounding bias
+};
+
+#endif
// Generic floating-point XYZ -> RGB functor.
// NOTE(review): an _f variant would normally hold float coefficients; the
// int array here looks like collapsed diff context — confirm against the
// full file before relying on this declaration.
template<typename _Tp> struct XYZ2RGB_f
{
int coeffs[9];
};
+#if CV_NEON
+
+// NEON specialization of XYZ2RGB_i for 8-bit channels. The inverse matrix
+// has negative entries, so the vector path uses signed s16/s32 arithmetic
+// and vqmovun to clamp negative results to 0 on output.
+template <>
+struct XYZ2RGB_i<uchar>
+{
+ typedef uchar channel_type;
+
+ // _dstcn: output channels (3 or 4; the 4th is filled with max alpha).
+ // _blueIdx == 0 swaps the first and last matrix rows (reversed output
+ // channel order).
+ // NOTE(review): _coeffs is const int* yet is scaled by (1 << xyz_shift)
+ // via cvRound — presumably callers pass an unscaled matrix; confirm
+ // against the generic XYZ2RGB_i template.
+ XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+ : dstcn(_dstcn), blueIdx(_blueIdx)
+ {
+ // Default fixed-point inverse matrix (note the negative entries).
+ static const int coeffs0[] =
+ {
+ 13273, -6296, -2042,
+ -3970, 7684, 170,
+ 228, -836, 4331
+ };
+ for(int i = 0; i < 9; i++)
+ coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
+
+ if(blueIdx == 0)
+ {
+ // Swap rows 0 and 2 of the matrix.
+ std::swap(coeffs[0], coeffs[6]);
+ std::swap(coeffs[1], coeffs[7]);
+ std::swap(coeffs[2], coeffs[8]);
+ }
+
+ // Broadcast signed coefficients for widening s16 multiply-accumulates.
+ v_c0 = vdup_n_s16(coeffs[0]);
+ v_c1 = vdup_n_s16(coeffs[1]);
+ v_c2 = vdup_n_s16(coeffs[2]);
+ v_c3 = vdup_n_s16(coeffs[3]);
+ v_c4 = vdup_n_s16(coeffs[4]);
+ v_c5 = vdup_n_s16(coeffs[5]);
+ v_c6 = vdup_n_s16(coeffs[6]);
+ v_c7 = vdup_n_s16(coeffs[7]);
+ v_c8 = vdup_n_s16(coeffs[8]);
+ v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); // rounding bias
+ v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max())); // opaque alpha
+ }
+
+ // src: n XYZ triples; dst: n pixels with dcn interleaved channels.
+ void operator()(const uchar* src, uchar* dst, int n) const
+ {
+ int dcn = dstcn, i = 0;
+ uchar alpha = ColorChannel<uchar>::max();
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+ C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+ C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+ n *= 3; // i counts input values (3 per pixel)
+
+ // Vector path: 8 pixels per iteration.
+ for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+ {
+ // De-interleave and widen u8 -> s16 (values <= 255, so sign-safe).
+ uint8x8x3_t v_src = vld3_u8(src + i);
+ int16x8x3_t v_src16;
+ v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
+ v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
+ v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+
+ // Low 4 pixels: signed widening multiply-accumulate.
+ int16x4_t v_s0 = vget_low_s16(v_src16.val[0]),
+ v_s1 = vget_low_s16(v_src16.val[1]),
+ v_s2 = vget_low_s16(v_src16.val[2]);
+
+ int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ // Round then descale (arithmetic shift keeps the sign).
+ v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
+ v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
+ v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
+
+ // High 4 pixels, same computation.
+ v_s0 = vget_high_s16(v_src16.val[0]),
+ v_s1 = vget_high_s16(v_src16.val[1]),
+ v_s2 = vget_high_s16(v_src16.val[2]);
+
+ int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
+ v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
+ v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
+
+ // Saturating narrow s32 -> s16, then s16 -> u8 (negatives clamp to 0).
+ uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1)));
+ uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
+ uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1)));
+
+ if (dcn == 3)
+ {
+ uint8x8x3_t v_dst;
+ v_dst.val[0] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[2] = v_r;
+ vst3_u8(dst, v_dst);
+ }
+ else
+ {
+ // 4-channel output: append a constant opaque alpha plane.
+ uint8x8x4_t v_dst;
+ v_dst.val[0] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[2] = v_r;
+ v_dst.val[3] = v_alpha;
+ vst4_u8(dst, v_dst);
+ }
+ }
+
+ // Scalar tail for the remaining (< 8) pixels.
+ for ( ; i < n; i += 3, dst += dcn)
+ {
+ int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
+ int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
+ int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
+ dst[0] = saturate_cast<uchar>(B); dst[1] = saturate_cast<uchar>(G);
+ dst[2] = saturate_cast<uchar>(R);
+ if( dcn == 4 )
+ dst[3] = alpha;
+ }
+ }
+ int dstcn, blueIdx; // output channel count; blue-channel index
+ int coeffs[9]; // fixed-point 3x3 inverse matrix (signed)
+
+ int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+ uint8x8_t v_alpha; // broadcast opaque alpha for dcn == 4
+ int32x4_t v_delta; // 1 << (xyz_shift - 1), rounding bias
+};
+
+// NEON specialization of XYZ2RGB_i for 16-bit channels. Products of a
+// ushort sample and a 16-bit coefficient need 32-bit lanes, so the whole
+// pipeline runs in s32 (vmlaq_s32) and narrows with vqmovun_s32, which
+// clamps negative results to 0.
+template <>
+struct XYZ2RGB_i<ushort>
+{
+ typedef ushort channel_type;
+
+ // _dstcn: output channels (3 or 4; the 4th is filled with max alpha).
+ // _blueIdx == 0 swaps the first and last matrix rows.
+ // NOTE(review): _coeffs is const int* yet is scaled by (1 << xyz_shift)
+ // via cvRound — presumably callers pass an unscaled matrix; confirm.
+ XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+ : dstcn(_dstcn), blueIdx(_blueIdx)
+ {
+ // Default fixed-point inverse matrix (note the negative entries).
+ static const int coeffs0[] =
+ {
+ 13273, -6296, -2042,
+ -3970, 7684, 170,
+ 228, -836, 4331
+ };
+ for(int i = 0; i < 9; i++)
+ coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
+
+ if(blueIdx == 0)
+ {
+ // Swap rows 0 and 2 of the matrix.
+ std::swap(coeffs[0], coeffs[6]);
+ std::swap(coeffs[1], coeffs[7]);
+ std::swap(coeffs[2], coeffs[8]);
+ }
+
+ // Broadcast signed coefficients into full-width s32 vectors.
+ v_c0 = vdupq_n_s32(coeffs[0]);
+ v_c1 = vdupq_n_s32(coeffs[1]);
+ v_c2 = vdupq_n_s32(coeffs[2]);
+ v_c3 = vdupq_n_s32(coeffs[3]);
+ v_c4 = vdupq_n_s32(coeffs[4]);
+ v_c5 = vdupq_n_s32(coeffs[5]);
+ v_c6 = vdupq_n_s32(coeffs[6]);
+ v_c7 = vdupq_n_s32(coeffs[7]);
+ v_c8 = vdupq_n_s32(coeffs[8]);
+ v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); // rounding bias
+ v_alpha = vdupq_n_u16(ColorChannel<ushort>::max()); // opaque alpha (8-wide)
+ v_alpha2 = vget_low_u16(v_alpha); // 4-wide alpha for the half-width loop
+ }
+
+ // src: n XYZ triples; dst: n pixels with dcn interleaved channels.
+ void operator()(const ushort* src, ushort* dst, int n) const
+ {
+ int dcn = dstcn, i = 0;
+ ushort alpha = ColorChannel<ushort>::max();
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+ C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+ C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+ n *= 3; // i counts input values (3 per pixel)
+
+ // Main vector path: 8 pixels per iteration.
+ for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+ {
+ // De-interleave; widen u16 -> u32 and reinterpret as s32
+ // (values <= 65535, so the sign bit stays clear).
+ uint16x8x3_t v_src = vld3q_u16(src + i);
+ int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
+ v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
+ v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
+
+ // Low 4 pixels: s32 multiply-accumulate.
+ int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ // Round then descale (arithmetic shift keeps the sign).
+ v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
+ v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
+ v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
+
+ // High 4 pixels, same computation.
+ v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
+ v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
+ v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
+
+ int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
+ v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
+ v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
+
+ // Saturating narrow s32 -> u16 (negatives clamp to 0).
+ uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
+ uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
+ uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
+
+ if (dcn == 3)
+ {
+ uint16x8x3_t v_dst;
+ v_dst.val[0] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[2] = v_r;
+ vst3q_u16(dst, v_dst);
+ }
+ else
+ {
+ // 4-channel output: append a constant opaque alpha plane.
+ uint16x8x4_t v_dst;
+ v_dst.val[0] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[2] = v_r;
+ v_dst.val[3] = v_alpha;
+ vst4q_u16(dst, v_dst);
+ }
+ }
+
+ // Half-width vector path: 4 pixels per iteration.
+ for ( ; i <= n - 12; i += 12, dst += dcn * 4)
+ {
+ uint16x4x3_t v_src = vld3_u16(src + i);
+ int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
+ v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
+ v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+
+ int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+ int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+ int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+ v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
+ v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
+ v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
+
+ uint16x4_t v_b = vqmovun_s32(v_X);
+ uint16x4_t v_g = vqmovun_s32(v_Y);
+ uint16x4_t v_r = vqmovun_s32(v_Z);
+
+ if (dcn == 3)
+ {
+ uint16x4x3_t v_dst;
+ v_dst.val[0] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[2] = v_r;
+ vst3_u16(dst, v_dst);
+ }
+ else
+ {
+ uint16x4x4_t v_dst;
+ v_dst.val[0] = v_b;
+ v_dst.val[1] = v_g;
+ v_dst.val[2] = v_r;
+ v_dst.val[3] = v_alpha2;
+ vst4_u16(dst, v_dst);
+ }
+ }
+
+ // Scalar tail for the remaining (< 4) pixels.
+ for ( ; i < n; i += 3, dst += dcn)
+ {
+ int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
+ int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
+ int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
+ dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
+ dst[2] = saturate_cast<ushort>(R);
+ if( dcn == 4 )
+ dst[3] = alpha;
+ }
+ }
+ int dstcn, blueIdx; // output channel count; blue-channel index
+ int coeffs[9]; // fixed-point 3x3 inverse matrix (signed)
+
+ int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
+ uint16x4_t v_alpha2; // 4-wide opaque alpha for the half-width loop
+ uint16x8_t v_alpha; // 8-wide opaque alpha for the main loop
+};
+
+#endif
////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////