From a54f6bb08bbd63c16449f685ce1e9cdf8f10ac13 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Sun, 5 Oct 2014 09:48:31 -0700
Subject: [PATCH] cv::cvtColor

---
 modules/imgproc/perf/perf_blur.cpp |    7 +-
 modules/imgproc/src/color.cpp      | 1375 +++++++++++++++++++++++++++++++++++-
 modules/imgproc/src/imgwarp.cpp    |    8 -
 3 files changed, 1369 insertions(+), 21 deletions(-)
diff --git a/modules/imgproc/perf/perf_blur.cpp b/modules/imgproc/perf/perf_blur.cpp
index a6e31d3..3fc953e 100644
--- a/modules/imgproc/perf/perf_blur.cpp
+++ b/modules/imgproc/perf/perf_blur.cpp
@@ -98,6 +98,11 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16,
     Size size = get<0>(GetParam());
     int type = get<1>(GetParam());
     BorderType btype = get<2>(GetParam());
+    double eps = 1e-3;
+
+#if CV_NEON
+    eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : eps;
+#endif
 
     Mat src(size, type);
     Mat dst(size, type);
@@ -106,7 +111,7 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16,
 
     TEST_CYCLE() blur(src, dst, Size(16,16), Point(-1,-1), btype);
 
-    SANITY_CHECK(dst, 1e-3);
+    SANITY_CHECK(dst, eps);
 }
 
 PERF_TEST_P(Size_MatType_BorderType3x3, box3x3,
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 78236e0..27da795 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -769,7 +769,6 @@ template<typename _Tp> struct RGB2Gray
     float coeffs[3];
 };
 
-
 template<> struct RGB2Gray<uchar>
 {
     typedef uchar channel_type;
@@ -800,6 +799,166 @@ template<> struct RGB2Gray<uchar>
     int tab[256*3];
 };
 
+#if CV_NEON
+
+template <> // NEON specialization of RGB2Gray for 16-bit unsigned channels; compiled only inside the surrounding #if CV_NEON.
+struct RGB2Gray<ushort>
+{
+    typedef ushort channel_type;
+
+    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
+        srccn(_srccn)
+    {
+        static const int coeffs0[] = { R2Y, G2Y, B2Y }; // default weights, used when _coeffs is null
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
+        if( blueIdx == 0 ) // blueIdx == 0: exchange the first and the last weight
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_cb = vdup_n_u16(coeffs[0]); // NOTE(review): int weights are placed in 16-bit lanes -- confirm callers never pass weights above 65535
+        v_cg = vdup_n_u16(coeffs[1]);
+        v_cr = vdup_n_u16(coeffs[2]);
+        v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); // rounding term added before the right shift by yuv_shift
+    }
+
+    void operator()(const ushort* src, ushort* dst, int n) const // writes n values to dst, reading n*srccn values from src
+    {
+        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
+
+        for ( ; i <= n - 8; i += 8, src += scn * 8) // main loop: 8 pixels per iteration
+        {
+            uint16x8_t v_b, v_r, v_g;
+            if (scn == 3)
+            {
+                uint16x8x3_t v_src = vld3q_u16(src); // de-interleaving load: one register per channel
+                v_b = v_src.val[0];
+                v_g = v_src.val[1];
+                v_r = v_src.val[2];
+            }
+            else
+            {
+                uint16x8x4_t v_src = vld4q_u16(src); // 4-channel load; val[3] is loaded but never used
+                v_b = v_src.val[0];
+                v_g = v_src.val[1];
+                v_r = v_src.val[2];
+            }
+
+            uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16( // widening multiply-accumulate of the low 4 lanes into 32 bits
+                                           vmull_u16(vget_low_u16(v_b), v_cb),
+                                                     vget_low_u16(v_g), v_cg),
+                                                     vget_low_u16(v_r), v_cr);
+            uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16( // same for the high 4 lanes
+                                           vmull_u16(vget_high_u16(v_b), v_cb),
+                                                     vget_high_u16(v_g), v_cg),
+                                                     vget_high_u16(v_r), v_cr);
+
+            uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift)); // add rounding term, shift, narrow to 16 bits
+            uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));
+
+            vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
+        }
+
+        for ( ; i <= n - 4; i += 4, src += scn * 4) // secondary loop: 4 pixels per iteration using 64-bit registers
+        {
+            uint16x4_t v_b, v_r, v_g;
+            if (scn == 3)
+            {
+                uint16x4x3_t v_src = vld3_u16(src);
+                v_b = v_src.val[0];
+                v_g = v_src.val[1];
+                v_r = v_src.val[2];
+            }
+            else
+            {
+                uint16x4x4_t v_src = vld4_u16(src); // val[3] is ignored here as well
+                v_b = v_src.val[0];
+                v_g = v_src.val[1];
+                v_r = v_src.val[2];
+            }
+
+            uint32x4_t v_dst = vmlal_u16(vmlal_u16(
+                                         vmull_u16(v_b, v_cb),
+                                                   v_g, v_cg),
+                                                   v_r, v_cr);
+
+            vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
+        }
+
+        for( ; i < n; i++, src += scn) // scalar tail for the remaining pixels
+            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
+    }
+
+    int srccn, coeffs[3]; // source channel count and the three integer weights
+    uint16x4_t v_cb, v_cg, v_cr; // the weights broadcast to 16-bit lanes
+    uint32x4_t v_delta; // broadcast rounding term
+};
+
+template <> // NEON specialization of RGB2Gray for float channels: dst = src[0]*c0 + src[1]*c1 + src[2]*c2 per pixel.
+struct RGB2Gray<float>
+{
+    typedef float channel_type;
+
+    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+    {
+        static const float coeffs0[] = { 0.299f, 0.587f, 0.114f }; // default weights, used when _coeffs is null
+        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
+        if(blueIdx == 0) // blueIdx == 0: exchange the first and the last weight
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_cb = vdupq_n_f32(coeffs[0]); // broadcast each weight to all four lanes
+        v_cg = vdupq_n_f32(coeffs[1]);
+        v_cr = vdupq_n_f32(coeffs[2]);
+    }
+
+    void operator()(const float * src, float * dst, int n) const // writes n values to dst, reading n*srccn values from src
+    {
+        int scn = srccn, i = 0;
+        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
+
+        if (scn == 3)
+        {
+            for ( ; i <= n - 8; i += 8, src += scn * 8) // 8 pixels per iteration, handled as two groups of 4
+            {
+                float32x4x3_t v_src = vld3q_f32(src); // de-interleaving load: one register per channel
+                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+
+                v_src = vld3q_f32(src + scn * 4); // second group of 4 pixels
+                vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+            }
+
+            for ( ; i <= n - 4; i += 4, src += scn * 4) // at most one more group of 4 pixels
+            {
+                float32x4x3_t v_src = vld3q_f32(src);
+                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+            }
+        }
+        else // every srccn other than 3 takes the 4-channel load path; val[3] is loaded but unused
+        {
+            for ( ; i <= n - 8; i += 8, src += scn * 8)
+            {
+                float32x4x4_t v_src = vld4q_f32(src);
+                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+
+                v_src = vld4q_f32(src + scn * 4);
+                vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+            }
+
+            for ( ; i <= n - 4; i += 4, src += scn * 4)
+            {
+                float32x4x4_t v_src = vld4q_f32(src);
+                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
+            }
+        }
+
+        for ( ; i < n; i++, src += scn) // scalar tail for the remaining pixels
+            dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
+    }
+
+    int srccn; // source channel count
+    float coeffs[3]; // scalar weights, used by the tail loop
+    float32x4_t v_cb, v_cg, v_cr; // the same weights broadcast for the vector loops
+};
+
+#else
 
 template<> struct RGB2Gray<ushort>
 {
@@ -823,6 +982,7 @@ template<> struct RGB2Gray<ushort>
     int coeffs[3];
 };
 
+#endif
 
 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
 
@@ -855,6 +1015,72 @@ template<typename _Tp> struct RGB2YCrCb_f
     float coeffs[5];
 };
 
+#if CV_NEON
+
+template <> // NEON specialization of RGB2YCrCb_f for float channels; each pixel yields three outputs (Y, Cr, Cb).
+struct RGB2YCrCb_f<float>
+{
+    typedef float channel_type;
+
+    RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
+        srccn(_srccn), blueIdx(_blueIdx)
+    {
+        static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; // defaults: three Y weights, then the Cr and Cb scale factors
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        if(blueIdx==0) // blueIdx == 0: exchange the first and the last Y weight
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_c0 = vdupq_n_f32(coeffs[0]);
+        v_c1 = vdupq_n_f32(coeffs[1]);
+        v_c2 = vdupq_n_f32(coeffs[2]);
+        v_c3 = vdupq_n_f32(coeffs[3]);
+        v_c4 = vdupq_n_f32(coeffs[4]);
+        v_delta = vdupq_n_f32(ColorChannel<float>::half()); // offset added to both chroma outputs
+    }
+
+    void operator()(const float * src, float * dst, int n) const
+    {
+        int scn = srccn, bidx = blueIdx, i = 0;
+        const float delta = ColorChannel<float>::half();
+        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+        n *= 3; // from here on n counts output elements (3 per pixel) and i indexes dst
+
+        if (scn == 3)
+            for ( ; i <= n - 12; i += 12, src += 12) // 4 pixels per iteration
+            {
+                float32x4x3_t v_src = vld3q_f32(src), v_dst;
+                v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); // Y
+                v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); // Cr = (src[bidx^2] - Y)*c3 + delta
+                v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); // Cb = (src[bidx] - Y)*c4 + delta
+
+                vst3q_f32(dst + i, v_dst);
+            }
+        else
+            for ( ; i <= n - 12; i += 12, src += 16) // 4-channel source: 16 floats consumed per 4 pixels, val[3] unused
+            {
+                float32x4x4_t v_src = vld4q_f32(src);
+                float32x4x3_t v_dst;
+                v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
+                v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
+                v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
+
+                vst3q_f32(dst + i, v_dst);
+            }
+
+        for ( ; i < n; i += 3, src += scn) // scalar tail, one pixel per iteration
+        {
+            float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
+            float Cr = (src[bidx^2] - Y)*C3 + delta;
+            float Cb = (src[bidx] - Y)*C4 + delta;
+            dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
+        }
+    }
+    int srccn, blueIdx; // source channel count and the blue-channel index given to the constructor
+    float coeffs[5]; // scalar coefficients, used by the tail loop
+    float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; // broadcast copies for the vector loops
+};
+
+#endif
 
 template<typename _Tp> struct RGB2YCrCb_i
 {
@@ -887,6 +1113,224 @@ template<typename _Tp> struct RGB2YCrCb_i
     int coeffs[5];
 };
 
+#if CV_NEON
+
+template <> // NEON specialization of RGB2YCrCb_i for 8-bit channels, using fixed-point arithmetic scaled by 1 << yuv_shift.
+struct RGB2YCrCb_i<uchar>
+{
+    typedef uchar channel_type;
+
+    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
+        : srccn(_srccn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; // defaults: three Y weights, then the Cr and Cb scale factors
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        if (blueIdx==0) // blueIdx == 0: exchange the first and the last Y weight
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_c0 = vdup_n_s16(coeffs[0]); // NOTE(review): Y weights are placed in signed 16-bit lanes -- confirm callers never pass weights above 32767
+        v_c1 = vdup_n_s16(coeffs[1]);
+        v_c2 = vdup_n_s16(coeffs[2]);
+        v_c3 = vdupq_n_s32(coeffs[3]); // chroma factors stay 32-bit
+        v_c4 = vdupq_n_s32(coeffs[4]);
+        v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift)); // chroma offset, pre-scaled to the fixed-point domain
+        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); // rounding term added before each right shift by yuv_shift
+    }
+
+    void operator()(const uchar * src, uchar * dst, int n) const
+    {
+        int scn = srccn, bidx = blueIdx, i = 0;
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+        int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
+        n *= 3; // from here on n counts output elements (3 per pixel) and i indexes dst
+
+        for ( ; i <= n - 24; i += 24, src += scn * 8) // 8 pixels per iteration
+        {
+            uint8x8x3_t v_dst;
+            int16x8x3_t v_src16;
+
+            if (scn == 3)
+            {
+                uint8x8x3_t v_src = vld3_u8(src);
+                v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); // widen 8-bit lanes to 16 bits
+                v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
+                v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+            }
+            else
+            {
+                uint8x8x4_t v_src = vld4_u8(src); // 4-channel load; val[3] is never used
+                v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
+                v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
+                v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+            }
+
+            int16x4x3_t v_src0;
+            v_src0.val[0] = vget_low_s16(v_src16.val[0]); // first half: pixels 0..3
+            v_src0.val[1] = vget_low_s16(v_src16.val[1]);
+            v_src0.val[2] = vget_low_s16(v_src16.val[2]);
+
+            int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+            v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); // round and shift back to pixel scale
+            int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3); // (src[bidx^2] - Y)*c3 + delta
+            v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
+            int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4); // (src[bidx] - Y)*c4 + delta
+            v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
+
+            v_src0.val[0] = vget_high_s16(v_src16.val[0]); // second half: pixels 4..7, same computation
+            v_src0.val[1] = vget_high_s16(v_src16.val[1]);
+            v_src0.val[2] = vget_high_s16(v_src16.val[2]);
+
+            int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+            v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
+            int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3);
+            v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
+            int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4);
+            v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
+
+            v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1))); // saturating narrow 32 -> 16 -> unsigned 8 bits
+            v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
+            v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
+
+            vst3_u8(dst + i, v_dst); // interleaving store of the three output planes
+        }
+
+        for ( ; i < n; i += 3, src += scn) // scalar tail, one pixel per iteration
+        {
+            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
+            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
+            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
+            dst[i] = saturate_cast<uchar>(Y);
+            dst[i+1] = saturate_cast<uchar>(Cr);
+            dst[i+2] = saturate_cast<uchar>(Cb);
+        }
+    }
+    int srccn, blueIdx, coeffs[5]; // source channel count, blue-channel index, integer coefficients
+    int16x4_t v_c0, v_c1, v_c2; // Y weights broadcast to 16-bit lanes
+    int32x4_t v_c3, v_c4, v_delta, v_delta2; // chroma factors, chroma offset and rounding term
+};
+
+template <> // NEON specialization of RGB2YCrCb_i for 16-bit unsigned channels; all arithmetic is done in 32-bit lanes.
+struct RGB2YCrCb_i<ushort>
+{
+    typedef ushort channel_type;
+
+    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
+        : srccn(_srccn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; // defaults: three Y weights, then the Cr and Cb scale factors
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        if (blueIdx==0) // blueIdx == 0: exchange the first and the last Y weight
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_c0 = vdupq_n_s32(coeffs[0]);
+        v_c1 = vdupq_n_s32(coeffs[1]);
+        v_c2 = vdupq_n_s32(coeffs[2]);
+        v_c3 = vdupq_n_s32(coeffs[3]);
+        v_c4 = vdupq_n_s32(coeffs[4]);
+        v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift)); // chroma offset, pre-scaled to the fixed-point domain
+        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); // rounding term added before each right shift by yuv_shift
+    }
+
+    void operator()(const ushort * src, ushort * dst, int n) const
+    {
+        int scn = srccn, bidx = blueIdx, i = 0;
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+        int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
+        n *= 3; // from here on n counts output elements (3 per pixel) and i indexes dst
+
+        for ( ; i <= n - 24; i += 24, src += scn * 8) // 8 pixels per iteration, processed as two halves of 4
+        {
+            uint16x8x3_t v_src, v_dst;
+            int32x4x3_t v_src0;
+
+            if (scn == 3)
+                v_src = vld3q_u16(src);
+            else
+            {
+                uint16x8x4_t v_src_ = vld4q_u16(src); // 4-channel load; val[3] is dropped
+                v_src.val[0] = v_src_.val[0];
+                v_src.val[1] = v_src_.val[1];
+                v_src.val[2] = v_src_.val[2];
+            }
+
+            v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))); // widen pixels 0..3 to 32 bits
+            v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1])));
+            v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
+
+            int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+            v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); // round and shift back to pixel scale
+            int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3); // (src[bidx^2] - Y)*c3 + delta
+            v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
+            int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4); // (src[bidx] - Y)*c4 + delta
+            v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
+
+            v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))); // pixels 4..7, same computation
+            v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
+            v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
+
+            int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+            v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
+            int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3);
+            v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
+            int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4);
+            v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
+
+            v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1)); // saturating narrow to unsigned 16 bits
+            v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
+            v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
+
+            vst3q_u16(dst + i, v_dst);
+        }
+
+        for ( ; i <= n - 12; i += 12, src += scn * 4) // secondary loop: 4 pixels per iteration
+        {
+            uint16x4x3_t v_dst;
+            int32x4x3_t v_src0;
+
+            if (scn == 3)
+            {
+                uint16x4x3_t v_src = vld3_u16(src);
+                v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
+                v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
+                v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+            }
+            else
+            {
+                uint16x4x4_t v_src = vld4_u16(src); // val[3] is never used
+                v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
+                v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
+                v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+            }
+
+            int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
+            v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift);
+            int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3);
+            v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift);
+            int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4);
+            v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);
+
+            v_dst.val[0] = vqmovun_s32(v_Y);
+            v_dst.val[1] = vqmovun_s32(v_Cr);
+            v_dst.val[2] = vqmovun_s32(v_Cb);
+
+            vst3_u16(dst + i, v_dst);
+        }
+
+        for ( ; i < n; i += 3, src += scn) // scalar tail, one pixel per iteration
+        {
+            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
+            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
+            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
+            dst[i] = saturate_cast<ushort>(Y);
+            dst[i+1] = saturate_cast<ushort>(Cr);
+            dst[i+2] = saturate_cast<ushort>(Cb);
+        }
+    }
+    int srccn, blueIdx, coeffs[5]; // source channel count, blue-channel index, integer coefficients
+    int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; // broadcast coefficients, chroma offset and rounding term
+};
+
+#endif
 
 template<typename _Tp> struct YCrCb2RGB_f
 {
@@ -923,6 +1367,80 @@ template<typename _Tp> struct YCrCb2RGB_f
     float coeffs[4];
 };
 
+#if CV_NEON
+
+template <> // NEON specialization of YCrCb2RGB_f for float channels: reads (Y, Cr, Cb) triples and writes dstcn values per pixel.
+struct YCrCb2RGB_f<float>
+{
+    typedef float channel_type;
+
+    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
+        : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f}; // defaults, used when _coeffs is null
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+        v_c0 = vdupq_n_f32(coeffs[0]);
+        v_c1 = vdupq_n_f32(coeffs[1]);
+        v_c2 = vdupq_n_f32(coeffs[2]);
+        v_c3 = vdupq_n_f32(coeffs[3]);
+        v_delta = vdupq_n_f32(ColorChannel<float>::half()); // offset subtracted from Cr and Cb
+        v_alpha = vdupq_n_f32(ColorChannel<float>::max()); // value stored in the fourth output channel
+    }
+
+    void operator()(const float* src, float* dst, int n) const
+    {
+        int dcn = dstcn, bidx = blueIdx, i = 0;
+        const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
+        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+        n *= 3; // from here on n counts input elements (3 per pixel) and i indexes src
+
+        if (dcn == 3)
+            for ( ; i <= n - 12; i += 12, dst += 12) // 4 pixels per iteration, 3-channel output
+            {
+                float32x4x3_t v_src = vld3q_f32(src + i), v_dst;
+                float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
+
+                v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); // b = Y + (Cb - delta)*c3
+                v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); // g = Y + (Cb - delta)*c2 + (Cr - delta)*c1
+                v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); // r = Y + (Cr - delta)*c0
+
+                vst3q_f32(dst, v_dst);
+            }
+        else
+            for ( ; i <= n - 12; i += 12, dst += 16) // 4 pixels per iteration, 4-channel output
+            {
+                float32x4x3_t v_src = vld3q_f32(src + i);
+                float32x4x4_t v_dst;
+                float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
+
+                v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
+                v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
+                v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
+                v_dst.val[3] = v_alpha; // fourth channel filled with the constant alpha
+
+                vst4q_f32(dst, v_dst);
+            }
+
+        for ( ; i < n; i += 3, dst += dcn) // scalar tail, one pixel per iteration
+        {
+            float Y = src[i], Cr = src[i+1], Cb = src[i+2];
+
+            float b = Y + (Cb - delta)*C3;
+            float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
+            float r = Y + (Cr - delta)*C0;
+
+            dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx; // destination channel count and the index at which blue is written
+    float coeffs[4]; // scalar coefficients, used by the tail loop
+    float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; // broadcast copies for the vector loops
+};
+
+#endif
 
 template<typename _Tp> struct YCrCb2RGB_i
 {
@@ -962,20 +1480,268 @@ template<typename _Tp> struct YCrCb2RGB_i
     int coeffs[4];
 };
 
+#if CV_NEON
 
-////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
-
-static const float sRGB2XYZ_D65[] =
+template <> // NEON specialization of YCrCb2RGB_i for 8-bit channels, using fixed-point arithmetic scaled by 1 << yuv_shift.
+struct YCrCb2RGB_i<uchar>
 {
-    0.412453f, 0.357580f, 0.180423f,
-    0.212671f, 0.715160f, 0.072169f,
-    0.019334f, 0.119193f, 0.950227f
-};
+    typedef uchar channel_type;
 
-static const float XYZ2sRGB_D65[] =
-{
-    3.240479f, -1.53715f, -0.498535f,
-    -0.969256f, 1.875991f, 0.041556f,
+    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+        : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] = {22987, -11698, -5636, 29049}; // defaults, used when _coeffs is null
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+        v_c0 = vdupq_n_s32(coeffs[0]);
+        v_c1 = vdupq_n_s32(coeffs[1]);
+        v_c2 = vdupq_n_s32(coeffs[2]);
+        v_c3 = vdupq_n_s32(coeffs[3]);
+        v_delta = vdup_n_s16(ColorChannel<uchar>::half()); // offset subtracted from Cr and Cb (kept in 16-bit lanes)
+        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); // rounding term added before each right shift by yuv_shift
+        v_alpha = vdup_n_u8(ColorChannel<uchar>::max()); // value stored in the fourth output channel
+    }
+
+    void operator()(const uchar* src, uchar* dst, int n) const
+    {
+        int dcn = dstcn, bidx = blueIdx, i = 0;
+        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+        n *= 3; // from here on n counts input elements (3 per pixel) and i indexes src
+
+        for ( ; i <= n - 24; i += 24, dst += dcn * 8) // 8 pixels per iteration, processed as two halves of 4
+        {
+            uint8x8x3_t v_src = vld3_u8(src + i);
+            int16x8x3_t v_src16;
+            v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); // widen 8-bit lanes to 16 bits
+            v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
+            v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+
+            int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
+                      v_Cr = vget_low_s16(v_src16.val[1]),
+                      v_Cb = vget_low_s16(v_src16.val[2]);
+
+            int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); // (Cb - delta)*c3 in 32 bits
+            v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); // round, shift, then add Y
+            int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); // (Cr - delta)*c1 + (Cb - delta)*c2
+            v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
+            int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); // (Cr - delta)*c0
+            v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
+
+            v_Y = vget_high_s16(v_src16.val[0]); // pixels 4..7, same computation
+            v_Cr = vget_high_s16(v_src16.val[1]);
+            v_Cb = vget_high_s16(v_src16.val[2]);
+
+            int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
+            v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
+            int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
+            v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
+            int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
+            v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
+
+            uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1))); // NOTE(review): vmovn_s32 narrows without saturating; only the final vqmovun_s16 clamps -- assumes results fit in 16 bits
+            uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1)));
+            uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1)));
+
+            if (dcn == 3)
+            {
+                uint8x8x3_t v_dst;
+                v_dst.val[bidx] = v_b; // blue goes to plane bidx, red to plane bidx^2
+                v_dst.val[1] = v_g;
+                v_dst.val[bidx^2] = v_r;
+                vst3_u8(dst, v_dst);
+            }
+            else
+            {
+                uint8x8x4_t v_dst;
+                v_dst.val[bidx] = v_b;
+                v_dst.val[1] = v_g;
+                v_dst.val[bidx^2] = v_r;
+                v_dst.val[3] = v_alpha; // fourth channel filled with the constant alpha
+                vst4_u8(dst, v_dst);
+            }
+        }
+
+        for ( ; i < n; i += 3, dst += dcn) // scalar tail, one pixel per iteration
+        {
+            uchar Y = src[i];
+            uchar Cr = src[i+1];
+            uchar Cb = src[i+2];
+
+            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
+            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
+            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
+
+            dst[bidx] = saturate_cast<uchar>(b);
+            dst[1] = saturate_cast<uchar>(g);
+            dst[bidx^2] = saturate_cast<uchar>(r);
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx; // destination channel count and the index at which blue is written
+    int coeffs[4]; // integer coefficients, used by the tail loop
+
+    int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2; // broadcast coefficients and rounding term
+    int16x4_t v_delta; // broadcast chroma offset
+    uint8x8_t v_alpha; // broadcast alpha value
+};
+
+template <> // NEON specialization of YCrCb2RGB_i for 16-bit unsigned channels; all arithmetic is done in 32-bit lanes.
+struct YCrCb2RGB_i<ushort>
+{
+    typedef ushort channel_type;
+
+    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+        : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] = {22987, -11698, -5636, 29049}; // defaults, used when _coeffs is null
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+        v_c0 = vdupq_n_s32(coeffs[0]);
+        v_c1 = vdupq_n_s32(coeffs[1]);
+        v_c2 = vdupq_n_s32(coeffs[2]);
+        v_c3 = vdupq_n_s32(coeffs[3]);
+        v_delta = vdupq_n_s32(ColorChannel<ushort>::half()); // offset subtracted from Cr and Cb
+        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); // rounding term added before each right shift by yuv_shift
+        v_alpha = vdupq_n_u16(ColorChannel<ushort>::max()); // alpha for the 8-pixel loop
+        v_alpha2 = vget_low_u16(v_alpha); // 64-bit copy of alpha for the 4-pixel loop
+    }
+
+    void operator()(const ushort* src, ushort* dst, int n) const
+    {
+        int dcn = dstcn, bidx = blueIdx, i = 0;
+        const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+        n *= 3; // from here on n counts input elements (3 per pixel) and i indexes src
+
+        for ( ; i <= n - 24; i += 24, dst += dcn * 8) // 8 pixels per iteration, processed as two halves of 4
+        {
+            uint16x8x3_t v_src = vld3q_u16(src + i);
+
+            int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
+                      v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
+                      v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
+
+            int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); // (Cb - delta)*c3
+            v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); // round, shift, then add Y
+            int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); // (Cr - delta)*c1 + (Cb - delta)*c2
+            v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
+            int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta)); // (Cr - delta)*c0
+            v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
+
+            v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))), // NOTE(review): these three assignments are joined by the comma operator, not ';' -- behavior is the same
+            v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))),
+            v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
+
+            int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
+            v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
+            int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
+            v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
+            int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
+            v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
+
+            uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1)); // saturating narrow to unsigned 16 bits
+            uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1));
+            uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1));
+
+            if (dcn == 3)
+            {
+                uint16x8x3_t v_dst;
+                v_dst.val[bidx] = v_b; // blue goes to plane bidx, red to plane bidx^2
+                v_dst.val[1] = v_g;
+                v_dst.val[bidx^2] = v_r;
+                vst3q_u16(dst, v_dst);
+            }
+            else
+            {
+                uint16x8x4_t v_dst;
+                v_dst.val[bidx] = v_b;
+                v_dst.val[1] = v_g;
+                v_dst.val[bidx^2] = v_r;
+                v_dst.val[3] = v_alpha; // fourth channel filled with the constant alpha
+                vst4q_u16(dst, v_dst);
+            }
+        }
+
+        for ( ; i <= n - 12; i += 12, dst += dcn * 4) // secondary loop: 4 pixels per iteration
+        {
+            uint16x4x3_t v_src = vld3_u16(src + i);
+
+            int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
+                      v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
+                      v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+
+            int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
+            v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y);
+            int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
+            v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y);
+            int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0);
+            v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y);
+
+            uint16x4_t v_bd = vqmovun_s32(v_b);
+            uint16x4_t v_gd = vqmovun_s32(v_g);
+            uint16x4_t v_rd = vqmovun_s32(v_r);
+
+            if (dcn == 3)
+            {
+                uint16x4x3_t v_dst;
+                v_dst.val[bidx] = v_bd;
+                v_dst.val[1] = v_gd;
+                v_dst.val[bidx^2] = v_rd;
+                vst3_u16(dst, v_dst);
+            }
+            else
+            {
+                uint16x4x4_t v_dst;
+                v_dst.val[bidx] = v_bd;
+                v_dst.val[1] = v_gd;
+                v_dst.val[bidx^2] = v_rd;
+                v_dst.val[3] = v_alpha2;
+                vst4_u16(dst, v_dst);
+            }
+        }
+
+        for ( ; i < n; i += 3, dst += dcn) // scalar tail, one pixel per iteration
+        {
+            ushort Y = src[i];
+            ushort Cr = src[i+1];
+            ushort Cb = src[i+2];
+
+            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
+            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
+            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
+
+            dst[bidx] = saturate_cast<ushort>(b);
+            dst[1] = saturate_cast<ushort>(g);
+            dst[bidx^2] = saturate_cast<ushort>(r);
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx; // destination channel count and the index at which blue is written
+    int coeffs[4]; // integer coefficients, used by the tail loop
+
+    int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta; // broadcast coefficients, rounding term and chroma offset
+    uint16x8_t v_alpha; // broadcast alpha, 8 lanes
+    uint16x4_t v_alpha2; // broadcast alpha, 4 lanes
+};
+
+#endif
+
+////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
+
+static const float sRGB2XYZ_D65[] = // nine default coefficients for RGB2XYZ_f; entries [0..2], [3..5], [6..8] weight the source channels for output channels 0, 1, 2
+{
+    0.412453f, 0.357580f, 0.180423f,
+    0.212671f, 0.715160f, 0.072169f,
+    0.019334f, 0.119193f, 0.950227f
+};
+
+static const float XYZ2sRGB_D65[] = // nine coefficients, three per line like sRGB2XYZ_D65; NOTE(review): presumably the XYZ -> sRGB counterpart -- its use is outside this hunk, confirm
+{
+    3.240479f, -1.53715f, -0.498535f,
+    -0.969256f, 1.875991f, 0.041556f,
     0.055648f, -0.204043f, 1.057311f
 };
 
@@ -1013,6 +1779,78 @@ template<typename _Tp> struct RGB2XYZ_f
     float coeffs[9];
 };
 
+#if CV_NEON
+
+template <> // NEON specialization of RGB2XYZ_f for float channels
+struct RGB2XYZ_f<float> // multiplies the first three channels of every pixel by a 3x3 coefficient matrix
+{
+    typedef float channel_type;
+    // _srccn: channels per source pixel; _coeffs: 9 floats, or null to use sRGB2XYZ_D65
+    RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+    {
+        memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
+        if(blueIdx == 0) // exchange the 1st and 3rd coefficient of every row
+        {
+            std::swap(coeffs[0], coeffs[2]);
+            std::swap(coeffs[3], coeffs[5]);
+            std::swap(coeffs[6], coeffs[8]);
+        }
+        // broadcast each coefficient across a 4-lane vector for the NEON loops
+        v_c0 = vdupq_n_f32(coeffs[0]);
+        v_c1 = vdupq_n_f32(coeffs[1]);
+        v_c2 = vdupq_n_f32(coeffs[2]);
+        v_c3 = vdupq_n_f32(coeffs[3]);
+        v_c4 = vdupq_n_f32(coeffs[4]);
+        v_c5 = vdupq_n_f32(coeffs[5]);
+        v_c6 = vdupq_n_f32(coeffs[6]);
+        v_c7 = vdupq_n_f32(coeffs[7]);
+        v_c8 = vdupq_n_f32(coeffs[8]);
+    }
+    // Converts n pixels: reads srccn floats per pixel from src, writes 3 floats (X, Y, Z) per pixel to dst.
+    void operator()(const float* src, float* dst, int n) const
+    {
+        int scn = srccn, i = 0;
+        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+        // from here on n counts output floats (3 per pixel)
+        n *= 3;
+        // NEON loops: 4 pixels per iteration; any scn other than 3 is read as 4 interleaved channels
+        if (scn == 3)
+            for ( ; i <= n - 12; i += 12, src += 12)
+            {
+                float32x4x3_t v_src = vld3q_f32(src), v_dst;
+                v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
+                v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
+                v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
+                vst3q_f32(dst + i, v_dst);
+            }
+        else
+            for ( ; i <= n - 12; i += 12, src += 16)
+            {
+                float32x4x4_t v_src = vld4q_f32(src); // the 4th channel is loaded but not used
+                float32x4x3_t v_dst;
+                v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
+                v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
+                v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
+                vst3q_f32(dst + i, v_dst);
+            }
+        // scalar tail for the pixels the vector loops did not cover
+        for ( ; i < n; i += 3, src += scn)
+        {
+            float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
+            float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
+            float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
+            dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
+        }
+    }
+    // coeffs holds the (possibly swapped) matrix; v_c0..v_c8 are its per-element broadcasts
+    int srccn;
+    float coeffs[9];
+    float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+};
+
+#endif
 
 template<typename _Tp> struct RGB2XYZ_i
 {
@@ -1055,6 +1893,247 @@ template<typename _Tp> struct RGB2XYZ_i
     int coeffs[9];
 };
 
+#if CV_NEON
+
+template <> // NEON specialization of RGB2XYZ_i for 8-bit channels (fixed-point arithmetic)
+struct RGB2XYZ_i<uchar>
+{
+    typedef uchar channel_type;
+    // _srccn: channels per source pixel; _coeffs: 9 floats scaled by 2^xyz_shift and rounded, or null for coeffs0
+    RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+    {
+        static const int coeffs0[] =
+        {
+            1689,    1465,    739,
+            871,     2929,    296,
+            79,      488,     3892
+        };
+        for( int i = 0; i < 9; i++ )
+            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
+        if(blueIdx == 0) // exchange the 1st and 3rd coefficient of every row
+        {
+            std::swap(coeffs[0], coeffs[2]);
+            std::swap(coeffs[3], coeffs[5]);
+            std::swap(coeffs[6], coeffs[8]);
+        }
+        // NOTE(review): the int coefficients are narrowed to uint16 lanes here -- confirm callers never pass negative or > 65535 values
+        v_c0 = vdup_n_u16(coeffs[0]);
+        v_c1 = vdup_n_u16(coeffs[1]);
+        v_c2 = vdup_n_u16(coeffs[2]);
+        v_c3 = vdup_n_u16(coeffs[3]);
+        v_c4 = vdup_n_u16(coeffs[4]);
+        v_c5 = vdup_n_u16(coeffs[5]);
+        v_c6 = vdup_n_u16(coeffs[6]);
+        v_c7 = vdup_n_u16(coeffs[7]);
+        v_c8 = vdup_n_u16(coeffs[8]);
+        v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); // rounding term added before the >> xyz_shift
+    }
+    void operator()(const uchar * src, uchar * dst, int n) const // converts n pixels; dst receives 3 bytes per pixel
+    {
+        int scn = srccn, i = 0;
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+        n *= 3; // from here on n counts output bytes (3 per pixel)
+        // NEON loop: 8 pixels (24 output bytes) per iteration
+        for ( ; i <= n - 24; i += 24, src += scn * 8)
+        {
+            uint8x8x3_t v_dst;
+            uint16x8x3_t v_src16;
+            // de-interleave the channels and widen them to 16 bits; any scn other than 3 is read as 4 channels
+            if (scn == 3)
+            {
+                uint8x8x3_t v_src = vld3_u8(src);
+                v_src16.val[0] = vmovl_u8(v_src.val[0]);
+                v_src16.val[1] = vmovl_u8(v_src.val[1]);
+                v_src16.val[2] = vmovl_u8(v_src.val[2]);
+            }
+            else
+            {
+                uint8x8x4_t v_src = vld4_u8(src); // the 4th channel is loaded but not used
+                v_src16.val[0] = vmovl_u8(v_src.val[0]);
+                v_src16.val[1] = vmovl_u8(v_src.val[1]);
+                v_src16.val[2] = vmovl_u8(v_src.val[2]);
+            }
+            // low 4 pixels: widening multiply-accumulate into 32 bits, then round and shift right by xyz_shift
+            uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]),
+                       v_s1 = vget_low_u16(v_src16.val[1]),
+                       v_s2 = vget_low_u16(v_src16.val[2]);
+
+            uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
+            v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
+            v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
+            // high 4 pixels: same computation
+            v_s0 = vget_high_u16(v_src16.val[0]),
+            v_s1 = vget_high_u16(v_src16.val[1]),
+            v_s2 = vget_high_u16(v_src16.val[2]);
+
+            uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
+            v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
+            v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
+            // narrow 32 -> 16 bits (vmovn, non-saturating), then 16 -> 8 bits (vqmovn, saturating)
+            v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
+            v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
+            v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));
+
+            vst3_u8(dst + i, v_dst);
+        }
+        // scalar tail for the pixels the vector loop did not cover
+        for ( ; i < n; i += 3, src += scn)
+        {
+            int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
+            int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
+            int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
+            dst[i] = saturate_cast<uchar>(X);
+            dst[i+1] = saturate_cast<uchar>(Y);
+            dst[i+2] = saturate_cast<uchar>(Z);
+        }
+    }
+    // coeffs holds the fixed-point matrix; v_c0..v_c8 are its 16-bit broadcasts
+    int srccn, coeffs[9];
+    uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+    uint32x4_t v_delta;
+};
+
+template <> // NEON specialization of RGB2XYZ_i for 16-bit channels (fixed-point arithmetic)
+struct RGB2XYZ_i<ushort>
+{
+    typedef ushort channel_type;
+    // _srccn: channels per source pixel; _coeffs: 9 floats scaled by 2^xyz_shift and rounded, or null for coeffs0
+    RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+    {
+        static const int coeffs0[] =
+        {
+            1689,    1465,    739,
+            871,     2929,    296,
+            79,      488,     3892
+        };
+        for( int i = 0; i < 9; i++ )
+            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
+        if(blueIdx == 0) // exchange the 1st and 3rd coefficient of every row
+        {
+            std::swap(coeffs[0], coeffs[2]);
+            std::swap(coeffs[3], coeffs[5]);
+            std::swap(coeffs[6], coeffs[8]);
+        }
+        // NOTE(review): the int coefficients are narrowed to uint16 lanes here -- confirm callers never pass negative or > 65535 values
+        v_c0 = vdup_n_u16(coeffs[0]);
+        v_c1 = vdup_n_u16(coeffs[1]);
+        v_c2 = vdup_n_u16(coeffs[2]);
+        v_c3 = vdup_n_u16(coeffs[3]);
+        v_c4 = vdup_n_u16(coeffs[4]);
+        v_c5 = vdup_n_u16(coeffs[5]);
+        v_c6 = vdup_n_u16(coeffs[6]);
+        v_c7 = vdup_n_u16(coeffs[7]);
+        v_c8 = vdup_n_u16(coeffs[8]);
+        v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); // rounding term added before the >> xyz_shift
+    }
+    // Converts n pixels: reads srccn values per pixel from src, writes 3 values (X, Y, Z) per pixel to dst.
+    void operator()(const ushort * src, ushort * dst, int n) const
+    {
+        int scn = srccn, i = 0;
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+        n *= 3; // from here on n counts output values (3 per pixel)
+        // first NEON loop: 8 pixels (24 output values) per iteration
+        for ( ; i <= n - 24; i += 24, src += scn * 8)
+        {
+            uint16x8x3_t v_src, v_dst;
+            // de-interleave the channels; any scn other than 3 is read as 4 channels, the 4th being dropped
+            if (scn == 3)
+                v_src = vld3q_u16(src);
+            else
+            {
+                uint16x8x4_t v_src4 = vld4q_u16(src);
+                v_src.val[0] = v_src4.val[0];
+                v_src.val[1] = v_src4.val[1];
+                v_src.val[2] = v_src4.val[2];
+            }
+            // low 4 pixels: widening multiply-accumulate into 32 bits, then round and shift right by xyz_shift
+            uint16x4_t v_s0 = vget_low_u16(v_src.val[0]),
+                       v_s1 = vget_low_u16(v_src.val[1]),
+                       v_s2 = vget_low_u16(v_src.val[2]);
+
+            uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
+            v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
+            v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
+            // high 4 pixels: same computation
+            v_s0 = vget_high_u16(v_src.val[0]),
+            v_s1 = vget_high_u16(v_src.val[1]),
+            v_s2 = vget_high_u16(v_src.val[2]);
+
+            uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
+            v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
+            v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
+            // saturating narrow 32 -> 16 bits, then store interleaved
+            v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
+            v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
+            v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));
+
+            vst3q_u16(dst + i, v_dst);
+        }
+        // second NEON loop: 4 pixels (12 output values) per iteration
+        for ( ; i <= n - 12; i += 12, src += scn * 4)
+        {
+            uint16x4x3_t v_dst;
+            uint16x4_t v_s0, v_s1, v_s2;
+
+            if (scn == 3)
+            {
+                uint16x4x3_t v_src = vld3_u16(src);
+                v_s0 = v_src.val[0];
+                v_s1 = v_src.val[1];
+                v_s2 = v_src.val[2];
+            }
+            else
+            {
+                uint16x4x4_t v_src = vld4_u16(src); // the 4th channel is loaded but not used
+                v_s0 = v_src.val[0];
+                v_s1 = v_src.val[1];
+                v_s2 = v_src.val[2];
+            }
+
+            uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            // round, shift right by xyz_shift and narrow with saturation
+            v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift));
+            v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift));
+            v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift));
+
+            vst3_u16(dst + i, v_dst);
+        }
+        // scalar tail for the pixels the vector loops did not cover
+        for ( ; i < n; i += 3, src += scn)
+        {
+            int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
+            int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
+            int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
+            dst[i] = saturate_cast<ushort>(X);
+            dst[i+1] = saturate_cast<ushort>(Y);
+            dst[i+2] = saturate_cast<ushort>(Z);
+        }
+    }
+    // coeffs holds the fixed-point matrix; v_c0..v_c8 are its 16-bit broadcasts
+    int srccn, coeffs[9];
+    uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+    uint32x4_t v_delta;
+};
+
+#endif
 
 template<typename _Tp> struct XYZ2RGB_f
 {
@@ -1141,6 +2220,278 @@ template<typename _Tp> struct XYZ2RGB_i
     int coeffs[9];
 };
 
+#if CV_NEON
+
+template <> // NEON specialization of XYZ2RGB_i for 8-bit channels (fixed-point arithmetic)
+struct XYZ2RGB_i<uchar>
+{
+    typedef uchar channel_type;
+    // _dstcn: channels written per destination pixel; _coeffs: 9 ints multiplied by 2^xyz_shift, or null for coeffs0
+    XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+    : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] =
+        {
+            13273,  -6296,  -2042,
+            -3970,   7684,    170,
+              228,   -836,   4331
+        };
+        for(int i = 0; i < 9; i++)
+            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
+
+        if(blueIdx == 0) // exchange the 1st and 3rd rows of the matrix
+        {
+            std::swap(coeffs[0], coeffs[6]);
+            std::swap(coeffs[1], coeffs[7]);
+            std::swap(coeffs[2], coeffs[8]);
+        }
+        // NOTE(review): the int coefficients are narrowed to int16 lanes here -- confirm callers only pass values that fit
+        v_c0 = vdup_n_s16(coeffs[0]);
+        v_c1 = vdup_n_s16(coeffs[1]);
+        v_c2 = vdup_n_s16(coeffs[2]);
+        v_c3 = vdup_n_s16(coeffs[3]);
+        v_c4 = vdup_n_s16(coeffs[4]);
+        v_c5 = vdup_n_s16(coeffs[5]);
+        v_c6 = vdup_n_s16(coeffs[6]);
+        v_c7 = vdup_n_s16(coeffs[7]);
+        v_c8 = vdup_n_s16(coeffs[8]);
+        v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); // rounding term added before the >> xyz_shift
+        v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max())); // alpha lanes stored as the 4th output channel
+    }
+    // Converts n pixels: reads 3 bytes per pixel from src, writes dstcn bytes per pixel to dst.
+    void operator()(const uchar* src, uchar* dst, int n) const
+    {
+        int dcn = dstcn, i = 0;
+        uchar alpha = ColorChannel<uchar>::max();
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+        n *= 3; // from here on n counts source bytes (3 per pixel)
+        // NEON loop: 8 pixels (24 source bytes) per iteration
+        for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+        {
+            uint8x8x3_t v_src = vld3_u8(src + i);
+            int16x8x3_t v_src16;
+            v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
+            v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
+            v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
+            // low 4 pixels: signed widening multiply-accumulate into 32 bits, then round and shift right by xyz_shift
+            int16x4_t v_s0 = vget_low_s16(v_src16.val[0]),
+                       v_s1 = vget_low_s16(v_src16.val[1]),
+                       v_s2 = vget_low_s16(v_src16.val[2]);
+
+            int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
+            v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
+            v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
+            // high 4 pixels: same computation
+            v_s0 = vget_high_s16(v_src16.val[0]),
+            v_s1 = vget_high_s16(v_src16.val[1]),
+            v_s2 = vget_high_s16(v_src16.val[2]);
+
+            int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
+            v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
+            v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
+            // saturating narrow: signed 32 -> signed 16 -> unsigned 8 bits
+            uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1)));
+            uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
+            uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1)));
+            // any dcn other than 3 is stored as 4 interleaved channels with v_alpha as the last one
+            if (dcn == 3)
+            {
+                uint8x8x3_t v_dst;
+                v_dst.val[0] = v_b;
+                v_dst.val[1] = v_g;
+                v_dst.val[2] = v_r;
+                vst3_u8(dst, v_dst);
+            }
+            else
+            {
+                uint8x8x4_t v_dst;
+                v_dst.val[0] = v_b;
+                v_dst.val[1] = v_g;
+                v_dst.val[2] = v_r;
+                v_dst.val[3] = v_alpha;
+                vst4_u8(dst, v_dst);
+            }
+        }
+        // scalar tail for the pixels the vector loop did not cover
+        for ( ; i < n; i += 3, dst += dcn)
+        {
+            int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
+            int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
+            int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
+            dst[0] = saturate_cast<uchar>(B); dst[1] = saturate_cast<uchar>(G);
+            dst[2] = saturate_cast<uchar>(R);
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx;
+    int coeffs[9];
+    // NEON copies of the coefficients, the alpha value and the rounding term
+    int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+    uint8x8_t v_alpha;
+    int32x4_t v_delta;
+};
+
+template <> // NEON specialization of XYZ2RGB_i for 16-bit channels (fixed-point arithmetic)
+struct XYZ2RGB_i<ushort>
+{
+    typedef ushort channel_type;
+    // _dstcn: channels written per destination pixel; _coeffs: 9 ints multiplied by 2^xyz_shift, or null for coeffs0
+    XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+    : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] =
+        {
+            13273,  -6296,  -2042,
+            -3970,   7684,    170,
+              228,   -836,   4331
+        };
+        for(int i = 0; i < 9; i++)
+            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
+
+        if(blueIdx == 0) // exchange the 1st and 3rd rows of the matrix
+        {
+            std::swap(coeffs[0], coeffs[6]);
+            std::swap(coeffs[1], coeffs[7]);
+            std::swap(coeffs[2], coeffs[8]);
+        }
+        // coefficients are kept in 32-bit lanes because the products below are formed in 32 bits
+        v_c0 = vdupq_n_s32(coeffs[0]);
+        v_c1 = vdupq_n_s32(coeffs[1]);
+        v_c2 = vdupq_n_s32(coeffs[2]);
+        v_c3 = vdupq_n_s32(coeffs[3]);
+        v_c4 = vdupq_n_s32(coeffs[4]);
+        v_c5 = vdupq_n_s32(coeffs[5]);
+        v_c6 = vdupq_n_s32(coeffs[6]);
+        v_c7 = vdupq_n_s32(coeffs[7]);
+        v_c8 = vdupq_n_s32(coeffs[8]);
+        v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); // rounding term added before the >> xyz_shift
+        v_alpha = vdupq_n_u16(ColorChannel<ushort>::max()); // alpha lanes for the 8-pixel loop
+        v_alpha2 = vget_low_u16(v_alpha); // alpha lanes for the 4-pixel loop
+    }
+    // Converts n pixels: reads 3 values per pixel from src, writes dstcn values per pixel to dst.
+    void operator()(const ushort* src, ushort* dst, int n) const
+    {
+        int dcn = dstcn, i = 0;
+        ushort alpha = ColorChannel<ushort>::max();
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+        n *= 3; // from here on n counts source values (3 per pixel)
+        // first NEON loop: 8 pixels (24 source values) per iteration
+        for ( ; i <= n - 24; i += 24, dst += dcn * 8)
+        {
+            uint16x8x3_t v_src = vld3q_u16(src + i);
+            int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
+                      v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
+                      v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
+            // low 4 pixels: 32-bit multiply-accumulate, then round and shift right by xyz_shift
+            int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
+            v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
+            v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
+            // high 4 pixels: same computation
+            v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
+            v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
+            v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
+
+            int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
+            v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
+            v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
+            // saturating narrow: signed 32 -> unsigned 16 bits
+            uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
+            uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
+            uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
+            // any dcn other than 3 is stored as 4 interleaved channels with v_alpha as the last one
+            if (dcn == 3)
+            {
+                uint16x8x3_t v_dst;
+                v_dst.val[0] = v_b;
+                v_dst.val[1] = v_g;
+                v_dst.val[2] = v_r;
+                vst3q_u16(dst, v_dst);
+            }
+            else
+            {
+                uint16x8x4_t v_dst;
+                v_dst.val[0] = v_b;
+                v_dst.val[1] = v_g;
+                v_dst.val[2] = v_r;
+                v_dst.val[3] = v_alpha;
+                vst4q_u16(dst, v_dst);
+            }
+        }
+        // second NEON loop: 4 pixels (12 source values) per iteration
+        for ( ; i <= n - 12; i += 12, dst += dcn * 4)
+        {
+            uint16x4x3_t v_src = vld3_u16(src + i);
+            int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
+                      v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
+                      v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
+
+            int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
+            int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
+            int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
+            v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
+            v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
+            v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
+
+            uint16x4_t v_b = vqmovun_s32(v_X);
+            uint16x4_t v_g = vqmovun_s32(v_Y);
+            uint16x4_t v_r = vqmovun_s32(v_Z);
+
+            if (dcn == 3)
+            {
+                uint16x4x3_t v_dst;
+                v_dst.val[0] = v_b;
+                v_dst.val[1] = v_g;
+                v_dst.val[2] = v_r;
+                vst3_u16(dst, v_dst);
+            }
+            else
+            {
+                uint16x4x4_t v_dst;
+                v_dst.val[0] = v_b;
+                v_dst.val[1] = v_g;
+                v_dst.val[2] = v_r;
+                v_dst.val[3] = v_alpha2;
+                vst4_u16(dst, v_dst);
+            }
+        }
+        // scalar tail for the pixels the vector loops did not cover
+        for ( ; i < n; i += 3, dst += dcn)
+        {
+            int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
+            int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
+            int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
+            dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
+            dst[2] = saturate_cast<ushort>(R);
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx;
+    int coeffs[9];
+    // NEON copies of the coefficients and rounding term, plus 4- and 8-lane alpha vectors
+    int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
+    uint16x4_t v_alpha2;
+    uint16x8_t v_alpha;
+};
+
+#endif
 
 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
 
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 861b7ae..f4c2cf2 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -3580,14 +3580,6 @@ public:
                                     _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
                                 }
                             }
-                        #elif CV_NEON
-                            for( ; x1 <= bcols - 4; x1 += 4 )
-                            {
-                                int32x4_t v_sx = cv_vrndq_s32_f32(vld1q_f32(sX + x1)),
-                                          v_sy = cv_vrndq_s32_f32(vld1q_f32(sY + x1));
-                                int16x4x2_t v_dst = vzip_s16(vqmovn_s32(v_sx), vqmovn_s32(v_sy));
-                                vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
-                            }
                         #endif
 
                             for( ; x1 < bcols; x1++ )
-- 
2.7.4