modules/imgproc/src/color.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 /********************************* COPYRIGHT NOTICE *******************************\
  44   The function for RGB to Lab conversion is based on the MATLAB script
  45   RGB2Lab.m translated by Mark Ruzon from C code by Yossi Rubner, 23 September 1997.
  46   See the page [http://vision.stanford.edu/~ruzon/software/rgblab.html]
  47 \**********************************************************************************/
  48
  49 /********************************* COPYRIGHT NOTICE *******************************\
  50   Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
  51   from MD-Mathematische Dienste GmbH. Below is the copyright notice:
  52
  53     IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
  54     By downloading, copying, installing or using the software you agree
  55     to this license. If you do not agree to this license, do not download,
  56     install, copy or use the software.
  57
  58     Contributors License Agreement:
  59
  60       Copyright (c) 2002,
  61       MD-Mathematische Dienste GmbH
  62       Im Defdahl 5-10
  63       44141 Dortmund
  64       Germany
  65       www.md-it.de
  66
  67     Redistribution and use in source and binary forms,
  68     with or without modification, are permitted provided
  69     that the following conditions are met:
  70
  71     Redistributions of source code must retain
  72     the above copyright notice, this list of conditions and the following disclaimer.
  73     Redistributions in binary form must reproduce the above copyright notice,
  74     this list of conditions and the following disclaimer in the documentation
  75     and/or other materials provided with the distribution.
  76     The name of Contributor may not be used to endorse or promote products
  77     derived from this software without specific prior written permission.
  78
  79     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  80     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  81     THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  82     PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
  83     FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  84     DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  85     OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  86     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  87     STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  88     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  89     THE POSSIBILITY OF SUCH DAMAGE.
  90 \**********************************************************************************/
  91
  92 #include "precomp.hpp"
  93 #include "opencl_kernels_imgproc.hpp"
  94 #include <limits>
  95
  // Rounding right shift: adds 1 << (n-1), i.e. half of 2^n, to x and then shifts right by n bits.
  // n appears twice in the expansion, so do not pass an expression with side effects for it.
  96 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
  97
  // IPP-only constants. The MAX_IPP* values are the per-type constants passed as the last
  // argument of the ippiSwapChannels_*_C3C4R calls made by the wrapper functions in this file.
  98 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
  99 #define MAX_IPP8u   255
 100 #define MAX_IPP16u  65535
 101 #define MAX_IPP32f  1.0
     // ippInit() is invoked while this translation unit's statics are initialized; the returned
     // status is only stored here.
 102 static IppStatus sts = ippInit();
 103 #endif
 104
 105 namespace cv
 106 {
 107
 108 // computes cubic spline coefficients for a function: (xi=i, yi=f[i]), i=0..n
 109 template<typename _Tp> static void splineBuild(const _Tp* f, int n, _Tp* tab)
 110 {
 111     _Tp cn = 0;
 112     int i;
 113     tab[0] = tab[1] = (_Tp)0;
 114
 115     for(i = 1; i < n-1; i++)
 116     {
 117         _Tp t = 3*(f[i+1] - 2*f[i] + f[i-1]);
 118         _Tp l = 1/(4 - tab[(i-1)*4]);
 119         tab[i*4] = l; tab[i*4+1] = (t - tab[(i-1)*4+1])*l;
 120     }
 121
 122     for(i = n-1; i >= 0; i--)
 123     {
 124         _Tp c = tab[i*4+1] - tab[i*4]*cn;
 125         _Tp b = f[i+1] - f[i] - (cn + c*2)*(_Tp)0.3333333333333333;
 126         _Tp d = (cn - c)*(_Tp)0.3333333333333333;
 127         tab[i*4] = f[i]; tab[i*4+1] = b;
 128         tab[i*4+2] = c; tab[i*4+3] = d;
 129         cn = c;
 130     }
 131 }
 132
 133 // interpolates value of a function at x, 0 <= x <= n using a cubic spline.
 // tab holds four coefficients per segment, in the layout written by splineBuild; n is the
 // number of segments. The segment index int(x) is clamped to [0, n-1], so an x outside
 // [0, n] is evaluated on the first or last segment's cubic instead of being rejected.
 134 template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab, int n)
 135 {
 136     // don't touch this function without urgent need - some versions of gcc fail to inline it correctly
 137     int ix = std::min(std::max(int(x), 0), n-1);
         // x becomes the offset inside the selected segment, tab the start of its coefficients
 138     x -= ix;
 139     tab += ix*4;
         // Horner form of tab[0] + tab[1]*x + tab[2]*x^2 + tab[3]*x^3
 140     return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
 141 }
 142
 143
 144 template<typename _Tp> struct ColorChannel
 145 {
 146     typedef float worktype_f;
 147     static _Tp max() { return std::numeric_limits<_Tp>::max(); }
 148     static _Tp half() { return (_Tp)(max()/2 + 1); }
 149 };
 150
 151 template<> struct ColorChannel<float>
 152 {
 153     typedef float worktype_f;
 154     static float max() { return 1.f; }
 155     static float half() { return 0.5f; }
 156 };
 157
 158 /*template<> struct ColorChannel<double>
 159 {
 160     typedef double worktype_f;
 161     static double max() { return 1.; }
 162     static double half() { return 0.5; }
 163 };*/
 164
 165
 166 ///////////////////////////// Top-level template function ////////////////////////////////
 167
 168 template <typename Cvt>
 169 class CvtColorLoop_Invoker : public ParallelLoopBody
 170 {
 171     typedef typename Cvt::channel_type _Tp;
 172 public:
 173
 174     CvtColorLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt) :
 175         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt)
 176     {
 177     }
 178
 179     virtual void operator()(const Range& range) const
 180     {
 181         const uchar* yS = src.ptr<uchar>(range.start);
 182         uchar* yD = dst.ptr<uchar>(range.start);
 183
 184         for( int i = range.start; i < range.end; ++i, yS += src.step, yD += dst.step )
 185             cvt((const _Tp*)yS, (_Tp*)yD, src.cols);
 186     }
 187
 188 private:
 189     const Mat& src;
 190     Mat& dst;
 191     const Cvt& cvt;
 192
 193     const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
 194 };
 195
 196 template <typename Cvt>
 197 void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
 198 {
 199     parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
 200 }
 201
 202 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 203
 // Common signatures for the IPP entry points kept in the dispatch tables of this file.
 // The functors in this file call all three as (src, srcStep, dst, dstStep, roiSize, ...).
 // ippiReorderFunc: the trailing argument is the channel order array.
 204 typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *);
 // ippiGeneralFunc: no trailing argument.
 205 typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize);
 // ippiColor2GrayFunc: the trailing argument is the array of Ipp32f coefficients.
 206 typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *);
 207
 208 template <typename Cvt>
 209 class CvtColorIPPLoop_Invoker :
 210         public ParallelLoopBody
 211 {
 212 public:
 213
 214     CvtColorIPPLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt, bool *_ok) :
 215         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt), ok(_ok)
 216     {
 217         *ok = true;
 218     }
 219
 220     virtual void operator()(const Range& range) const
 221     {
 222         const void *yS = src.ptr<uchar>(range.start);
 223         void *yD = dst.ptr<uchar>(range.start);
 224         if( !cvt(yS, (int)src.step[0], yD, (int)dst.step[0], src.cols, range.end - range.start) )
 225             *ok = false;
 226         else
 227         {
 228             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
 229         }
 230     }
 231
 232 private:
 233     const Mat& src;
 234     Mat& dst;
 235     const Cvt& cvt;
 236     bool *ok;
 237
 238     const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
 239 };
 240
 241 template <typename Cvt>
 242 bool CvtColorIPPLoop(const Mat& src, Mat& dst, const Cvt& cvt)
 243 {
 244     bool ok;
 245     parallel_for_(Range(0, src.rows), CvtColorIPPLoop_Invoker<Cvt>(src, dst, cvt, &ok), src.total()/(double)(1<<16) );
 246     return ok;
 247 }
 248
 249 template <typename Cvt>
 250 bool CvtColorIPPLoopCopy(Mat& src, Mat& dst, const Cvt& cvt)
 251 {
 252     Mat temp;
 253     Mat &source = src;
 254     if( src.data == dst.data )
 255     {
 256         src.copyTo(temp);
 257         source = temp;
 258     }
 259     bool ok;
 260     parallel_for_(Range(0, source.rows), CvtColorIPPLoop_Invoker<Cvt>(source, dst, cvt, &ok),
 261                   source.total()/(double)(1<<16) );
 262     return ok;
 263 }
 264
 265 static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep,
 266          IppiSize roiSize, const int *dstOrder)
 267 {
 268     return ippiSwapChannels_8u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u);
 269 }
 270
 271 static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep,
 272          IppiSize roiSize, const int *dstOrder)
 273 {
 274     return ippiSwapChannels_16u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u);
 275 }
 276
 277 static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep,
 278          IppiSize roiSize, const int *dstOrder)
 279 {
 280     return ippiSwapChannels_32f_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f);
 281 }
 282
 283 static ippiReorderFunc ippiSwapChannelsC3C4RTab[] =
 284 {
 285     (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0,
 286     0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0
 287 };
 288
 289 static ippiGeneralFunc ippiCopyAC4C3RTab[] =
 290 {
 291     (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0,
 292     0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0
 293 };
 294
 295 static ippiReorderFunc ippiSwapChannelsC4C3RTab[] =
 296 {
 297     (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0,
 298     0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0
 299 };
 300
 301 static ippiReorderFunc ippiSwapChannelsC3RTab[] =
 302 {
 303     (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0,
 304     0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0
 305 };
 306
 307 #if IPP_VERSION_X100 >= 801
 308 static ippiReorderFunc ippiSwapChannelsC4RTab[] =
 309 {
 310     (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0,
 311     0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0
 312 };
 313 #endif
 314
 315 static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
 316 {
 317     (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
 318     0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
 319 };
 320
 321 static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
 322 {
 323     (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
 324     0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
 325 };
 326
 327 static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
 328 {
 329     (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
 330     0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
 331 };
 332
 333 static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
 334 {
 335     (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
 336     0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
 337 };
 338
 339 static ippiGeneralFunc ippiCopyP3C3RTab[] =
 340 {
 341     (ippiGeneralFunc)ippiCopy_8u_P3C3R, 0, (ippiGeneralFunc)ippiCopy_16u_P3C3R, 0,
 342     0, (ippiGeneralFunc)ippiCopy_32f_P3C3R, 0, 0
 343 };
 344
 345 static ippiGeneralFunc ippiRGB2XYZTab[] =
 346 {
 347     (ippiGeneralFunc)ippiRGBToXYZ_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToXYZ_16u_C3R, 0,
 348     0, (ippiGeneralFunc)ippiRGBToXYZ_32f_C3R, 0, 0
 349 };
 350
 351 static ippiGeneralFunc ippiXYZ2RGBTab[] =
 352 {
 353     (ippiGeneralFunc)ippiXYZToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiXYZToRGB_16u_C3R, 0,
 354     0, (ippiGeneralFunc)ippiXYZToRGB_32f_C3R, 0, 0
 355 };
 356
 357 static ippiGeneralFunc ippiRGB2HSVTab[] =
 358 {
 359     (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0,
 360     0, 0, 0, 0
 361 };
 362
 363 static ippiGeneralFunc ippiHSV2RGBTab[] =
 364 {
 365     (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0,
 366     0, 0, 0, 0
 367 };
 368
 369 static ippiGeneralFunc ippiRGB2HLSTab[] =
 370 {
 371     (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0,
 372     0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0
 373 };
 374
 375 static ippiGeneralFunc ippiHLS2RGBTab[] =
 376 {
 377     (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0,
 378     0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0
 379 };
 380
 381 #if !defined(HAVE_IPP_ICV_ONLY) && 0
 382 static ippiGeneralFunc ippiRGBToLUVTab[] =
 383 {
 384     (ippiGeneralFunc)ippiRGBToLUV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToLUV_16u_C3R, 0,
 385     0, (ippiGeneralFunc)ippiRGBToLUV_32f_C3R, 0, 0
 386 };
 387
 388 static ippiGeneralFunc ippiLUVToRGBTab[] =
 389 {
 390     (ippiGeneralFunc)ippiLUVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiLUVToRGB_16u_C3R, 0,
 391     0, (ippiGeneralFunc)ippiLUVToRGB_32f_C3R, 0, 0
 392 };
 393 #endif
 394
 395 struct IPPGeneralFunctor
 396 {
 397     IPPGeneralFunctor(ippiGeneralFunc _func) : func(_func){}
 398     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 399     {
 400         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false;
 401     }
 402 private:
 403     ippiGeneralFunc func;
 404 };
 405
 406 struct IPPReorderFunctor
 407 {
 408     IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : func(_func)
 409     {
 410         order[0] = _order0;
 411         order[1] = _order1;
 412         order[2] = _order2;
 413         order[3] = 3;
 414     }
 415     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 416     {
 417         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false;
 418     }
 419 private:
 420     ippiReorderFunc func;
 421     int order[4];
 422 };
 423
 424 struct IPPColor2GrayFunctor
 425 {
 426     IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
 427         func(_func)
 428     {
 429         coeffs[0] = 0.114f;
 430         coeffs[1] = 0.587f;
 431         coeffs[2] = 0.299f;
 432     }
 433     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 434     {
 435         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false;
 436     }
 437 private:
 438     ippiColor2GrayFunc func;
 439     Ipp32f coeffs[3];
 440 };
 441
 442 struct IPPGray2BGRFunctor
 443 {
 444     IPPGray2BGRFunctor(ippiGeneralFunc _func) :
 445         func(_func)
 446     {
 447     }
 448
 449     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 450     {
 451         if (func == 0)
 452             return false;
 453
 454         const void* srcarray[3] = { src, src, src };
 455         return func(srcarray, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0;
 456     }
 457 private:
 458     ippiGeneralFunc func;
 459 };
 460
 461 struct IPPGray2BGRAFunctor
 462 {
 463     IPPGray2BGRAFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _depth) :
 464         func1(_func1), func2(_func2), depth(_depth)
 465     {
 466     }
 467
 468     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 469     {
 470         if (func1 == 0 || func2 == 0)
 471             return false;
 472
 473         const void* srcarray[3] = { src, src, src };
 474         Mat temp(rows, cols, CV_MAKETYPE(depth, 3));
 475         if(func1(srcarray, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
 476             return false;
 477         int order[4] = {0, 1, 2, 3};
 478         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
 479     }
 480 private:
 481     ippiGeneralFunc func1;
 482     ippiReorderFunc func2;
 483     int depth;
 484 };
 485
 486 struct IPPReorderGeneralFunctor
 487 {
 488     IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
 489         func1(_func1), func2(_func2), depth(_depth)
 490     {
 491         order[0] = _order0;
 492         order[1] = _order1;
 493         order[2] = _order2;
 494         order[3] = 3;
 495     }
 496     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 497     {
 498         if (func1 == 0 || func2 == 0)
 499             return false;
 500
 501         Mat temp;
 502         temp.create(rows, cols, CV_MAKETYPE(depth, 3));
 503         if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0)
 504             return false;
 505         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
 506     }
 507 private:
 508     ippiReorderFunc func1;
 509     ippiGeneralFunc func2;
 510     int order[4];
 511     int depth;
 512 };
 513
 514 struct IPPGeneralReorderFunctor
 515 {
 516     IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
 517         func1(_func1), func2(_func2), depth(_depth)
 518     {
 519         order[0] = _order0;
 520         order[1] = _order1;
 521         order[2] = _order2;
 522         order[3] = 3;
 523     }
 524     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 525     {
 526         if (func1 == 0 || func2 == 0)
 527             return false;
 528
 529         Mat temp;
 530         temp.create(rows, cols, CV_MAKETYPE(depth, 3));
 531         if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
 532             return false;
 533         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
 534     }
 535 private:
 536     ippiGeneralFunc func1;
 537     ippiReorderFunc func2;
 538     int order[4];
 539     int depth;
 540 };
 541
 542 #endif
 543
 544 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
 545
 546 template<typename _Tp> struct RGB2RGB
 547 {
 548     typedef _Tp channel_type;
 549
 550     RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}
 551     void operator()(const _Tp* src, _Tp* dst, int n) const
 552     {
 553         int scn = srccn, dcn = dstcn, bidx = blueIdx;
 554         if( dcn == 3 )
 555         {
 556             n *= 3;
 557             for( int i = 0; i < n; i += 3, src += scn )
 558             {
 559                 _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
 560                 dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
 561             }
 562         }
 563         else if( scn == 3 )
 564         {
 565             n *= 3;
 566             _Tp alpha = ColorChannel<_Tp>::max();
 567             for( int i = 0; i < n; i += 3, dst += 4 )
 568             {
 569                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
 570                 dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
 571             }
 572         }
 573         else
 574         {
 575             n *= 4;
 576             for( int i = 0; i < n; i += 4 )
 577             {
 578                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
 579                 dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
 580             }
 581         }
 582     }
 583
 584     int srccn, dstcn, blueIdx;
 585 };
 586
 587 #if CV_NEON
 588
 // Specialization of RGB2RGB for uchar that uses NEON intrinsics for the bulk of each row.
 // It has the same three cases as the generic template: any -> 3 channels (channels picked
 // with blueIdx), 3 -> 4 channels (alpha set to ColorChannel<uchar>::max()) and 4 -> 4
 // channels (channels 0 and 2 exchanged, channels 1 and 3 copied).
 // Every case runs a 16-pixel vector loop, then an 8-pixel vector loop, then a scalar tail.
 // n is the number of pixels in the row.
 589 template<> struct RGB2RGB<uchar>
 590 {
 591     typedef uchar channel_type;
 592
 593     RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
 594         srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
 595     {
             // v_alpha: 16 lanes of the maximum alpha; v_alpha2: its low 8 lanes for the 8-pixel loop
 596         v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
 597         v_alpha2 = vget_low_u8(v_alpha);
 598     }
 599
 600     void operator()(const uchar * src, uchar * dst, int n) const
 601     {
 602         int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
 603         if (dcn == 3)
 604         {
                 // from here on n and i count destination bytes; src is advanced separately
 605             n *= 3;
 606             if (scn == 3)
 607             {
 608                 for ( ; i <= n - 48; i += 48, src += 48 )
 609                 {
 610                     uint8x16x3_t v_src = vld3q_u8(src), v_dst;
 611                     v_dst.val[0] = v_src.val[bidx];
 612                     v_dst.val[1] = v_src.val[1];
 613                     v_dst.val[2] = v_src.val[bidx ^ 2];
 614                     vst3q_u8(dst + i, v_dst);
 615                 }
 616                 for ( ; i <= n - 24; i += 24, src += 24 )
 617                 {
 618                     uint8x8x3_t v_src = vld3_u8(src), v_dst;
 619                     v_dst.val[0] = v_src.val[bidx];
 620                     v_dst.val[1] = v_src.val[1];
 621                     v_dst.val[2] = v_src.val[bidx ^ 2];
 622                     vst3_u8(dst + i, v_dst);
 623                 }
 624                 for ( ; i < n; i += 3, src += 3 )
 625                 {
 626                     uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
 627                     dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
 628                 }
 629             }
 630             else
 631             {
                     // 4-channel source: the fourth loaded channel is simply not stored
 632                 for ( ; i <= n - 48; i += 48, src += 64 )
 633                 {
 634                     uint8x16x4_t v_src = vld4q_u8(src);
 635                     uint8x16x3_t v_dst;
 636                     v_dst.val[0] = v_src.val[bidx];
 637                     v_dst.val[1] = v_src.val[1];
 638                     v_dst.val[2] = v_src.val[bidx ^ 2];
 639                     vst3q_u8(dst + i, v_dst);
 640                 }
 641                 for ( ; i <= n - 24; i += 24, src += 32 )
 642                 {
 643                     uint8x8x4_t v_src = vld4_u8(src);
 644                     uint8x8x3_t v_dst;
 645                     v_dst.val[0] = v_src.val[bidx];
 646                     v_dst.val[1] = v_src.val[1];
 647                     v_dst.val[2] = v_src.val[bidx ^ 2];
 648                     vst3_u8(dst + i, v_dst);
 649                 }
 650                 for ( ; i < n; i += 3, src += 4 )
 651                 {
 652                     uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
 653                     dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
 654                 }
 655             }
 656         }
 657         else if (scn == 3)
 658         {
                 // 3 -> 4 channels: n and i count source bytes; dst is advanced separately
 659             n *= 3;
 660             for ( ; i <= n - 48; i += 48, dst += 64 )
 661             {
 662                 uint8x16x3_t v_src = vld3q_u8(src + i);
 663                 uint8x16x4_t v_dst;
 664                 v_dst.val[bidx] = v_src.val[0];
 665                 v_dst.val[1] = v_src.val[1];
 666                 v_dst.val[bidx ^ 2] = v_src.val[2];
 667                 v_dst.val[3] = v_alpha;
 668                 vst4q_u8(dst, v_dst);
 669             }
 670             for ( ; i <= n - 24; i += 24, dst += 32 )
 671             {
 672                 uint8x8x3_t v_src = vld3_u8(src + i);
 673                 uint8x8x4_t v_dst;
 674                 v_dst.val[bidx] = v_src.val[0];
 675                 v_dst.val[1] = v_src.val[1];
 676                 v_dst.val[bidx ^ 2] = v_src.val[2];
 677                 v_dst.val[3] = v_alpha2;
 678                 vst4_u8(dst, v_dst);
 679             }
 680             uchar alpha = ColorChannel<uchar>::max();
 681             for (; i < n; i += 3, dst += 4 )
 682             {
 683                 uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
 684                 dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
 685             }
 686         }
 687         else
 688         {
                 // 4 -> 4 channels: src and dst use the same byte index i
 689             n *= 4;
 690             for ( ; i <= n - 64; i += 64 )
 691             {
 692                 uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
 693                 v_dst.val[0] = v_src.val[2];
 694                 v_dst.val[1] = v_src.val[1];
 695                 v_dst.val[2] = v_src.val[0];
 696                 v_dst.val[3] = v_src.val[3];
 697                 vst4q_u8(dst + i, v_dst);
 698             }
 699             for ( ; i <= n - 32; i += 32 )
 700             {
 701                 uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
 702                 v_dst.val[0] = v_src.val[2];
 703                 v_dst.val[1] = v_src.val[1];
 704                 v_dst.val[2] = v_src.val[0];
 705                 v_dst.val[3] = v_src.val[3];
 706                 vst4_u8(dst + i, v_dst);
 707             }
 708             for ( ; i < n; i += 4)
 709             {
 710                 uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
 711                 dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
 712             }
 713         }
 714     }
 715
 716     int srccn, dstcn, blueIdx;
 717
 718     uint8x16_t v_alpha;
 719     uint8x8_t v_alpha2;
 720 };
 721
 722 #endif
 723
 724 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
 725
 // Row converter that unpacks 16-bit packed pixels into 3 or 4 8-bit channels.
 // src is read as an array of n ushort values; dst receives dstcn bytes per pixel.
 // greenBits == 6 selects the 5-6-5 branch; any other value selects the 5-5-5 branch,
 // in which bit 15 of the packed value decides the alpha written when dstcn == 4.
 // Each colour field is moved to the high bits of its output byte and the low bits stay zero.
 726 struct RGB5x52RGB
 727 {
 728     typedef uchar channel_type;
 729
 730     RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
 731         : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits)
 732     {
 733         #if CV_NEON
             // vector copies of the masks and constants used by the scalar code in operator()
 734         v_n3 = vdupq_n_u16(~3);
 735         v_n7 = vdupq_n_u16(~7);
 736         v_255 = vdupq_n_u8(255);
 737         v_0 = vdupq_n_u8(0);
 738         v_mask = vdupq_n_u16(0x8000);
 739         #endif
 740     }
 741
 742     void operator()(const uchar* src, uchar* dst, int n) const
 743     {
 744         int dcn = dstcn, bidx = blueIdx, i = 0;
 745         if( greenBits == 6 )
 746         {
 747             #if CV_NEON
                 // 16 pixels per iteration; the scalar loop after #endif handles the remainder
 748             for ( ; i <= n - 16; i += 16, dst += dcn * 16)
 749             {
 750                 uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
 751                 uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
 752                 uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)),
 753                                              vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3)));
 754                 uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)),
 755                                              vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7)));
 756                 if (dcn == 3)
 757                 {
 758                     uint8x16x3_t v_dst;
 759                     v_dst.val[bidx] = v_b;
 760                     v_dst.val[1] = v_g;
 761                     v_dst.val[bidx^2] = v_r;
 762                     vst3q_u8(dst, v_dst);
 763                 }
 764                 else
 765                 {
 766                     uint8x16x4_t v_dst;
 767                     v_dst.val[bidx] = v_b;
 768                     v_dst.val[1] = v_g;
 769                     v_dst.val[bidx^2] = v_r;
 770                     v_dst.val[3] = v_255;
 771                     vst4q_u8(dst, v_dst);
 772                 }
 773             }
 774             #endif
 775             for( ; i < n; i++, dst += dcn )
 776             {
 777                 unsigned t = ((const ushort*)src)[i];
                     // low 5 bits -> dst[bidx], next 6 bits -> dst[1], top 5 bits -> dst[bidx ^ 2]
 778                 dst[bidx] = (uchar)(t << 3);
 779                 dst[1] = (uchar)((t >> 3) & ~3);
 780                 dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);
 781                 if( dcn == 4 )
 782                     dst[3] = 255;
 783             }
 784         }
 785         else
 786         {
 787             #if CV_NEON
                 // 16 pixels per iteration; the scalar loop after #endif handles the remainder
 788             for ( ; i <= n - 16; i += 16, dst += dcn * 16)
 789             {
 790                 uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
 791                 uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
 792                 uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)),
 793                                              vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7)));
 794                 uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)),
 795                                              vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7)));
 796                 if (dcn == 3)
 797                 {
 798                     uint8x16x3_t v_dst;
 799                     v_dst.val[bidx] = v_b;
 800                     v_dst.val[1] = v_g;
 801                     v_dst.val[bidx^2] = v_r;
 802                     vst3q_u8(dst, v_dst);
 803                 }
 804                 else
 805                 {
 806                     uint8x16x4_t v_dst;
 807                     v_dst.val[bidx] = v_b;
 808                     v_dst.val[1] = v_g;
 809                     v_dst.val[bidx^2] = v_r;
                         // bit 15 is isolated, narrowed with saturation, and used to select 255 or 0
 810                     v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)),
 811                                                         vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0);
 812                     vst4q_u8(dst, v_dst);
 813                 }
 814             }
 815             #endif
 816             for( ; i < n; i++, dst += dcn )
 817             {
 818                 unsigned t = ((const ushort*)src)[i];
                     // three 5-bit fields; bit 15 becomes alpha 255 or 0 when four channels are written
 819                 dst[bidx] = (uchar)(t << 3);
 820                 dst[1] = (uchar)((t >> 2) & ~7);
 821                 dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);
 822                 if( dcn == 4 )
 823                     dst[3] = t & 0x8000 ? 255 : 0;
 824             }
 825         }
 826     }
 827
 828     int dstcn, blueIdx, greenBits;
 829     #if CV_NEON
 830     uint16x8_t v_n3, v_n7, v_mask;
 831     uint8x16_t v_255, v_0;
 832     #endif
 833 };
 834
 835
 836 struct RGB2RGB5x5
 837 {
 838     typedef uchar channel_type;
 839
 840     RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
 841         : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits)
 842     {
 843         #if CV_NEON
 844         v_n3 = vdup_n_u8(~3);
 845         v_n7 = vdup_n_u8(~7);
 846         v_mask = vdupq_n_u16(0x8000);
 847         v_0 = vdupq_n_u16(0);
 848         v_full = vdupq_n_u16(0xffff);
 849         #endif
 850     }
 851
 852     void operator()(const uchar* src, uchar* dst, int n) const
 853     {
 854         int scn = srccn, bidx = blueIdx, i = 0;
 855         if (greenBits == 6)
 856         {
 857             if (scn == 3)
 858             {
 859                 #if CV_NEON
 860                 for ( ; i <= n - 8; i += 8, src += 24 )
 861                 {
 862                     uint8x8x3_t v_src = vld3_u8(src);
 863                     uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
 864                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
 865                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
 866                     vst1q_u16((ushort *)dst + i, v_dst);
 867                 }
 868                 #endif
 869                 for ( ; i < n; i++, src += 3 )
 870                     ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
 871             }
 872             else
 873             {
 874                 #if CV_NEON
 875                 for ( ; i <= n - 8; i += 8, src += 32 )
 876                 {
 877                     uint8x8x4_t v_src = vld4_u8(src);
 878                     uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
 879                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
 880                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
 881                     vst1q_u16((ushort *)dst + i, v_dst);
 882                 }
 883                 #endif
 884                 for ( ; i < n; i++, src += 4 )
 885                     ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
 886             }
 887         }
 888         else if (scn == 3)
 889         {
 890             #if CV_NEON
 891             for ( ; i <= n - 8; i += 8, src += 24 )
 892             {
 893                 uint8x8x3_t v_src = vld3_u8(src);
 894                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
 895                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
 896                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7));
 897                 vst1q_u16((ushort *)dst + i, v_dst);
 898             }
 899             #endif
 900             for ( ; i < n; i++, src += 3 )
 901                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
 902         }
 903         else
 904         {
 905             #if CV_NEON
 906             for ( ; i <= n - 8; i += 8, src += 32 )
 907             {
 908                 uint8x8x4_t v_src = vld4_u8(src);
 909                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
 910                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
 911                 v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7),
 912                                                    vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0)));
 913                 vst1q_u16((ushort *)dst + i, v_dst);
 914             }
 915             #endif
 916             for ( ; i < n; i++, src += 4 )
 917                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
 918                     ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
 919         }
 920     }
 921
 922     int srccn, blueIdx, greenBits;
 923     #if CV_NEON
 924     uint8x8_t v_n3, v_n7;
 925     uint16x8_t v_mask, v_0, v_full;
 926     #endif
 927 };
 928
 929 ///////////////////////////////// Color to/from Grayscale ////////////////////////////////
 930
 931 template<typename _Tp>
 932 struct Gray2RGB
 933 {
 934     typedef _Tp channel_type;
 935
 936     Gray2RGB(int _dstcn) : dstcn(_dstcn) {}
 937     void operator()(const _Tp* src, _Tp* dst, int n) const
 938     {
 939         if( dstcn == 3 )
 940             for( int i = 0; i < n; i++, dst += 3 )
 941             {
 942                 dst[0] = dst[1] = dst[2] = src[i];
 943             }
 944         else
 945         {
 946             _Tp alpha = ColorChannel<_Tp>::max();
 947             for( int i = 0; i < n; i++, dst += 4 )
 948             {
 949                 dst[0] = dst[1] = dst[2] = src[i];
 950                 dst[3] = alpha;
 951             }
 952         }
 953     }
 954
 955     int dstcn;
 956 };
 957
 958
 959 struct Gray2RGB5x5
 960 {
 961     typedef uchar channel_type;
 962
 963     Gray2RGB5x5(int _greenBits) : greenBits(_greenBits)
 964     {
 965         #if CV_NEON
 966         v_n7 = vdup_n_u8(~7);
 967         v_n3 = vdup_n_u8(~3);
 968         #endif
 969     }
 970
 971     void operator()(const uchar* src, uchar* dst, int n) const
 972     {
 973         int i = 0;
 974         if( greenBits == 6 )
 975         {
 976             #if CV_NEON
 977             for ( ; i <= n - 8; i += 8 )
 978             {
 979                 uint8x8_t v_src = vld1_u8(src + i);
 980                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3));
 981                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3));
 982                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8));
 983                 vst1q_u16((ushort *)dst + i, v_dst);
 984             }
 985             #endif
 986             for ( ; i < n; i++ )
 987             {
 988                 int t = src[i];
 989                 ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
 990             }
 991         }
 992         else
 993         {
 994             #if CV_NEON
 995             for ( ; i <= n - 8; i += 8 )
 996             {
 997                 uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3));
 998                 uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10));
 999                 vst1q_u16((ushort *)dst + i, v_dst);
1000             }
1001             #endif
1002             for( ; i < n; i++ )
1003             {
1004                 int t = src[i] >> 3;
1005                 ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
1006             }
1007         }
1008     }
1009     int greenBits;
1010
1011     #if CV_NEON
1012     uint8x8_t v_n7, v_n3;
1013     #endif
1014 };
1015
1016
// Drop any macro definitions of these names before they are reused as
// enumerators below.
#undef B2Y
#undef G2Y
#undef R2Y

enum
{
    // Fixed-point grayscale weights; they sum to 1 << yuv_shift and are
    // applied together with a descale by yuv_shift.
    R2Y = 4899,
    G2Y = 9617,
    B2Y = 1868,

    // Number of fractional bits used by the weights above.
    yuv_shift = 14,

    // NOTE(review): not used in the visible part of the file; presumably the
    // fractional bit count for the XYZ conversions - confirm at the use site.
    xyz_shift = 12,

    // NOTE(review): not used in the visible part of the file; presumably a
    // per-call processing block length - confirm at the use site.
    BLOCK_SIZE = 256
};
1030
1031
1032 struct RGB5x52Gray
1033 {
1034     typedef uchar channel_type;
1035
1036     RGB5x52Gray(int _greenBits) : greenBits(_greenBits)
1037     {
1038         #if CV_NEON
1039         v_b2y = vdup_n_u16(B2Y);
1040         v_g2y = vdup_n_u16(G2Y);
1041         v_r2y = vdup_n_u16(R2Y);
1042         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
1043         v_f8 = vdupq_n_u16(0xf8);
1044         v_fc = vdupq_n_u16(0xfc);
1045         #endif
1046     }
1047
1048     void operator()(const uchar* src, uchar* dst, int n) const
1049     {
1050         int i = 0;
1051         if( greenBits == 6 )
1052         {
1053             #if CV_NEON
1054             for ( ; i <= n - 8; i += 8)
1055             {
1056                 uint16x8_t v_src = vld1q_u16((ushort *)src + i);
1057                 uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
1058                            v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc),
1059                            v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8);
1060
1061                 uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
1062                                               vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
1063                 uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
1064                                               vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
1065                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
1066                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
1067
1068                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
1069             }
1070             #endif
1071             for ( ; i < n; i++)
1072             {
1073                 int t = ((ushort*)src)[i];
1074                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
1075                                            ((t >> 3) & 0xfc)*G2Y +
1076                                            ((t >> 8) & 0xf8)*R2Y, yuv_shift);
1077             }
1078         }
1079         else
1080         {
1081             #if CV_NEON
1082             for ( ; i <= n - 8; i += 8)
1083             {
1084                 uint16x8_t v_src = vld1q_u16((ushort *)src + i);
1085                 uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
1086                            v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8),
1087                            v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8);
1088
1089                 uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
1090                                               vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
1091                 uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
1092                                               vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
1093                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
1094                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
1095
1096                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
1097             }
1098             #endif
1099             for ( ; i < n; i++)
1100             {
1101                 int t = ((ushort*)src)[i];
1102                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
1103                                            ((t >> 2) & 0xf8)*G2Y +
1104                                            ((t >> 7) & 0xf8)*R2Y, yuv_shift);
1105             }
1106         }
1107     }
1108     int greenBits;
1109
1110     #if CV_NEON
1111     uint16x4_t v_b2y, v_g2y, v_r2y;
1112     uint32x4_t v_delta;
1113     uint16x8_t v_f8, v_fc;
1114     #endif
1115 };
1116
1117
1118 template<typename _Tp> struct RGB2Gray
1119 {
1120     typedef _Tp channel_type;
1121
1122     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1123     {
1124         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1125         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1126         if(blueIdx == 0)
1127             std::swap(coeffs[0], coeffs[2]);
1128     }
1129
1130     void operator()(const _Tp* src, _Tp* dst, int n) const
1131     {
1132         int scn = srccn;
1133         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1134         for(int i = 0; i < n; i++, src += scn)
1135             dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
1136     }
1137     int srccn;
1138     float coeffs[3];
1139 };
1140
1141 template<> struct RGB2Gray<uchar>
1142 {
1143     typedef uchar channel_type;
1144
1145     RGB2Gray(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn)
1146     {
1147         const int coeffs0[] = { R2Y, G2Y, B2Y };
1148         if(!coeffs) coeffs = coeffs0;
1149
1150         int b = 0, g = 0, r = (1 << (yuv_shift-1));
1151         int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx];
1152
1153         for( int i = 0; i < 256; i++, b += db, g += dg, r += dr )
1154         {
1155             tab[i] = b;
1156             tab[i+256] = g;
1157             tab[i+512] = r;
1158         }
1159     }
1160     void operator()(const uchar* src, uchar* dst, int n) const
1161     {
1162         int scn = srccn;
1163         const int* _tab = tab;
1164         for(int i = 0; i < n; i++, src += scn)
1165             dst[i] = (uchar)((_tab[src[0]] + _tab[src[1]+256] + _tab[src[2]+512]) >> yuv_shift);
1166     }
1167     int srccn;
1168     int tab[256*3];
1169 };
1170
1171 #if CV_NEON
1172
1173 template <>
1174 struct RGB2Gray<ushort>
1175 {
1176     typedef ushort channel_type;
1177
1178     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
1179         srccn(_srccn)
1180     {
1181         static const int coeffs0[] = { R2Y, G2Y, B2Y };
1182         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1183         if( blueIdx == 0 )
1184             std::swap(coeffs[0], coeffs[2]);
1185
1186         v_cb = vdup_n_u16(coeffs[0]);
1187         v_cg = vdup_n_u16(coeffs[1]);
1188         v_cr = vdup_n_u16(coeffs[2]);
1189         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
1190     }
1191
1192     void operator()(const ushort* src, ushort* dst, int n) const
1193     {
1194         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
1195
1196         for ( ; i <= n - 8; i += 8, src += scn * 8)
1197         {
1198             uint16x8_t v_b, v_r, v_g;
1199             if (scn == 3)
1200             {
1201                 uint16x8x3_t v_src = vld3q_u16(src);
1202                 v_b = v_src.val[0];
1203                 v_g = v_src.val[1];
1204                 v_r = v_src.val[2];
1205             }
1206             else
1207             {
1208                 uint16x8x4_t v_src = vld4q_u16(src);
1209                 v_b = v_src.val[0];
1210                 v_g = v_src.val[1];
1211                 v_r = v_src.val[2];
1212             }
1213
1214             uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
1215                                            vmull_u16(vget_low_u16(v_b), v_cb),
1216                                                      vget_low_u16(v_g), v_cg),
1217                                                      vget_low_u16(v_r), v_cr);
1218             uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
1219                                            vmull_u16(vget_high_u16(v_b), v_cb),
1220                                                      vget_high_u16(v_g), v_cg),
1221                                                      vget_high_u16(v_r), v_cr);
1222
1223             uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
1224             uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));
1225
1226             vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
1227         }
1228
1229         for ( ; i <= n - 4; i += 4, src += scn * 4)
1230         {
1231             uint16x4_t v_b, v_r, v_g;
1232             if (scn == 3)
1233             {
1234                 uint16x4x3_t v_src = vld3_u16(src);
1235                 v_b = v_src.val[0];
1236                 v_g = v_src.val[1];
1237                 v_r = v_src.val[2];
1238             }
1239             else
1240             {
1241                 uint16x4x4_t v_src = vld4_u16(src);
1242                 v_b = v_src.val[0];
1243                 v_g = v_src.val[1];
1244                 v_r = v_src.val[2];
1245             }
1246
1247             uint32x4_t v_dst = vmlal_u16(vmlal_u16(
1248                                          vmull_u16(v_b, v_cb),
1249                                                    v_g, v_cg),
1250                                                    v_r, v_cr);
1251
1252             vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
1253         }
1254
1255         for( ; i < n; i++, src += scn)
1256             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1257     }
1258
1259     int srccn, coeffs[3];
1260     uint16x4_t v_cb, v_cg, v_cr;
1261     uint32x4_t v_delta;
1262 };
1263
1264 template <>
1265 struct RGB2Gray<float>
1266 {
1267     typedef float channel_type;
1268
1269     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1270     {
1271         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1272         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1273         if(blueIdx == 0)
1274             std::swap(coeffs[0], coeffs[2]);
1275
1276         v_cb = vdupq_n_f32(coeffs[0]);
1277         v_cg = vdupq_n_f32(coeffs[1]);
1278         v_cr = vdupq_n_f32(coeffs[2]);
1279     }
1280
1281     void operator()(const float * src, float * dst, int n) const
1282     {
1283         int scn = srccn, i = 0;
1284         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1285
1286         if (scn == 3)
1287         {
1288             for ( ; i <= n - 8; i += 8, src += scn * 8)
1289             {
1290                 float32x4x3_t v_src = vld3q_f32(src);
1291                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1292
1293                 v_src = vld3q_f32(src + scn * 4);
1294                 vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1295             }
1296
1297             for ( ; i <= n - 4; i += 4, src += scn * 4)
1298             {
1299                 float32x4x3_t v_src = vld3q_f32(src);
1300                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1301             }
1302         }
1303         else
1304         {
1305             for ( ; i <= n - 8; i += 8, src += scn * 8)
1306             {
1307                 float32x4x4_t v_src = vld4q_f32(src);
1308                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1309
1310                 v_src = vld4q_f32(src + scn * 4);
1311                 vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1312             }
1313
1314             for ( ; i <= n - 4; i += 4, src += scn * 4)
1315             {
1316                 float32x4x4_t v_src = vld4q_f32(src);
1317                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1318             }
1319         }
1320
1321         for ( ; i < n; i++, src += scn)
1322             dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
1323     }
1324
1325     int srccn;
1326     float coeffs[3];
1327     float32x4_t v_cb, v_cg, v_cr;
1328 };
1329
1330 #else
1331
1332 template<> struct RGB2Gray<ushort>
1333 {
1334     typedef ushort channel_type;
1335
1336     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
1337     {
1338         static const int coeffs0[] = { R2Y, G2Y, B2Y };
1339         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1340         if( blueIdx == 0 )
1341             std::swap(coeffs[0], coeffs[2]);
1342     }
1343
1344     void operator()(const ushort* src, ushort* dst, int n) const
1345     {
1346         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1347         for(int i = 0; i < n; i++, src += scn)
1348             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1349     }
1350     int srccn;
1351     int coeffs[3];
1352 };
1353
1354 #endif
1355
1356 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
1357
1358 template<typename _Tp> struct RGB2YCrCb_f
1359 {
1360     typedef _Tp channel_type;
1361
1362     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : srccn(_srccn), blueIdx(_blueIdx)
1363     {
1364         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1365         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1366         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
1367     }
1368
1369     void operator()(const _Tp* src, _Tp* dst, int n) const
1370     {
1371         int scn = srccn, bidx = blueIdx;
1372         const _Tp delta = ColorChannel<_Tp>::half();
1373         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1374         n *= 3;
1375         for(int i = 0; i < n; i += 3, src += scn)
1376         {
1377             _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
1378             _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta);
1379             _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta);
1380             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1381         }
1382     }
1383     int srccn, blueIdx;
1384     float coeffs[5];
1385 };
1386
1387 #if CV_NEON
1388
1389 template <>
1390 struct RGB2YCrCb_f<float>
1391 {
1392     typedef float channel_type;
1393
1394     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
1395         srccn(_srccn), blueIdx(_blueIdx)
1396     {
1397         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1398         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1399         if(blueIdx==0)
1400             std::swap(coeffs[0], coeffs[2]);
1401
1402         v_c0 = vdupq_n_f32(coeffs[0]);
1403         v_c1 = vdupq_n_f32(coeffs[1]);
1404         v_c2 = vdupq_n_f32(coeffs[2]);
1405         v_c3 = vdupq_n_f32(coeffs[3]);
1406         v_c4 = vdupq_n_f32(coeffs[4]);
1407         v_delta = vdupq_n_f32(ColorChannel<float>::half());
1408     }
1409
1410     void operator()(const float * src, float * dst, int n) const
1411     {
1412         int scn = srccn, bidx = blueIdx, i = 0;
1413         const float delta = ColorChannel<float>::half();
1414         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1415         n *= 3;
1416
1417         if (scn == 3)
1418             for ( ; i <= n - 12; i += 12, src += 12)
1419             {
1420                 float32x4x3_t v_src = vld3q_f32(src), v_dst;
1421                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
1422                 v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
1423                 v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
1424
1425                 vst3q_f32(dst + i, v_dst);
1426             }
1427         else
1428             for ( ; i <= n - 12; i += 12, src += 16)
1429             {
1430                 float32x4x4_t v_src = vld4q_f32(src);
1431                 float32x4x3_t v_dst;
1432                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
1433                 v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
1434                 v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
1435
1436                 vst3q_f32(dst + i, v_dst);
1437             }
1438
1439         for ( ; i < n; i += 3, src += scn)
1440         {
1441             float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
1442             float Cr = (src[bidx^2] - Y)*C3 + delta;
1443             float Cb = (src[bidx] - Y)*C4 + delta;
1444             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1445         }
1446     }
1447     int srccn, blueIdx;
1448     float coeffs[5];
1449     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
1450 };
1451
1452 #endif
1453
1454 template<typename _Tp> struct RGB2YCrCb_i
1455 {
1456     typedef _Tp channel_type;
1457
1458     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1459         : srccn(_srccn), blueIdx(_blueIdx)
1460     {
1461         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1462         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1463         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
1464     }
1465     void operator()(const _Tp* src, _Tp* dst, int n) const
1466     {
1467         int scn = srccn, bidx = blueIdx;
1468         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1469         int delta = ColorChannel<_Tp>::half()*(1 << yuv_shift);
1470         n *= 3;
1471         for(int i = 0; i < n; i += 3, src += scn)
1472         {
1473             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
1474             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
1475             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
1476             dst[i] = saturate_cast<_Tp>(Y);
1477             dst[i+1] = saturate_cast<_Tp>(Cr);
1478             dst[i+2] = saturate_cast<_Tp>(Cb);
1479         }
1480     }
1481     int srccn, blueIdx;
1482     int coeffs[5];
1483 };
1484
1485 #if CV_NEON
1486
1487 template <>
1488 struct RGB2YCrCb_i<uchar>
1489 {
1490     typedef uchar channel_type;
1491
1492     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1493         : srccn(_srccn), blueIdx(_blueIdx)
1494     {
1495         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1496         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1497         if (blueIdx==0)
1498             std::swap(coeffs[0], coeffs[2]);
1499
1500         v_c0 = vdup_n_s16(coeffs[0]);
1501         v_c1 = vdup_n_s16(coeffs[1]);
1502         v_c2 = vdup_n_s16(coeffs[2]);
1503         v_c3 = vdupq_n_s32(coeffs[3]);
1504         v_c4 = vdupq_n_s32(coeffs[4]);
1505         v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift));
1506         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
1507     }
1508
1509     void operator()(const uchar * src, uchar * dst, int n) const
1510     {
1511         int scn = srccn, bidx = blueIdx, i = 0;
1512         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1513         int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
1514         n *= 3;
1515
1516         for ( ; i <= n - 24; i += 24, src += scn * 8)
1517         {
1518             uint8x8x3_t v_dst;
1519             int16x8x3_t v_src16;
1520
1521             if (scn == 3)
1522             {
1523                 uint8x8x3_t v_src = vld3_u8(src);
1524                 v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
1525                 v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
1526                 v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
1527             }
1528             else
1529             {
1530                 uint8x8x4_t v_src = vld4_u8(src);
1531                 v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
1532                 v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
1533                 v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
1534             }
1535
1536             int16x4x3_t v_src0;
1537             v_src0.val[0] = vget_low_s16(v_src16.val[0]);
1538             v_src0.val[1] = vget_low_s16(v_src16.val[1]);
1539             v_src0.val[2] = vget_low_s16(v_src16.val[2]);
1540
1541             int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1542             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
1543             int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3);
1544             v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
1545             int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4);
1546             v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
1547
1548             v_src0.val[0] = vget_high_s16(v_src16.val[0]);
1549             v_src0.val[1] = vget_high_s16(v_src16.val[1]);
1550             v_src0.val[2] = vget_high_s16(v_src16.val[2]);
1551
1552             int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1553             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
1554             int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3);
1555             v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
1556             int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4);
1557             v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
1558
1559             v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
1560             v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
1561             v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
1562
1563             vst3_u8(dst + i, v_dst);
1564         }
1565
1566         for ( ; i < n; i += 3, src += scn)
1567         {
1568             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
1569             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
1570             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
1571             dst[i] = saturate_cast<uchar>(Y);
1572             dst[i+1] = saturate_cast<uchar>(Cr);
1573             dst[i+2] = saturate_cast<uchar>(Cb);
1574         }
1575     }
1576     int srccn, blueIdx, coeffs[5];
1577     int16x4_t v_c0, v_c1, v_c2;
1578     int32x4_t v_c3, v_c4, v_delta, v_delta2;
1579 };
1580
1581 template <>
1582 struct RGB2YCrCb_i<ushort>
1583 {
         // NEON specialization of the fixed-point RGB -> YCrCb converter for ushort
         // channels. operator() converts 8 pixels, then 4 pixels, per vector
         // iteration and finishes the remainder with a scalar loop that applies
         // the same formulas.
1584     typedef ushort channel_type;
1585
         // _srccn:   interleaved channels per source pixel; 3 selects the vld3 loads,
         //           anything else the vld4 loads (the fourth channel is ignored).
         // _blueIdx: index of the blue channel inside a source pixel; it is used as
         //           val[bidx] / val[bidx^2], so it is expected to be 0 or 2.
         // _coeffs:  five fixed-point coefficients; NULL selects the built-in defaults.
1586     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1587         : srccn(_srccn), blueIdx(_blueIdx)
1588     {
1589         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1590         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
             // With blue stored first, exchange the first and third luma weights so
             // that coeffs[k] keeps multiplying src[k].
1591         if (blueIdx==0)
1592             std::swap(coeffs[0], coeffs[2]);
1593
             // Broadcast every coefficient to all four 32-bit lanes.
1594         v_c0 = vdupq_n_s32(coeffs[0]);
1595         v_c1 = vdupq_n_s32(coeffs[1]);
1596         v_c2 = vdupq_n_s32(coeffs[2]);
1597         v_c3 = vdupq_n_s32(coeffs[3]);
1598         v_c4 = vdupq_n_s32(coeffs[4]);
             // Chroma offset, already scaled by 2^yuv_shift so it can be added
             // before the descaling shift.
1599         v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift));
             // Rounding term (half of 2^yuv_shift) added before every right shift.
1600         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
1601     }
1602
         // Converts n pixels from src (srccn channels each) to 3-channel Y, Cr, Cb
         // triples in dst.
1603     void operator()(const ushort * src, ushort * dst, int n) const
1604     {
1605         int scn = srccn, bidx = blueIdx, i = 0;
1606         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1607         int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
             // From here on n counts destination elements (3 per pixel).
1608         n *= 3;
1609
             // 8 pixels per iteration: 24 destination elements, scn * 8 source elements.
1610         for ( ; i <= n - 24; i += 24, src += scn * 8)
1611         {
1612             uint16x8x3_t v_src, v_dst;
1613             int32x4x3_t v_src0;
1614
                 // De-interleave the pixels; for 4-channel input only the first
                 // three planes are kept.
1615             if (scn == 3)
1616                 v_src = vld3q_u16(src);
1617             else
1618             {
1619                 uint16x8x4_t v_src_ = vld4q_u16(src);
1620                 v_src.val[0] = v_src_.val[0];
1621                 v_src.val[1] = v_src_.val[1];
1622                 v_src.val[2] = v_src_.val[2];
1623             }
1624
                 // Low four pixels, widened to 32-bit lanes.
1625             v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0])));
1626             v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1])));
1627             v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
1628
                 // Y = (s0*c0 + s1*c1 + s2*c2 + round) >> yuv_shift
1629             int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1630             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
                 // Cr = ((s[bidx^2] - Y)*c3 + delta + round) >> yuv_shift
1631             int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3);
1632             v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
                 // Cb = ((s[bidx] - Y)*c4 + delta + round) >> yuv_shift
1633             int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4);
1634             v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
1635
                 // High four pixels, same computation.
1636             v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
1637             v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
1638             v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
1639
1640             int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1641             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
1642             int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3);
1643             v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
1644             int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4);
1645             v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
1646
                 // Saturating narrow from signed 32-bit to unsigned 16-bit, then
                 // re-interleave as Y, Cr, Cb.
1647             v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
1648             v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
1649             v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
1650
1651             vst3q_u16(dst + i, v_dst);
1652         }
1653
             // 4 pixels per iteration for what the 8-pixel loop left over.
1654         for ( ; i <= n - 12; i += 12, src += scn * 4)
1655         {
1656             uint16x4x3_t v_dst;
1657             int32x4x3_t v_src0;
1658
1659             if (scn == 3)
1660             {
1661                 uint16x4x3_t v_src = vld3_u16(src);
1662                 v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
1663                 v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
1664                 v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
1665             }
1666             else
1667             {
1668                 uint16x4x4_t v_src = vld4_u16(src);
1669                 v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
1670                 v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
1671                 v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
1672             }
1673
1674             int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1675             v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift);
1676             int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3);
1677             v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift);
1678             int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4);
1679             v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);
1680
1681             v_dst.val[0] = vqmovun_s32(v_Y);
1682             v_dst.val[1] = vqmovun_s32(v_Cr);
1683             v_dst.val[2] = vqmovun_s32(v_Cb);
1684
1685             vst3_u16(dst + i, v_dst);
1686         }
1687
             // Scalar tail: one pixel per iteration.
1688         for ( ; i < n; i += 3, src += scn)
1689         {
1690             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
1691             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
1692             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
1693             dst[i] = saturate_cast<ushort>(Y);
1694             dst[i+1] = saturate_cast<ushort>(Cr);
1695             dst[i+2] = saturate_cast<ushort>(Cb);
1696         }
1697     }
         // Scalar copies of the parameters, followed by their lane-broadcast forms.
1698     int srccn, blueIdx, coeffs[5];
1699     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
1700 };
1701
1702 #endif
1703
1704 template<typename _Tp> struct YCrCb2RGB_f
1705 {
1706     typedef _Tp channel_type;
1707
1708     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
1709         : dstcn(_dstcn), blueIdx(_blueIdx)
1710     {
1711         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
1712         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1713     }
1714     void operator()(const _Tp* src, _Tp* dst, int n) const
1715     {
1716         int dcn = dstcn, bidx = blueIdx;
1717         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
1718         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
1719         n *= 3;
1720         for(int i = 0; i < n; i += 3, dst += dcn)
1721         {
1722             _Tp Y = src[i];
1723             _Tp Cr = src[i+1];
1724             _Tp Cb = src[i+2];
1725
1726             _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
1727             _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
1728             _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0);
1729
1730             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
1731             if( dcn == 4 )
1732                 dst[3] = alpha;
1733         }
1734     }
1735     int dstcn, blueIdx;
1736     float coeffs[4];
1737 };
1738
1739 #if CV_NEON
1740
1741 template <>
1742 struct YCrCb2RGB_f<float>
1743 {
         // NEON specialization of the float YCrCb -> RGB converter. Four pixels are
         // converted per vector iteration; the remainder goes through a scalar loop.
1744     typedef float channel_type;
1745
         // _dstcn:   channels per destination pixel; 3 selects the vst3 path, any
         //           other value the vst4 path with a constant fourth channel.
         // _blueIdx: destination index of the blue channel; it is used as val[bidx]
         //           and val[bidx^2], so it is expected to be 0 or 2.
         // _coeffs:  four float coefficients; NULL selects the built-in defaults.
1746     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
1747         : dstcn(_dstcn), blueIdx(_blueIdx)
1748     {
1749         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
1750         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1751
             // Lane-broadcast copies of the coefficients, chroma offset and alpha.
1752         v_c0 = vdupq_n_f32(coeffs[0]);
1753         v_c1 = vdupq_n_f32(coeffs[1]);
1754         v_c2 = vdupq_n_f32(coeffs[2]);
1755         v_c3 = vdupq_n_f32(coeffs[3]);
1756         v_delta = vdupq_n_f32(ColorChannel<float>::half());
1757         v_alpha = vdupq_n_f32(ColorChannel<float>::max());
1758     }
1759
         // Converts n pixels of Y, Cr, Cb triples in src into dstcn-channel pixels in dst.
1760     void operator()(const float* src, float* dst, int n) const
1761     {
1762         int dcn = dstcn, bidx = blueIdx, i = 0;
1763         const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
1764         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
             // From here on n counts source elements (3 per pixel).
1765         n *= 3;
1766
             // 3-channel output: 12 source floats in, 12 destination floats out.
1767         if (dcn == 3)
1768             for ( ; i <= n - 12; i += 12, dst += 12)
1769             {
1770                 float32x4x3_t v_src = vld3q_f32(src + i), v_dst;
1771                 float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
1772
                     // b = Y + (Cb - delta)*c3
1773                 v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
                     // g = ((Cb - delta)*c2 + (Cr - delta)*c1) + Y; Y is added last
                     // here but first in the scalar tail, so the two paths may
                     // round differently.
1774                 v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
                     // r = Y + (Cr - delta)*c0
1775                 v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
1776
1777                 vst3q_f32(dst, v_dst);
1778             }
             // Otherwise: 12 source floats in, 16 destination floats out (alpha added).
1779         else
1780             for ( ; i <= n - 12; i += 12, dst += 16)
1781             {
1782                 float32x4x3_t v_src = vld3q_f32(src + i);
1783                 float32x4x4_t v_dst;
1784                 float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
1785
1786                 v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
1787                 v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
1788                 v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
1789                 v_dst.val[3] = v_alpha;
1790
1791                 vst4q_f32(dst, v_dst);
1792             }
1793
             // Scalar tail: one pixel per iteration; the results are stored unclamped.
1794         for ( ; i < n; i += 3, dst += dcn)
1795         {
1796             float Y = src[i], Cr = src[i+1], Cb = src[i+2];
1797
1798             float b = Y + (Cb - delta)*C3;
1799             float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
1800             float r = Y + (Cr - delta)*C0;
1801
1802             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
1803             if( dcn == 4 )
1804                 dst[3] = alpha;
1805         }
1806     }
1807     int dstcn, blueIdx;
1808     float coeffs[4];
1809     float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
1810 };
1811
1812 #endif
1813
1814 template<typename _Tp> struct YCrCb2RGB_i
1815 {
1816     typedef _Tp channel_type;
1817
1818     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
1819         : dstcn(_dstcn), blueIdx(_blueIdx)
1820     {
1821         static const int coeffs0[] = {22987, -11698, -5636, 29049};
1822         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1823     }
1824
1825     void operator()(const _Tp* src, _Tp* dst, int n) const
1826     {
1827         int dcn = dstcn, bidx = blueIdx;
1828         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
1829         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
1830         n *= 3;
1831         for(int i = 0; i < n; i += 3, dst += dcn)
1832         {
1833             _Tp Y = src[i];
1834             _Tp Cr = src[i+1];
1835             _Tp Cb = src[i+2];
1836
1837             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
1838             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
1839             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
1840
1841             dst[bidx] = saturate_cast<_Tp>(b);
1842             dst[1] = saturate_cast<_Tp>(g);
1843             dst[bidx^2] = saturate_cast<_Tp>(r);
1844             if( dcn == 4 )
1845                 dst[3] = alpha;
1846         }
1847     }
1848     int dstcn, blueIdx;
1849     int coeffs[4];
1850 };
1851
1852 #if CV_NEON
1853
1854 template <>
1855 struct YCrCb2RGB_i<uchar>
1856 {
1857     typedef uchar channel_type;
1858
1859     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
1860         : dstcn(_dstcn), blueIdx(_blueIdx)
1861     {
1862         static const int coeffs0[] = {22987, -11698, -5636, 29049};
1863         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1864
1865         v_c0 = vdupq_n_s32(coeffs[0]);
1866         v_c1 = vdupq_n_s32(coeffs[1]);
1867         v_c2 = vdupq_n_s32(coeffs[2]);
1868         v_c3 = vdupq_n_s32(coeffs[3]);
1869         v_delta = vdup_n_s16(ColorChannel<uchar>::half());
1870         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
1871         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
1872     }
1873
1874     void operator()(const uchar* src, uchar* dst, int n) const
1875     {
1876         int dcn = dstcn, bidx = blueIdx, i = 0;
1877         const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
1878         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
1879         n *= 3;
1880
1881         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
1882         {
1883             uint8x8x3_t v_src = vld3_u8(src + i);
1884             int16x8x3_t v_src16;
1885             v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
1886             v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
1887             v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
1888
1889             int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
1890                       v_Cr = vget_low_s16(v_src16.val[1]),
1891                       v_Cb = vget_low_s16(v_src16.val[2]);
1892
1893             int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
1894             v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
1895             int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
1896             v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
1897             int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
1898             v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
1899
1900             v_Y = vget_high_s16(v_src16.val[0]);
1901             v_Cr = vget_high_s16(v_src16.val[1]);
1902             v_Cb = vget_high_s16(v_src16.val[2]);
1903
1904             int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
1905             v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
1906             int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
1907             v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
1908             int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
1909             v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
1910
1911             uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1)));
1912             uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1)));
1913             uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1)));
1914
1915             if (dcn == 3)
1916             {
1917                 uint8x8x3_t v_dst;
1918                 v_dst.val[bidx] = v_b;
1919                 v_dst.val[1] = v_g;
1920                 v_dst.val[bidx^2] = v_r;
1921                 vst3_u8(dst, v_dst);
1922             }
1923             else
1924             {
1925                 uint8x8x4_t v_dst;
1926                 v_dst.val[bidx] = v_b;
1927                 v_dst.val[1] = v_g;
1928                 v_dst.val[bidx^2] = v_r;
1929                 v_dst.val[3] = v_alpha;
1930                 vst4_u8(dst, v_dst);
1931             }
1932         }
1933
1934         for ( ; i < n; i += 3, dst += dcn)
1935         {
1936             uchar Y = src[i];
1937             uchar Cr = src[i+1];
1938             uchar Cb = src[i+2];
1939
1940             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
1941             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
1942             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
1943
1944             dst[bidx] = saturate_cast<uchar>(b);
1945             dst[1] = saturate_cast<uchar>(g);
1946             dst[bidx^2] = saturate_cast<uchar>(r);
1947             if( dcn == 4 )
1948                 dst[3] = alpha;
1949         }
1950     }
1951     int dstcn, blueIdx;
1952     int coeffs[4];
1953
1954     int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2;
1955     int16x4_t v_delta;
1956     uint8x8_t v_alpha;
1957 };
1958
1959 template <>
1960 struct YCrCb2RGB_i<ushort>
1961 {
         // NEON specialization of the fixed-point YCrCb -> RGB converter for ushort
         // channels. operator() converts 8 pixels, then 4 pixels, per vector
         // iteration and finishes the remainder with a scalar loop.
1962     typedef ushort channel_type;
1963
         // _dstcn:   channels per destination pixel; 3 selects the vst3 stores, any
         //           other value the vst4 stores with a constant fourth channel.
         // _blueIdx: destination index of the blue channel; it is used as val[bidx]
         //           and val[bidx^2], so it is expected to be 0 or 2.
         // _coeffs:  four fixed-point coefficients; NULL selects the built-in defaults.
1964     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
1965         : dstcn(_dstcn), blueIdx(_blueIdx)
1966     {
1967         static const int coeffs0[] = {22987, -11698, -5636, 29049};
1968         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1969
1970         v_c0 = vdupq_n_s32(coeffs[0]);
1971         v_c1 = vdupq_n_s32(coeffs[1]);
1972         v_c2 = vdupq_n_s32(coeffs[2]);
1973         v_c3 = vdupq_n_s32(coeffs[3]);
             // Chroma offset (unscaled) and the rounding term for the descaling shift.
1974         v_delta = vdupq_n_s32(ColorChannel<ushort>::half());
1975         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
             // Alpha fill for the 8-lane stores; v_alpha2 is its 4-lane half.
1976         v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
1977         v_alpha2 = vget_low_u16(v_alpha);
1978     }
1979
         // Converts n pixels of Y, Cr, Cb triples in src into dstcn-channel pixels in dst.
1980     void operator()(const ushort* src, ushort* dst, int n) const
1981     {
1982         int dcn = dstcn, bidx = blueIdx, i = 0;
1983         const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
1984         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
             // From here on n counts source elements (3 per pixel).
1985         n *= 3;
1986
             // 8 pixels per iteration: 24 source elements, dcn * 8 destination elements.
1987         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
1988         {
1989             uint16x8x3_t v_src = vld3q_u16(src + i);
1990
                 // Low four pixels, widened to 32-bit lanes.
1991             int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
1992                       v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
1993                       v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
1994
                 // b = Y + (((Cb - delta)*c3 + round) >> yuv_shift)
1995             int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
1996             v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
                 // g = Y + (((Cr - delta)*c1 + (Cb - delta)*c2 + round) >> yuv_shift)
1997             int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
1998             v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
                 // r = Y + (((Cr - delta)*c0 + round) >> yuv_shift)
1999             int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
2000             v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
2001
                 // High four pixels. The three assignments are chained with the comma
                 // operator into a single statement.
2002             v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))),
2003             v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))),
2004             v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
2005
2006             int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
2007             v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
2008             int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
2009             v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
2010             int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
2011             v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
2012
                 // Saturating narrow from signed 32-bit to unsigned 16-bit.
2013             uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1));
2014             uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1));
2015             uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1));
2016
2017             if (dcn == 3)
2018             {
2019                 uint16x8x3_t v_dst;
2020                 v_dst.val[bidx] = v_b;
2021                 v_dst.val[1] = v_g;
2022                 v_dst.val[bidx^2] = v_r;
2023                 vst3q_u16(dst, v_dst);
2024             }
2025             else
2026             {
2027                 uint16x8x4_t v_dst;
2028                 v_dst.val[bidx] = v_b;
2029                 v_dst.val[1] = v_g;
2030                 v_dst.val[bidx^2] = v_r;
2031                 v_dst.val[3] = v_alpha;
2032                 vst4q_u16(dst, v_dst);
2033             }
2034         }
2035
             // 4 pixels per iteration for what the 8-pixel loop left over.
2036         for ( ; i <= n - 12; i += 12, dst += dcn * 4)
2037         {
2038             uint16x4x3_t v_src = vld3_u16(src + i);
2039
2040             int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
2041                       v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
2042                       v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
2043
2044             int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
2045             v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y);
2046             int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
2047             v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y);
2048             int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0);
2049             v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y);
2050
2051             uint16x4_t v_bd = vqmovun_s32(v_b);
2052             uint16x4_t v_gd = vqmovun_s32(v_g);
2053             uint16x4_t v_rd = vqmovun_s32(v_r);
2054
2055             if (dcn == 3)
2056             {
2057                 uint16x4x3_t v_dst;
2058                 v_dst.val[bidx] = v_bd;
2059                 v_dst.val[1] = v_gd;
2060                 v_dst.val[bidx^2] = v_rd;
2061                 vst3_u16(dst, v_dst);
2062             }
2063             else
2064             {
2065                 uint16x4x4_t v_dst;
2066                 v_dst.val[bidx] = v_bd;
2067                 v_dst.val[1] = v_gd;
2068                 v_dst.val[bidx^2] = v_rd;
2069                 v_dst.val[3] = v_alpha2;
2070                 vst4_u16(dst, v_dst);
2071             }
2072         }
2073
             // Scalar tail: one pixel per iteration.
2074         for ( ; i < n; i += 3, dst += dcn)
2075         {
2076             ushort Y = src[i];
2077             ushort Cr = src[i+1];
2078             ushort Cb = src[i+2];
2079
2080             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
2081             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
2082             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
2083
2084             dst[bidx] = saturate_cast<ushort>(b);
2085             dst[1] = saturate_cast<ushort>(g);
2086             dst[bidx^2] = saturate_cast<ushort>(r);
2087             if( dcn == 4 )
2088                 dst[3] = alpha;
2089         }
2090     }
2091     int dstcn, blueIdx;
2092     int coeffs[4];
2093
2094     int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta;
2095     uint16x8_t v_alpha;
2096     uint16x4_t v_alpha2;
2097 };
2098
2099 #endif
2100
2101 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
2102
2103 static const float sRGB2XYZ_D65[] =
     // Default 3x3 coefficient matrix (row-major) used by the RGB2XYZ_f functors
     // when no custom coefficients are supplied. Each row yields one output
     // component; the columns weight src[0], src[1], src[2] in the order used when
     // blueIdx != 0 (presumably R, G, B -- the functors swap columns 0 and 2 when
     // blueIdx == 0).
2104 {
2105     0.412453f, 0.357580f, 0.180423f, // X
2106     0.212671f, 0.715160f, 0.072169f, // Y
2107     0.019334f, 0.119193f, 0.950227f  // Z
2108 };
2109
2110 static const float XYZ2sRGB_D65[] =
     // 3x3 coefficient matrix stored row-major, like sRGB2XYZ_D65.
     // NOTE(review): judging by the name this is the inverse (XYZ -> RGB) matrix;
     // its users lie outside this part of the file -- confirm there.
2111 {
2112     3.240479f, -1.53715f, -0.498535f,
2113     -0.969256f, 1.875991f, 0.041556f,
2114     0.055648f, -0.204043f, 1.057311f
2115 };
2116
2117 template<typename _Tp> struct RGB2XYZ_f
2118 {
2119     typedef _Tp channel_type;
2120
2121     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2122     {
2123         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
2124         if(blueIdx == 0)
2125         {
2126             std::swap(coeffs[0], coeffs[2]);
2127             std::swap(coeffs[3], coeffs[5]);
2128             std::swap(coeffs[6], coeffs[8]);
2129         }
2130     }
2131     void operator()(const _Tp* src, _Tp* dst, int n) const
2132     {
2133         int scn = srccn;
2134         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2135               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2136               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2137
2138         n *= 3;
2139         for(int i = 0; i < n; i += 3, src += scn)
2140         {
2141             _Tp X = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
2142             _Tp Y = saturate_cast<_Tp>(src[0]*C3 + src[1]*C4 + src[2]*C5);
2143             _Tp Z = saturate_cast<_Tp>(src[0]*C6 + src[1]*C7 + src[2]*C8);
2144             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
2145         }
2146     }
2147     int srccn;
2148     float coeffs[9];
2149 };
2150
2151 #if CV_NEON
2152
2153 template <>
2154 struct RGB2XYZ_f<float>
2155 {
         // NEON specialization of the float RGB -> XYZ converter. Four pixels are
         // converted per vector iteration; the remainder goes through a scalar loop.
2156     typedef float channel_type;
2157
         // _srccn:   interleaved channels per source pixel; 3 selects the vld3 path,
         //           any other value the vld4 path (the fourth channel is ignored).
         // blueIdx:  0 means blue is stored first, which swaps columns 0 and 2 of
         //           the coefficient matrix.
         // _coeffs:  nine row-major floats; NULL selects sRGB2XYZ_D65.
2158     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2159     {
2160         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
2161         if(blueIdx == 0)
2162         {
2163             std::swap(coeffs[0], coeffs[2]);
2164             std::swap(coeffs[3], coeffs[5]);
2165             std::swap(coeffs[6], coeffs[8]);
2166         }
2167
             // Lane-broadcast copy of every matrix entry.
2168         v_c0 = vdupq_n_f32(coeffs[0]);
2169         v_c1 = vdupq_n_f32(coeffs[1]);
2170         v_c2 = vdupq_n_f32(coeffs[2]);
2171         v_c3 = vdupq_n_f32(coeffs[3]);
2172         v_c4 = vdupq_n_f32(coeffs[4]);
2173         v_c5 = vdupq_n_f32(coeffs[5]);
2174         v_c6 = vdupq_n_f32(coeffs[6]);
2175         v_c7 = vdupq_n_f32(coeffs[7]);
2176         v_c8 = vdupq_n_f32(coeffs[8]);
2177     }
2178
         // Converts n pixels from src (srccn channels each) to X, Y, Z triples in dst.
2179     void operator()(const float* src, float* dst, int n) const
2180     {
2181         int scn = srccn, i = 0;
2182         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2183               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2184               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2185
             // From here on n counts destination elements (3 per pixel).
2186         n *= 3;
2187
             // 3-channel input: 12 source floats per iteration.
2188         if (scn == 3)
2189             for ( ; i <= n - 12; i += 12, src += 12)
2190             {
2191                 float32x4x3_t v_src = vld3q_f32(src), v_dst;
                     // Each output plane is one matrix row applied to the three input planes.
2192                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
2193                 v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
2194                 v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
2195                 vst3q_f32(dst + i, v_dst);
2196             }
             // Otherwise: 16 source floats per iteration, fourth plane unused.
2197         else
2198             for ( ; i <= n - 12; i += 12, src += 16)
2199             {
2200                 float32x4x4_t v_src = vld4q_f32(src);
2201                 float32x4x3_t v_dst;
2202                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
2203                 v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
2204                 v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
2205                 vst3q_f32(dst + i, v_dst);
2206             }
2207
             // Scalar tail: one pixel per iteration.
2208         for ( ; i < n; i += 3, src += scn)
2209         {
2210             float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
2211             float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
2212             float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
2213             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
2214         }
2215     }
2216
2217     int srccn;
2218     float coeffs[9];
2219     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
2220 };
2221
2222 #endif
2223
2224 template<typename _Tp> struct RGB2XYZ_i
2225 {
2226     typedef _Tp channel_type;
2227
2228     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2229     {
2230         static const int coeffs0[] =
2231         {
2232             1689,    1465,    739,
2233             871,     2929,    296,
2234             79,      488,     3892
2235         };
2236         for( int i = 0; i < 9; i++ )
2237             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2238         if(blueIdx == 0)
2239         {
2240             std::swap(coeffs[0], coeffs[2]);
2241             std::swap(coeffs[3], coeffs[5]);
2242             std::swap(coeffs[6], coeffs[8]);
2243         }
2244     }
2245     void operator()(const _Tp* src, _Tp* dst, int n) const
2246     {
2247         int scn = srccn;
2248         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2249             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2250             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2251         n *= 3;
2252         for(int i = 0; i < n; i += 3, src += scn)
2253         {
2254             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
2255             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
2256             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
2257             dst[i] = saturate_cast<_Tp>(X); dst[i+1] = saturate_cast<_Tp>(Y);
2258             dst[i+2] = saturate_cast<_Tp>(Z);
2259         }
2260     }
2261     int srccn;
2262     int coeffs[9];
2263 };
2264
2265 #if CV_NEON
2266
2267 template <>
2268 struct RGB2XYZ_i<uchar>
2269 {
         // NEON specialization of the fixed-point RGB -> XYZ converter for 8-bit
         // channels. Eight pixels are converted per vector iteration using unsigned
         // arithmetic; the remainder goes through a scalar loop.
2270     typedef uchar channel_type;
2271
         // _srccn:   interleaved channels per source pixel; 3 selects the vld3 load,
         //           any other value the vld4 load (the fourth channel is ignored).
         // blueIdx:  0 means blue is stored first, which swaps columns 0 and 2 of
         //           the coefficient matrix.
         // _coeffs:  nine row-major floats converted to fixed point; NULL selects
         //           the built-in integer table.
2272     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2273     {
2274         static const int coeffs0[] =
2275         {
2276             1689,    1465,    739,
2277             871,     2929,    296,
2278             79,      488,     3892
2279         };
2280         for( int i = 0; i < 9; i++ )
2281             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2282         if(blueIdx == 0)
2283         {
2284             std::swap(coeffs[0], coeffs[2]);
2285             std::swap(coeffs[3], coeffs[5]);
2286             std::swap(coeffs[6], coeffs[8]);
2287         }
2288
             // NOTE(review): the int coefficients are narrowed to unsigned 16-bit
             // lanes here. A negative or >65535 value (possible with custom _coeffs)
             // would not survive the narrowing, while the scalar tail keeps using
             // the full int value -- confirm callers only pass coefficients that fit.
2289         v_c0 = vdup_n_u16(coeffs[0]);
2290         v_c1 = vdup_n_u16(coeffs[1]);
2291         v_c2 = vdup_n_u16(coeffs[2]);
2292         v_c3 = vdup_n_u16(coeffs[3]);
2293         v_c4 = vdup_n_u16(coeffs[4]);
2294         v_c5 = vdup_n_u16(coeffs[5]);
2295         v_c6 = vdup_n_u16(coeffs[6]);
2296         v_c7 = vdup_n_u16(coeffs[7]);
2297         v_c8 = vdup_n_u16(coeffs[8]);
             // Rounding term (half of 2^xyz_shift) added before every right shift.
2298         v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
2299     }
         // Converts n pixels from src (srccn channels each) to X, Y, Z triples in dst.
2300     void operator()(const uchar * src, uchar * dst, int n) const
2301     {
2302         int scn = srccn, i = 0;
2303         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2304             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2305             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
             // From here on n counts destination elements (3 per pixel).
2306         n *= 3;
2307
             // 8 pixels per iteration: 24 destination bytes, scn * 8 source bytes.
2308         for ( ; i <= n - 24; i += 24, src += scn * 8)
2309         {
2310             uint8x8x3_t v_dst;
2311             uint16x8x3_t v_src16;
2312
                 // De-interleave and widen the first three channels to 16 bits.
2313             if (scn == 3)
2314             {
2315                 uint8x8x3_t v_src = vld3_u8(src);
2316                 v_src16.val[0] = vmovl_u8(v_src.val[0]);
2317                 v_src16.val[1] = vmovl_u8(v_src.val[1]);
2318                 v_src16.val[2] = vmovl_u8(v_src.val[2]);
2319             }
2320             else
2321             {
2322                 uint8x8x4_t v_src = vld4_u8(src);
2323                 v_src16.val[0] = vmovl_u8(v_src.val[0]);
2324                 v_src16.val[1] = vmovl_u8(v_src.val[1]);
2325                 v_src16.val[2] = vmovl_u8(v_src.val[2]);
2326             }
2327
                 // Low four pixels.
2328             uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]),
2329                        v_s1 = vget_low_u16(v_src16.val[1]),
2330                        v_s2 = vget_low_u16(v_src16.val[2]);
2331
                 // Widening multiply-accumulate of one matrix row per output, then
                 // (sum + round) >> xyz_shift.
2332             uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2333             uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2334             uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2335             v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
2336             v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
2337             v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
2338
                 // High four pixels. The three assignments are chained with the comma
                 // operator into a single statement.
2339             v_s0 = vget_high_u16(v_src16.val[0]),
2340             v_s1 = vget_high_u16(v_src16.val[1]),
2341             v_s2 = vget_high_u16(v_src16.val[2]);
2342
2343             uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2344             uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2345             uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2346             v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
2347             v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
2348             v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
2349
                 // Narrow 32 -> 16 bits by truncation (vmovn_u32), then 16 -> 8 bits
                 // with saturation (vqmovn_u16).
2350             v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
2351             v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
2352             v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));
2353
2354             vst3_u8(dst + i, v_dst);
2355         }
2356
             // Scalar tail: one pixel per iteration.
2357         for ( ; i < n; i += 3, src += scn)
2358         {
2359             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
2360             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
2361             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
2362             dst[i] = saturate_cast<uchar>(X);
2363             dst[i+1] = saturate_cast<uchar>(Y);
2364             dst[i+2] = saturate_cast<uchar>(Z);
2365         }
2366     }
2367
2368     int srccn, coeffs[9];
2369     uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
2370     uint32x4_t v_delta;
2371 };
2372
2373 template <>
2374 struct RGB2XYZ_i<ushort>
2375 {
2376     typedef ushort channel_type;
2377
2378     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2379     {
2380         static const int coeffs0[] =
2381         {
2382             1689,    1465,    739,
2383             871,     2929,    296,
2384             79,      488,     3892
2385         };
2386         for( int i = 0; i < 9; i++ )
2387             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2388         if(blueIdx == 0)
2389         {
2390             std::swap(coeffs[0], coeffs[2]);
2391             std::swap(coeffs[3], coeffs[5]);
2392             std::swap(coeffs[6], coeffs[8]);
2393         }
2394
2395         v_c0 = vdup_n_u16(coeffs[0]);
2396         v_c1 = vdup_n_u16(coeffs[1]);
2397         v_c2 = vdup_n_u16(coeffs[2]);
2398         v_c3 = vdup_n_u16(coeffs[3]);
2399         v_c4 = vdup_n_u16(coeffs[4]);
2400         v_c5 = vdup_n_u16(coeffs[5]);
2401         v_c6 = vdup_n_u16(coeffs[6]);
2402         v_c7 = vdup_n_u16(coeffs[7]);
2403         v_c8 = vdup_n_u16(coeffs[8]);
2404         v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
2405     }
2406
2407     void operator()(const ushort * src, ushort * dst, int n) const
2408     {
2409         int scn = srccn, i = 0;
2410         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2411             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2412             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2413         n *= 3;
2414
2415         for ( ; i <= n - 24; i += 24, src += scn * 8)
2416         {
2417             uint16x8x3_t v_src, v_dst;
2418
2419             if (scn == 3)
2420                 v_src = vld3q_u16(src);
2421             else
2422             {
2423                 uint16x8x4_t v_src4 = vld4q_u16(src);
2424                 v_src.val[0] = v_src4.val[0];
2425                 v_src.val[1] = v_src4.val[1];
2426                 v_src.val[2] = v_src4.val[2];
2427             }
2428
2429             uint16x4_t v_s0 = vget_low_u16(v_src.val[0]),
2430                        v_s1 = vget_low_u16(v_src.val[1]),
2431                        v_s2 = vget_low_u16(v_src.val[2]);
2432
2433             uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2434             uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2435             uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2436             v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
2437             v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
2438             v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
2439
2440             v_s0 = vget_high_u16(v_src.val[0]),
2441             v_s1 = vget_high_u16(v_src.val[1]),
2442             v_s2 = vget_high_u16(v_src.val[2]);
2443
2444             uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2445             uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2446             uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2447             v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
2448             v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
2449             v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
2450
2451             v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
2452             v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
2453             v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));
2454
2455             vst3q_u16(dst + i, v_dst);
2456         }
2457
2458         for ( ; i <= n - 12; i += 12, src += scn * 4)
2459         {
2460             uint16x4x3_t v_dst;
2461             uint16x4_t v_s0, v_s1, v_s2;
2462
2463             if (scn == 3)
2464             {
2465                 uint16x4x3_t v_src = vld3_u16(src);
2466                 v_s0 = v_src.val[0];
2467                 v_s1 = v_src.val[1];
2468                 v_s2 = v_src.val[2];
2469             }
2470             else
2471             {
2472                 uint16x4x4_t v_src = vld4_u16(src);
2473                 v_s0 = v_src.val[0];
2474                 v_s1 = v_src.val[1];
2475                 v_s2 = v_src.val[2];
2476             }
2477
2478             uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2479             uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2480             uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2481
2482             v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift));
2483             v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift));
2484             v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift));
2485
2486             vst3_u16(dst + i, v_dst);
2487         }
2488
2489         for ( ; i < n; i += 3, src += scn)
2490         {
2491             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
2492             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
2493             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
2494             dst[i] = saturate_cast<ushort>(X);
2495             dst[i+1] = saturate_cast<ushort>(Y);
2496             dst[i+2] = saturate_cast<ushort>(Z);
2497         }
2498     }
2499
2500     int srccn, coeffs[9];
2501     uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
2502     uint32x4_t v_delta;
2503 };
2504
2505 #endif
2506
2507 template<typename _Tp> struct XYZ2RGB_f
2508 {
2509     typedef _Tp channel_type;
2510
2511     XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
2512     : dstcn(_dstcn), blueIdx(_blueIdx)
2513     {
2514         memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
2515         if(blueIdx == 0)
2516         {
2517             std::swap(coeffs[0], coeffs[6]);
2518             std::swap(coeffs[1], coeffs[7]);
2519             std::swap(coeffs[2], coeffs[8]);
2520         }
2521     }
2522
2523     void operator()(const _Tp* src, _Tp* dst, int n) const
2524     {
2525         int dcn = dstcn;
2526         _Tp alpha = ColorChannel<_Tp>::max();
2527         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2528               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2529               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2530         n *= 3;
2531         for(int i = 0; i < n; i += 3, dst += dcn)
2532         {
2533             _Tp B = saturate_cast<_Tp>(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2);
2534             _Tp G = saturate_cast<_Tp>(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5);
2535             _Tp R = saturate_cast<_Tp>(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8);
2536             dst[0] = B; dst[1] = G; dst[2] = R;
2537             if( dcn == 4 )
2538                 dst[3] = alpha;
2539         }
2540     }
2541     int dstcn, blueIdx;
2542     float coeffs[9];
2543 };
2544
2545
2546 template<typename _Tp> struct XYZ2RGB_i
2547 {
2548     typedef _Tp channel_type;
2549
2550     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2551     : dstcn(_dstcn), blueIdx(_blueIdx)
2552     {
2553         static const int coeffs0[] =
2554         {
2555             13273,  -6296,  -2042,
2556             -3970,   7684,    170,
2557               228,   -836,   4331
2558         };
2559         for(int i = 0; i < 9; i++)
2560             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2561
2562         if(blueIdx == 0)
2563         {
2564             std::swap(coeffs[0], coeffs[6]);
2565             std::swap(coeffs[1], coeffs[7]);
2566             std::swap(coeffs[2], coeffs[8]);
2567         }
2568     }
2569     void operator()(const _Tp* src, _Tp* dst, int n) const
2570     {
2571         int dcn = dstcn;
2572         _Tp alpha = ColorChannel<_Tp>::max();
2573         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2574             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2575             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2576         n *= 3;
2577         for(int i = 0; i < n; i += 3, dst += dcn)
2578         {
2579             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
2580             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
2581             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
2582             dst[0] = saturate_cast<_Tp>(B); dst[1] = saturate_cast<_Tp>(G);
2583             dst[2] = saturate_cast<_Tp>(R);
2584             if( dcn == 4 )
2585                 dst[3] = alpha;
2586         }
2587     }
2588     int dstcn, blueIdx;
2589     int coeffs[9];
2590 };
2591
2592 #if CV_NEON
2593
2594 template <>
2595 struct XYZ2RGB_i<uchar>
2596 {
2597     typedef uchar channel_type;
2598
2599     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2600     : dstcn(_dstcn), blueIdx(_blueIdx)
2601     {
2602         static const int coeffs0[] =
2603         {
2604             13273,  -6296,  -2042,
2605             -3970,   7684,    170,
2606               228,   -836,   4331
2607         };
2608         for(int i = 0; i < 9; i++)
2609             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2610
2611         if(blueIdx == 0)
2612         {
2613             std::swap(coeffs[0], coeffs[6]);
2614             std::swap(coeffs[1], coeffs[7]);
2615             std::swap(coeffs[2], coeffs[8]);
2616         }
2617
2618         v_c0 = vdup_n_s16(coeffs[0]);
2619         v_c1 = vdup_n_s16(coeffs[1]);
2620         v_c2 = vdup_n_s16(coeffs[2]);
2621         v_c3 = vdup_n_s16(coeffs[3]);
2622         v_c4 = vdup_n_s16(coeffs[4]);
2623         v_c5 = vdup_n_s16(coeffs[5]);
2624         v_c6 = vdup_n_s16(coeffs[6]);
2625         v_c7 = vdup_n_s16(coeffs[7]);
2626         v_c8 = vdup_n_s16(coeffs[8]);
2627         v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
2628         v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max()));
2629     }
2630
2631     void operator()(const uchar* src, uchar* dst, int n) const
2632     {
2633         int dcn = dstcn, i = 0;
2634         uchar alpha = ColorChannel<uchar>::max();
2635         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2636             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2637             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2638         n *= 3;
2639
2640         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
2641         {
2642             uint8x8x3_t v_src = vld3_u8(src + i);
2643             int16x8x3_t v_src16;
2644             v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
2645             v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
2646             v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
2647
2648             int16x4_t v_s0 = vget_low_s16(v_src16.val[0]),
2649                        v_s1 = vget_low_s16(v_src16.val[1]),
2650                        v_s2 = vget_low_s16(v_src16.val[2]);
2651
2652             int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2653             int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2654             int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2655             v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
2656             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
2657             v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
2658
2659             v_s0 = vget_high_s16(v_src16.val[0]),
2660             v_s1 = vget_high_s16(v_src16.val[1]),
2661             v_s2 = vget_high_s16(v_src16.val[2]);
2662
2663             int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2664             int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2665             int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2666             v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
2667             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
2668             v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
2669
2670             uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1)));
2671             uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
2672             uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1)));
2673
2674             if (dcn == 3)
2675             {
2676                 uint8x8x3_t v_dst;
2677                 v_dst.val[0] = v_b;
2678                 v_dst.val[1] = v_g;
2679                 v_dst.val[2] = v_r;
2680                 vst3_u8(dst, v_dst);
2681             }
2682             else
2683             {
2684                 uint8x8x4_t v_dst;
2685                 v_dst.val[0] = v_b;
2686                 v_dst.val[1] = v_g;
2687                 v_dst.val[2] = v_r;
2688                 v_dst.val[3] = v_alpha;
2689                 vst4_u8(dst, v_dst);
2690             }
2691         }
2692
2693         for ( ; i < n; i += 3, dst += dcn)
2694         {
2695             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
2696             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
2697             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
2698             dst[0] = saturate_cast<uchar>(B); dst[1] = saturate_cast<uchar>(G);
2699             dst[2] = saturate_cast<uchar>(R);
2700             if( dcn == 4 )
2701                 dst[3] = alpha;
2702         }
2703     }
2704     int dstcn, blueIdx;
2705     int coeffs[9];
2706
2707     int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
2708     uint8x8_t v_alpha;
2709     int32x4_t v_delta;
2710 };
2711
2712 template <>
2713 struct XYZ2RGB_i<ushort>
2714 {
2715     typedef ushort channel_type;
2716
2717     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2718     : dstcn(_dstcn), blueIdx(_blueIdx)
2719     {
2720         static const int coeffs0[] =
2721         {
2722             13273,  -6296,  -2042,
2723             -3970,   7684,    170,
2724               228,   -836,   4331
2725         };
2726         for(int i = 0; i < 9; i++)
2727             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2728
2729         if(blueIdx == 0)
2730         {
2731             std::swap(coeffs[0], coeffs[6]);
2732             std::swap(coeffs[1], coeffs[7]);
2733             std::swap(coeffs[2], coeffs[8]);
2734         }
2735
2736         v_c0 = vdupq_n_s32(coeffs[0]);
2737         v_c1 = vdupq_n_s32(coeffs[1]);
2738         v_c2 = vdupq_n_s32(coeffs[2]);
2739         v_c3 = vdupq_n_s32(coeffs[3]);
2740         v_c4 = vdupq_n_s32(coeffs[4]);
2741         v_c5 = vdupq_n_s32(coeffs[5]);
2742         v_c6 = vdupq_n_s32(coeffs[6]);
2743         v_c7 = vdupq_n_s32(coeffs[7]);
2744         v_c8 = vdupq_n_s32(coeffs[8]);
2745         v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
2746         v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
2747         v_alpha2 = vget_low_u16(v_alpha);
2748     }
2749
2750     void operator()(const ushort* src, ushort* dst, int n) const
2751     {
2752         int dcn = dstcn, i = 0;
2753         ushort alpha = ColorChannel<ushort>::max();
2754         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2755             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2756             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2757         n *= 3;
2758
2759         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
2760         {
2761             uint16x8x3_t v_src = vld3q_u16(src + i);
2762             int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
2763                       v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
2764                       v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
2765
2766             int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2767             int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2768             int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2769             v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
2770             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
2771             v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
2772
2773             v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
2774             v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
2775             v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
2776
2777             int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2778             int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2779             int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2780             v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
2781             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
2782             v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
2783
2784             uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
2785             uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
2786             uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
2787
2788             if (dcn == 3)
2789             {
2790                 uint16x8x3_t v_dst;
2791                 v_dst.val[0] = v_b;
2792                 v_dst.val[1] = v_g;
2793                 v_dst.val[2] = v_r;
2794                 vst3q_u16(dst, v_dst);
2795             }
2796             else
2797             {
2798                 uint16x8x4_t v_dst;
2799                 v_dst.val[0] = v_b;
2800                 v_dst.val[1] = v_g;
2801                 v_dst.val[2] = v_r;
2802                 v_dst.val[3] = v_alpha;
2803                 vst4q_u16(dst, v_dst);
2804             }
2805         }
2806
2807         for ( ; i <= n - 12; i += 12, dst += dcn * 4)
2808         {
2809             uint16x4x3_t v_src = vld3_u16(src + i);
2810             int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
2811                       v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
2812                       v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
2813
2814             int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2815             int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2816             int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2817             v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
2818             v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
2819             v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
2820
2821             uint16x4_t v_b = vqmovun_s32(v_X);
2822             uint16x4_t v_g = vqmovun_s32(v_Y);
2823             uint16x4_t v_r = vqmovun_s32(v_Z);
2824
2825             if (dcn == 3)
2826             {
2827                 uint16x4x3_t v_dst;
2828                 v_dst.val[0] = v_b;
2829                 v_dst.val[1] = v_g;
2830                 v_dst.val[2] = v_r;
2831                 vst3_u16(dst, v_dst);
2832             }
2833             else
2834             {
2835                 uint16x4x4_t v_dst;
2836                 v_dst.val[0] = v_b;
2837                 v_dst.val[1] = v_g;
2838                 v_dst.val[2] = v_r;
2839                 v_dst.val[3] = v_alpha2;
2840                 vst4_u16(dst, v_dst);
2841             }
2842         }
2843
2844         for ( ; i < n; i += 3, dst += dcn)
2845         {
2846             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
2847             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
2848             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
2849             dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
2850             dst[2] = saturate_cast<ushort>(R);
2851             if( dcn == 4 )
2852                 dst[3] = alpha;
2853         }
2854     }
2855     int dstcn, blueIdx;
2856     int coeffs[9];
2857
2858     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
2859     uint16x4_t v_alpha2;
2860     uint16x8_t v_alpha;
2861 };
2862
2863 #endif
2864
2865 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
2866
2867
2868 struct RGB2HSV_b
2869 {
2870     typedef uchar channel_type;
2871
2872     RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
2873     : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
2874     {
2875         CV_Assert( hrange == 180 || hrange == 256 );
2876     }
2877
2878     void operator()(const uchar* src, uchar* dst, int n) const
2879     {
2880         int i, bidx = blueIdx, scn = srccn;
2881         const int hsv_shift = 12;
2882
2883         static int sdiv_table[256];
2884         static int hdiv_table180[256];
2885         static int hdiv_table256[256];
2886         static volatile bool initialized = false;
2887
2888         int hr = hrange;
2889         const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
2890         n *= 3;
2891
2892         if( !initialized )
2893         {
2894             sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
2895             for( i = 1; i < 256; i++ )
2896             {
2897                 sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
2898                 hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
2899                 hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
2900             }
2901             initialized = true;
2902         }
2903
2904         for( i = 0; i < n; i += 3, src += scn )
2905         {
2906             int b = src[bidx], g = src[1], r = src[bidx^2];
2907             int h, s, v = b;
2908             int vmin = b, diff;
2909             int vr, vg;
2910
2911             CV_CALC_MAX_8U( v, g );
2912             CV_CALC_MAX_8U( v, r );
2913             CV_CALC_MIN_8U( vmin, g );
2914             CV_CALC_MIN_8U( vmin, r );
2915
2916             diff = v - vmin;
2917             vr = v == r ? -1 : 0;
2918             vg = v == g ? -1 : 0;
2919
2920             s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
2921             h = (vr & (g - b)) +
2922                 (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
2923             h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
2924             h += h < 0 ? hr : 0;
2925
2926             dst[i] = saturate_cast<uchar>(h);
2927             dst[i+1] = (uchar)s;
2928             dst[i+2] = (uchar)v;
2929         }
2930     }
2931
2932     int srccn, blueIdx, hrange;
2933 };
2934
2935
// Floating-point RGB -> HSV converter. The hue is computed in degrees and then
// multiplied by hrange/360.
struct RGB2HSV_f
{
    typedef float channel_type;

    // _srccn   - number of interleaved channels per source pixel
    // _blueIdx - index (0 or 2) of the blue channel inside a source pixel
    // _hrange  - value the full hue circle is mapped to
    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    // Converts n pixels: reads srccn values per pixel from src, writes (h, s, v) triples to dst.
    void operator()(const float* src, float* dst, int n) const
    {
        const int bidx = blueIdx, scn = srccn;
        const float hscale = hrange*(1.f/360.f);
        const int total = n*3;

        for( int i = 0; i < total; i += 3, src += scn )
        {
            const float b = src[bidx], g = src[1], r = src[bidx^2];

            // largest and smallest of the three channels
            float vmax = r, vmin = r;
            if( vmax < g ) vmax = g;
            if( vmax < b ) vmax = b;
            if( vmin > g ) vmin = g;
            if( vmin > b ) vmin = b;

            const float span = vmax - vmin;
            // FLT_EPSILON keeps both divisions defined when the denominator would be zero
            const float s = span/(float)(fabs(vmax) + FLT_EPSILON);
            const float k = (float)(60./(span + FLT_EPSILON));

            float h;
            if( vmax == r )
                h = (g - b)*k;
            else if( vmax == g )
                h = (b - r)*k + 120.f;
            else
                h = (r - g)*k + 240.f;

            if( h < 0 )
                h += 360.f;

            dst[i] = h*hscale;
            dst[i+1] = s;
            dst[i+2] = vmax;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
2983
2984
2985 struct HSV2RGB_f
2986 {
2987     typedef float channel_type;
2988
2989     HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
2990     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
2991
2992     void operator()(const float* src, float* dst, int n) const
2993     {
2994         int i, bidx = blueIdx, dcn = dstcn;
2995         float _hscale = hscale;
2996         float alpha = ColorChannel<float>::max();
2997         n *= 3;
2998
2999         for( i = 0; i < n; i += 3, dst += dcn )
3000         {
3001             float h = src[i], s = src[i+1], v = src[i+2];
3002             float b, g, r;
3003
3004             if( s == 0 )
3005                 b = g = r = v;
3006             else
3007             {
3008                 static const int sector_data[][3]=
3009                     {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
3010                 float tab[4];
3011                 int sector;
3012                 h *= _hscale;
3013                 if( h < 0 )
3014                     do h += 6; while( h < 0 );
3015                 else if( h >= 6 )
3016                     do h -= 6; while( h >= 6 );
3017                 sector = cvFloor(h);
3018                 h -= sector;
3019                 if( (unsigned)sector >= 6u )
3020                 {
3021                     sector = 0;
3022                     h = 0.f;
3023                 }
3024
3025                 tab[0] = v;
3026                 tab[1] = v*(1.f - s);
3027                 tab[2] = v*(1.f - s*h);
3028                 tab[3] = v*(1.f - s*(1.f - h));
3029
3030                 b = tab[sector_data[sector][0]];
3031                 g = tab[sector_data[sector][1]];
3032                 r = tab[sector_data[sector][2]];
3033             }
3034
3035             dst[bidx] = b;
3036             dst[1] = g;
3037             dst[bidx^2] = r;
3038             if( dcn == 4 )
3039                 dst[3] = alpha;
3040         }
3041     }
3042
3043     int dstcn, blueIdx;
3044     float hscale;
3045 };
3046
3047
3048 struct HSV2RGB_b
3049 {
3050     typedef uchar channel_type;
3051
3052     HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
3053     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
3054     {
3055         #if CV_NEON
3056         v_scale_inv = vdupq_n_f32(1.f/255.f);
3057         v_scale = vdupq_n_f32(255.f);
3058         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
3059         #endif
3060     }
3061
3062     void operator()(const uchar* src, uchar* dst, int n) const
3063     {
3064         int i, j, dcn = dstcn;
3065         uchar alpha = ColorChannel<uchar>::max();
3066         float buf[3*BLOCK_SIZE];
3067
3068         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
3069         {
3070             int dn = std::min(n - i, (int)BLOCK_SIZE);
3071             j = 0;
3072
3073             #if CV_NEON
3074             for ( ; j <= (dn - 8) * 3; j += 24)
3075             {
3076                 uint8x8x3_t v_src = vld3_u8(src + j);
3077                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
3078                            v_t1 = vmovl_u8(v_src.val[1]),
3079                            v_t2 = vmovl_u8(v_src.val[2]);
3080
3081                 float32x4x3_t v_dst;
3082                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
3083                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
3084                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
3085                 vst3q_f32(buf + j, v_dst);
3086
3087                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
3088                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
3089                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
3090                 vst3q_f32(buf + j + 12, v_dst);
3091             }
3092             #endif
3093
3094             for( ; j < dn*3; j += 3 )
3095             {
3096                 buf[j] = src[j];
3097                 buf[j+1] = src[j+1]*(1.f/255.f);
3098                 buf[j+2] = src[j+2]*(1.f/255.f);
3099             }
3100             cvt(buf, buf, dn);
3101
3102             j = 0;
3103             #if CV_NEON
3104             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
3105             {
3106                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
3107                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
3108                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
3109                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
3110                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
3111                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
3112                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
3113
3114                 if (dcn == 4)
3115                 {
3116                     uint8x8x4_t v_dst;
3117                     v_dst.val[0] = v_dst0;
3118                     v_dst.val[1] = v_dst1;
3119                     v_dst.val[2] = v_dst2;
3120                     v_dst.val[3] = v_alpha;
3121                     vst4_u8(dst, v_dst);
3122                 }
3123                 else
3124                 {
3125                     uint8x8x3_t v_dst;
3126                     v_dst.val[0] = v_dst0;
3127                     v_dst.val[1] = v_dst1;
3128                     v_dst.val[2] = v_dst2;
3129                     vst3_u8(dst, v_dst);
3130                 }
3131             }
3132             #endif
3133
3134             for( ; j < dn*3; j += 3, dst += dcn )
3135             {
3136                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
3137                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
3138                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
3139                 if( dcn == 4 )
3140                     dst[3] = alpha;
3141             }
3142         }
3143     }
3144
3145     int dstcn;
3146     HSV2RGB_f cvt;
3147     #if CV_NEON
3148     float32x4_t v_scale, v_scale_inv;
3149     uint8x8_t v_alpha;
3150     #endif
3151 };
3152
3153
3154 ///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
3155
// Functor converting packed floating-point RGB/BGR pixels (srccn floats per
// pixel) into HLS triples stored as (H, L, S).
struct RGB2HLS_f
{
    typedef float channel_type;

    // _srccn   - number of floats per source pixel (pointer step between pixels)
    // _blueIdx - position of the blue channel in a source pixel; red is read
    //            from position _blueIdx^2, green always from position 1
    // _hrange  - value the full 360-degree hue circle is rescaled to
    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
        : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
    {
    }

    // Converts n pixels read from src into n (H, L, S) triples written to dst.
    // All three inputs of a pixel are read before its outputs are written.
    void operator()(const float* src, float* dst, int n) const
    {
        const int bluePos = blueIdx;
        const int redPos = blueIdx ^ 2;
        const int srcStep = srccn;
        const float hueScale = hrange*(1.f/360.f);

        for( int px = 0; px < n; ++px, src += srcStep, dst += 3 )
        {
            const float blue = src[bluePos];
            const float green = src[1];
            const float red = src[redPos];

            const float maxc = std::max(std::max(red, green), blue);
            const float minc = std::min(std::min(red, green), blue);

            const float spread = maxc - minc;
            const float lightness = (maxc + minc)*0.5f;

            float hue = 0.f;
            float saturation = 0.f;

            // Hue and saturation stay at zero for (nearly) achromatic pixels.
            if( spread > FLT_EPSILON )
            {
                if( lightness < 0.5f )
                    saturation = spread/(maxc + minc);
                else
                    saturation = spread/(2 - maxc - minc);

                const float sixtyOverSpread = 60.f/spread;

                // Hue in degrees, measured from whichever channel is the maximum.
                if( maxc == red )
                    hue = (green - blue)*sixtyOverSpread;
                else if( maxc == green )
                    hue = (blue - red)*sixtyOverSpread + 120.f;
                else
                    hue = (red - green)*sixtyOverSpread + 240.f;

                if( hue < 0.f )
                    hue += 360.f;
            }

            dst[0] = hue*hueScale;
            dst[1] = lightness;
            dst[2] = saturation;
        }
    }

    int srccn;
    int blueIdx;
    float hrange;
};
3208
3209
3210 struct RGB2HLS_b
3211 {
3212     typedef uchar channel_type;
3213
3214     RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
3215     : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange)
3216     {
3217         #if CV_NEON
3218         v_scale_inv = vdupq_n_f32(1.f/255.f);
3219         v_scale = vdupq_n_f32(255.f);
3220         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
3221         #endif
3222     }
3223
3224     void operator()(const uchar* src, uchar* dst, int n) const
3225     {
3226         int i, j, scn = srccn;
3227         float buf[3*BLOCK_SIZE];
3228
3229         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
3230         {
3231             int dn = std::min(n - i, (int)BLOCK_SIZE);
3232             j = 0;
3233
3234             #if CV_NEON
3235             for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
3236             {
3237                 uint16x8_t v_t0, v_t1, v_t2;
3238
3239                 if (scn == 3)
3240                 {
3241                     uint8x8x3_t v_src = vld3_u8(src);
3242                     v_t0 = vmovl_u8(v_src.val[0]);
3243                     v_t1 = vmovl_u8(v_src.val[1]);
3244                     v_t2 = vmovl_u8(v_src.val[2]);
3245                 }
3246                 else
3247                 {
3248                     uint8x8x4_t v_src = vld4_u8(src);
3249                     v_t0 = vmovl_u8(v_src.val[0]);
3250                     v_t1 = vmovl_u8(v_src.val[1]);
3251                     v_t2 = vmovl_u8(v_src.val[2]);
3252                 }
3253
3254                 float32x4x3_t v_dst;
3255                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
3256                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
3257                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
3258                 vst3q_f32(buf + j, v_dst);
3259
3260                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
3261                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
3262                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
3263                 vst3q_f32(buf + j + 12, v_dst);
3264             }
3265             #endif
3266             for( ; j < dn*3; j += 3, src += scn )
3267             {
3268                 buf[j] = src[0]*(1.f/255.f);
3269                 buf[j+1] = src[1]*(1.f/255.f);
3270                 buf[j+2] = src[2]*(1.f/255.f);
3271             }
3272             cvt(buf, buf, dn);
3273
3274             j = 0;
3275             #if CV_NEON
3276             for ( ; j <= (dn - 8) * 3; j += 24)
3277             {
3278                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
3279
3280                 uint8x8x3_t v_dst;
3281                 v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])),
3282                                                        vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0]))));
3283                 v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
3284                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
3285                 v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
3286                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
3287                 vst3_u8(dst + j, v_dst);
3288             }
3289             #endif
3290             for( ; j < dn*3; j += 3 )
3291             {
3292                 dst[j] = saturate_cast<uchar>(buf[j]);
3293                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
3294                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
3295             }
3296         }
3297     }
3298
3299     int srccn;
3300     RGB2HLS_f cvt;
3301     #if CV_NEON
3302     float32x4_t v_scale, v_scale_inv;
3303     uint8x8_t v_alpha;
3304     #endif
3305 };
3306
3307
3308 struct HLS2RGB_f
3309 {
3310     typedef float channel_type;
3311
3312     HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
3313     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
3314
3315     void operator()(const float* src, float* dst, int n) const
3316     {
3317         int i, bidx = blueIdx, dcn = dstcn;
3318         float _hscale = hscale;
3319         float alpha = ColorChannel<float>::max();
3320         n *= 3;
3321
3322         for( i = 0; i < n; i += 3, dst += dcn )
3323         {
3324             float h = src[i], l = src[i+1], s = src[i+2];
3325             float b, g, r;
3326
3327             if( s == 0 )
3328                 b = g = r = l;
3329             else
3330             {
3331                 static const int sector_data[][3]=
3332                 {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
3333                 float tab[4];
3334                 int sector;
3335
3336                 float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
3337                 float p1 = 2*l - p2;
3338
3339                 h *= _hscale;
3340                 if( h < 0 )
3341                     do h += 6; while( h < 0 );
3342                 else if( h >= 6 )
3343                     do h -= 6; while( h >= 6 );
3344
3345                 assert( 0 <= h && h < 6 );
3346                 sector = cvFloor(h);
3347                 h -= sector;
3348
3349                 tab[0] = p2;
3350                 tab[1] = p1;
3351                 tab[2] = p1 + (p2 - p1)*(1-h);
3352                 tab[3] = p1 + (p2 - p1)*h;
3353
3354                 b = tab[sector_data[sector][0]];
3355                 g = tab[sector_data[sector][1]];
3356                 r = tab[sector_data[sector][2]];
3357             }
3358
3359             dst[bidx] = b;
3360             dst[1] = g;
3361             dst[bidx^2] = r;
3362             if( dcn == 4 )
3363                 dst[3] = alpha;
3364         }
3365     }
3366
3367     int dstcn, blueIdx;
3368     float hscale;
3369 };
3370
3371
3372 struct HLS2RGB_b
3373 {
3374     typedef uchar channel_type;
3375
3376     HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
3377     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
3378     {
3379         #if CV_NEON
3380         v_scale_inv = vdupq_n_f32(1.f/255.f);
3381         v_scale = vdupq_n_f32(255.f);
3382         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
3383         #endif
3384     }
3385
3386     void operator()(const uchar* src, uchar* dst, int n) const
3387     {
3388         int i, j, dcn = dstcn;
3389         uchar alpha = ColorChannel<uchar>::max();
3390         float buf[3*BLOCK_SIZE];
3391
3392         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
3393         {
3394             int dn = std::min(n - i, (int)BLOCK_SIZE);
3395             j = 0;
3396
3397             #if CV_NEON
3398             for ( ; j <= (dn - 8) * 3; j += 24)
3399             {
3400                 uint8x8x3_t v_src = vld3_u8(src + j);
3401                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
3402                            v_t1 = vmovl_u8(v_src.val[1]),
3403                            v_t2 = vmovl_u8(v_src.val[2]);
3404
3405                 float32x4x3_t v_dst;
3406                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
3407                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
3408                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
3409                 vst3q_f32(buf + j, v_dst);
3410
3411                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
3412                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
3413                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
3414                 vst3q_f32(buf + j + 12, v_dst);
3415             }
3416             #endif
3417             for( ; j < dn*3; j += 3 )
3418             {
3419                 buf[j] = src[j];
3420                 buf[j+1] = src[j+1]*(1.f/255.f);
3421                 buf[j+2] = src[j+2]*(1.f/255.f);
3422             }
3423             cvt(buf, buf, dn);
3424
3425             j = 0;
3426             #if CV_NEON
3427             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
3428             {
3429                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
3430                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
3431                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
3432                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
3433                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
3434                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
3435                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
3436
3437                 if (dcn == 4)
3438                 {
3439                     uint8x8x4_t v_dst;
3440                     v_dst.val[0] = v_dst0;
3441                     v_dst.val[1] = v_dst1;
3442                     v_dst.val[2] = v_dst2;
3443                     v_dst.val[3] = v_alpha;
3444                     vst4_u8(dst, v_dst);
3445                 }
3446                 else
3447                 {
3448                     uint8x8x3_t v_dst;
3449                     v_dst.val[0] = v_dst0;
3450                     v_dst.val[1] = v_dst1;
3451                     v_dst.val[2] = v_dst2;
3452                     vst3_u8(dst, v_dst);
3453                 }
3454             }
3455             #endif
3456             for( ; j < dn*3; j += 3, dst += dcn )
3457             {
3458                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
3459                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
3460                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
3461                 if( dcn == 4 )
3462                     dst[3] = alpha;
3463             }
3464         }
3465     }
3466
3467     int dstcn;
3468     HLS2RGB_f cvt;
3469     #if CV_NEON
3470     float32x4_t v_scale, v_scale_inv;
3471     uint8x8_t v_alpha;
3472     #endif
3473 };
3474
3475
3476 ///////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////
3477
3478 static const float D65[] = { 0.950456f, 1.f, 1.088754f };
3479
3480 enum { LAB_CBRT_TAB_SIZE = 1024, GAMMA_TAB_SIZE = 1024 };
3481 static float LabCbrtTab[LAB_CBRT_TAB_SIZE*4];
3482 static const float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
3483
3484 static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];
3485 static const float GammaTabScale = (float)GAMMA_TAB_SIZE;
3486
3487 static ushort sRGBGammaTab_b[256], linearGammaTab_b[256];
3488 #undef lab_shift
3489 #define lab_shift xyz_shift
3490 #define gamma_shift 3
3491 #define lab_shift2 (lab_shift + gamma_shift)
3492 #define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
3493 static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
3494
3495 static void initLabTabs()
3496 {
3497     static bool initialized = false;
3498     if(!initialized)
3499     {
3500         float f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1], scale = 1.f/LabCbrtTabScale;
3501         int i;
3502         for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)
3503         {
3504             float x = i*scale;
3505             f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x);
3506         }
3507         splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab);
3508
3509         scale = 1.f/GammaTabScale;
3510         for(i = 0; i <= GAMMA_TAB_SIZE; i++)
3511         {
3512             float x = i*scale;
3513             g[i] = x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4);
3514             ig[i] = x <= 0.0031308 ? x*12.92f : (float)(1.055*std::pow((double)x, 1./2.4) - 0.055);
3515         }
3516         splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab);
3517         splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab);
3518
3519         for(i = 0; i < 256; i++)
3520         {
3521             float x = i*(1.f/255.f);
3522             sRGBGammaTab_b[i] = saturate_cast<ushort>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4)));
3523             linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
3524         }
3525
3526         for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
3527         {
3528             float x = i*(1.f/(255.f*(1 << gamma_shift)));
3529             LabCbrtTab_b[i] = saturate_cast<ushort>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
3530         }
3531         initialized = true;
3532     }
3533 }
3534
3535 struct RGB2Lab_b
3536 {
3537     typedef uchar channel_type;
3538
3539     RGB2Lab_b(int _srccn, int blueIdx, const float* _coeffs,
3540               const float* _whitept, bool _srgb)
3541     : srccn(_srccn), srgb(_srgb)
3542     {
3543         static volatile int _3 = 3;
3544         initLabTabs();
3545
3546         if (!_coeffs)
3547             _coeffs = sRGB2XYZ_D65;
3548         if (!_whitept)
3549             _whitept = D65;
3550
3551         float scale[] =
3552         {
3553             (1 << lab_shift)/_whitept[0],
3554             (float)(1 << lab_shift),
3555             (1 << lab_shift)/_whitept[2]
3556         };
3557
3558         for( int i = 0; i < _3; i++ )
3559         {
3560             coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]);
3561             coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
3562             coeffs[i*3+blueIdx] = cvRound(_coeffs[i*3+2]*scale[i]);
3563
3564             CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
3565                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
3566         }
3567     }
3568
3569     void operator()(const uchar* src, uchar* dst, int n) const
3570     {
3571         const int Lscale = (116*255+50)/100;
3572         const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
3573         const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
3574         int i, scn = srccn;
3575         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3576             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3577             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3578         n *= 3;
3579
3580         for( i = 0; i < n; i += 3, src += scn )
3581         {
3582             int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
3583             int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
3584             int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
3585             int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
3586
3587             int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
3588             int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
3589             int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
3590
3591             dst[i] = saturate_cast<uchar>(L);
3592             dst[i+1] = saturate_cast<uchar>(a);
3593             dst[i+2] = saturate_cast<uchar>(b);
3594         }
3595     }
3596
3597     int srccn;
3598     int coeffs[9];
3599     bool srgb;
3600 };
3601
3602
// Clamps a floating-point expression to [0, 1]. The expansion is fully
// parenthesized and carries no trailing semicolon, so it can be used inside a
// larger expression. The argument is evaluated more than once: pass only
// side-effect-free expressions.
#define clip(value) \
    ((value) < 0.0f ? 0.0f : (value) > 1.0f ? 1.0f : (value))
3605
3606 struct RGB2Lab_f
3607 {
3608     typedef float channel_type;
3609
3610     RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,
3611               const float* _whitept, bool _srgb)
3612     : srccn(_srccn), srgb(_srgb)
3613     {
3614         volatile int _3 = 3;
3615         initLabTabs();
3616
3617         if (!_coeffs)
3618             _coeffs = sRGB2XYZ_D65;
3619         if (!_whitept)
3620             _whitept = D65;
3621
3622         float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
3623
3624         for( int i = 0; i < _3; i++ )
3625         {
3626             int j = i * 3;
3627             coeffs[j + (blueIdx ^ 2)] = _coeffs[j] * scale[i];
3628             coeffs[j + 1] = _coeffs[j + 1] * scale[i];
3629             coeffs[j + blueIdx] = _coeffs[j + 2] * scale[i];
3630
3631             CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
3632                        coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*LabCbrtTabScale );
3633         }
3634     }
3635
3636     void operator()(const float* src, float* dst, int n) const
3637     {
3638         int i, scn = srccn;
3639         float gscale = GammaTabScale;
3640         const float* gammaTab = srgb ? sRGBGammaTab : 0;
3641         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3642               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3643               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3644         n *= 3;
3645
3646         static const float _1_3 = 1.0f / 3.0f;
3647         static const float _a = 16.0f / 116.0f;
3648         for (i = 0; i < n; i += 3, src += scn )
3649         {
3650             float R = clip(src[0]);
3651             float G = clip(src[1]);
3652             float B = clip(src[2]);
3653
3654             if (gammaTab)
3655             {
3656                 R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
3657                 G = splineInterpolate(G * gscale, gammaTab, GAMMA_TAB_SIZE);
3658                 B = splineInterpolate(B * gscale, gammaTab, GAMMA_TAB_SIZE);
3659             }
3660             float X = R*C0 + G*C1 + B*C2;
3661             float Y = R*C3 + G*C4 + B*C5;
3662             float Z = R*C6 + G*C7 + B*C8;
3663
3664             float FX = X > 0.008856f ? std::pow(X, _1_3) : (7.787f * X + _a);
3665             float FY = Y > 0.008856f ? std::pow(Y, _1_3) : (7.787f * Y + _a);
3666             float FZ = Z > 0.008856f ? std::pow(Z, _1_3) : (7.787f * Z + _a);
3667
3668             float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
3669             float a = 500.f * (FX - FY);
3670             float b = 200.f * (FY - FZ);
3671
3672             dst[i] = L;
3673             dst[i + 1] = a;
3674             dst[i + 2] = b;
3675         }
3676     }
3677
3678     int srccn;
3679     float coeffs[9];
3680     bool srgb;
3681 };
3682
3683 struct Lab2RGB_f
3684 {
3685     typedef float channel_type;
3686
3687     Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
3688               const float* _whitept, bool _srgb )
3689     : dstcn(_dstcn), srgb(_srgb)
3690     {
3691         initLabTabs();
3692
3693         if(!_coeffs)
3694             _coeffs = XYZ2sRGB_D65;
3695         if(!_whitept)
3696             _whitept = D65;
3697
3698         for( int i = 0; i < 3; i++ )
3699         {
3700             coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i];
3701             coeffs[i+3] = _coeffs[i+3]*_whitept[i];
3702             coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];
3703         }
3704     }
3705
3706     void operator()(const float* src, float* dst, int n) const
3707     {
3708         int i, dcn = dstcn;
3709         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
3710         float gscale = GammaTabScale;
3711         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3712         C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3713         C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3714         float alpha = ColorChannel<float>::max();
3715         n *= 3;
3716
3717         static const float lThresh = 0.008856f * 903.3f;
3718         static const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
3719         for (i = 0; i < n; i += 3, dst += dcn)
3720         {
3721             float li = src[i];
3722             float ai = src[i + 1];
3723             float bi = src[i + 2];
3724
3725             float y, fy;
3726             if (li <= lThresh)
3727             {
3728                 y = li / 903.3f;
3729                 fy = 7.787f * y + 16.0f / 116.0f;
3730             }
3731             else
3732             {
3733                 fy = (li + 16.0f) / 116.0f;
3734                 y = fy * fy * fy;
3735             }
3736
3737             float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
3738
3739             for (int j = 0; j < 2; j++)
3740                 if (fxz[j] <= fThresh)
3741                     fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
3742                 else
3743                     fxz[j] = fxz[j] * fxz[j] * fxz[j];
3744
3745
3746             float x = fxz[0], z = fxz[1];
3747             float ro = C0 * x + C1 * y + C2 * z;
3748             float go = C3 * x + C4 * y + C5 * z;
3749             float bo = C6 * x + C7 * y + C8 * z;
3750             ro = clip(ro);
3751             go = clip(go);
3752             bo = clip(bo);
3753
3754             if (gammaTab)
3755             {
3756                 ro = splineInterpolate(ro * gscale, gammaTab, GAMMA_TAB_SIZE);
3757                 go = splineInterpolate(go * gscale, gammaTab, GAMMA_TAB_SIZE);
3758                 bo = splineInterpolate(bo * gscale, gammaTab, GAMMA_TAB_SIZE);
3759             }
3760
3761             dst[0] = ro, dst[1] = go, dst[2] = bo;
3762             if( dcn == 4 )
3763                 dst[3] = alpha;
3764         }
3765     }
3766
3767     int dstcn;
3768     float coeffs[9];
3769     bool srgb;
3770 };
3771
3772 #undef clip
3773
3774 struct Lab2RGB_b
3775 {
3776     typedef uchar channel_type;
3777
3778     Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
3779                const float* _whitept, bool _srgb )
3780     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
3781     {
3782         #if CV_NEON
3783         v_scale_inv = vdupq_n_f32(100.f/255.f);
3784         v_scale = vdupq_n_f32(255.f);
3785         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
3786         v_128 = vdupq_n_f32(128.0f);
3787         #endif
3788     }
3789
3790     void operator()(const uchar* src, uchar* dst, int n) const
3791     {
3792         int i, j, dcn = dstcn;
3793         uchar alpha = ColorChannel<uchar>::max();
3794         float buf[3*BLOCK_SIZE];
3795
3796         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
3797         {
3798             int dn = std::min(n - i, (int)BLOCK_SIZE);
3799             j = 0;
3800
3801             #if CV_NEON
3802             for ( ; j <= (dn - 8) * 3; j += 24)
3803             {
3804                 uint8x8x3_t v_src = vld3_u8(src + j);
3805                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
3806                            v_t1 = vmovl_u8(v_src.val[1]),
3807                            v_t2 = vmovl_u8(v_src.val[2]);
3808
3809                 float32x4x3_t v_dst;
3810                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
3811                 v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_128);
3812                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_128);
3813                 vst3q_f32(buf + j, v_dst);
3814
3815                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
3816                 v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_128);
3817                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128);
3818                 vst3q_f32(buf + j + 12, v_dst);
3819             }
3820             #endif
3821
3822             for( ; j < dn*3; j += 3 )
3823             {
3824                 buf[j] = src[j]*(100.f/255.f);
3825                 buf[j+1] = (float)(src[j+1] - 128);
3826                 buf[j+2] = (float)(src[j+2] - 128);
3827             }
3828             cvt(buf, buf, dn);
3829             j = 0;
3830
3831             #if CV_NEON
3832             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
3833             {
3834                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
3835                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
3836                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
3837                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
3838                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
3839                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
3840                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
3841
3842                 if (dcn == 4)
3843                 {
3844                     uint8x8x4_t v_dst;
3845                     v_dst.val[0] = v_dst0;
3846                     v_dst.val[1] = v_dst1;
3847                     v_dst.val[2] = v_dst2;
3848                     v_dst.val[3] = v_alpha;
3849                     vst4_u8(dst, v_dst);
3850                 }
3851                 else
3852                 {
3853                     uint8x8x3_t v_dst;
3854                     v_dst.val[0] = v_dst0;
3855                     v_dst.val[1] = v_dst1;
3856                     v_dst.val[2] = v_dst2;
3857                     vst3_u8(dst, v_dst);
3858                 }
3859             }
3860             #endif
3861
3862             for( ; j < dn*3; j += 3, dst += dcn )
3863             {
3864                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
3865                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
3866                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
3867                 if( dcn == 4 )
3868                     dst[3] = alpha;
3869             }
3870         }
3871     }
3872
3873     int dstcn;
3874     Lab2RGB_f cvt;
3875
3876     #if CV_NEON
3877     float32x4_t v_scale, v_scale_inv, v_128;
3878     uint8x8_t v_alpha;
3879     #endif
3880 };
3881
3882
3883 ///////////////////////////////////// RGB <-> L*u*v* /////////////////////////////////////
3884
3885 struct RGB2Luv_f
3886 {
3887     typedef float channel_type;
3888
3889     RGB2Luv_f( int _srccn, int blueIdx, const float* _coeffs,
3890                const float* whitept, bool _srgb )
3891     : srccn(_srccn), srgb(_srgb)
3892     {
3893         volatile int i;
3894         initLabTabs();
3895
3896         if(!_coeffs) _coeffs = sRGB2XYZ_D65;
3897         if(!whitept) whitept = D65;
3898
3899         for( i = 0; i < 3; i++ )
3900         {
3901             coeffs[i*3] = _coeffs[i*3];
3902             coeffs[i*3+1] = _coeffs[i*3+1];
3903             coeffs[i*3+2] = _coeffs[i*3+2];
3904             if( blueIdx == 0 )
3905                 std::swap(coeffs[i*3], coeffs[i*3+2]);
3906             CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
3907                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f );
3908         }
3909
3910         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
3911         un = 4*whitept[0]*d;
3912         vn = 9*whitept[1]*d;
3913
3914         CV_Assert(whitept[1] == 1.f);
3915     }
3916
3917     void operator()(const float* src, float* dst, int n) const
3918     {
3919         int i, scn = srccn;
3920         float gscale = GammaTabScale;
3921         const float* gammaTab = srgb ? sRGBGammaTab : 0;
3922         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3923               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3924               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3925         float _un = 13*un, _vn = 13*vn;
3926         n *= 3;
3927
3928         for( i = 0; i < n; i += 3, src += scn )
3929         {
3930             float R = src[0], G = src[1], B = src[2];
3931             if( gammaTab )
3932             {
3933                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
3934                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
3935                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
3936             }
3937
3938             float X = R*C0 + G*C1 + B*C2;
3939             float Y = R*C3 + G*C4 + B*C5;
3940             float Z = R*C6 + G*C7 + B*C8;
3941
3942             float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
3943             L = 116.f*L - 16.f;
3944
3945             float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
3946             float u = L*(X*d - _un);
3947             float v = L*((9*0.25f)*Y*d - _vn);
3948
3949             dst[i] = L; dst[i+1] = u; dst[i+2] = v;
3950         }
3951     }
3952
3953     int srccn;
3954     float coeffs[9], un, vn;
3955     bool srgb;
3956 };
3957
3958
3959 struct Luv2RGB_f
3960 {
3961     typedef float channel_type;
3962
3963     Luv2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
3964               const float* whitept, bool _srgb )
3965     : dstcn(_dstcn), srgb(_srgb)
3966     {
3967         initLabTabs();
3968
3969         if(!_coeffs) _coeffs = XYZ2sRGB_D65;
3970         if(!whitept) whitept = D65;
3971
3972         for( int i = 0; i < 3; i++ )
3973         {
3974             coeffs[i+(blueIdx^2)*3] = _coeffs[i];
3975             coeffs[i+3] = _coeffs[i+3];
3976             coeffs[i+blueIdx*3] = _coeffs[i+6];
3977         }
3978
3979         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
3980         un = 4*whitept[0]*d;
3981         vn = 9*whitept[1]*d;
3982
3983         CV_Assert(whitept[1] == 1.f);
3984     }
3985
3986     void operator()(const float* src, float* dst, int n) const
3987     {
3988         int i, dcn = dstcn;
3989         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
3990         float gscale = GammaTabScale;
3991         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3992               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3993               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3994         float alpha = ColorChannel<float>::max();
3995         float _un = un, _vn = vn;
3996         n *= 3;
3997
3998         for( i = 0; i < n; i += 3, dst += dcn )
3999         {
4000             float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z;
4001             Y = (L + 16.f) * (1.f/116.f);
4002             Y = Y*Y*Y;
4003             d = (1.f/13.f)/L;
4004             u = u*d + _un;
4005             v = v*d + _vn;
4006             float iv = 1.f/v;
4007             X = 2.25f * u * Y * iv ;
4008             Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
4009
4010             float R = X*C0 + Y*C1 + Z*C2;
4011             float G = X*C3 + Y*C4 + Z*C5;
4012             float B = X*C6 + Y*C7 + Z*C8;
4013
4014             R = std::min(std::max(R, 0.f), 1.f);
4015             G = std::min(std::max(G, 0.f), 1.f);
4016             B = std::min(std::max(B, 0.f), 1.f);
4017
4018             if( gammaTab )
4019             {
4020                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
4021                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
4022                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
4023             }
4024
4025             dst[0] = R; dst[1] = G; dst[2] = B;
4026             if( dcn == 4 )
4027                 dst[3] = alpha;
4028         }
4029     }
4030
4031     int dstcn;
4032     float coeffs[9], un, vn;
4033     bool srgb;
4034 };
4035
4036
4037 struct RGB2Luv_b
4038 {
4039     typedef uchar channel_type;
4040
4041     RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,
4042                const float* _whitept, bool _srgb )
4043     : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb)
4044     {
4045         #if CV_NEON
4046         v_scale_inv = vdupq_n_f32(1.f/255.f);
4047         v_scale = vdupq_n_f32(2.55f);
4048         v_coeff1 = vdupq_n_f32(0.72033898305084743f);
4049         v_coeff2 = vdupq_n_f32(96.525423728813564f);
4050         v_coeff3 = vdupq_n_f32(0.9732824427480916f);
4051         v_coeff4 = vdupq_n_f32(136.259541984732824f);
4052         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
4053         #endif
4054     }
4055
4056     void operator()(const uchar* src, uchar* dst, int n) const
4057     {
4058         int i, j, scn = srccn;
4059         float buf[3*BLOCK_SIZE];
4060
4061         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
4062         {
4063             int dn = std::min(n - i, (int)BLOCK_SIZE);
4064             j = 0;
4065
4066             #if CV_NEON
4067             for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
4068             {
4069                 uint16x8_t v_t0, v_t1, v_t2;
4070
4071                 if (scn == 3)
4072                 {
4073                     uint8x8x3_t v_src = vld3_u8(src);
4074                     v_t0 = vmovl_u8(v_src.val[0]);
4075                     v_t1 = vmovl_u8(v_src.val[1]);
4076                     v_t2 = vmovl_u8(v_src.val[2]);
4077                 }
4078                 else
4079                 {
4080                     uint8x8x4_t v_src = vld4_u8(src);
4081                     v_t0 = vmovl_u8(v_src.val[0]);
4082                     v_t1 = vmovl_u8(v_src.val[1]);
4083                     v_t2 = vmovl_u8(v_src.val[2]);
4084                 }
4085
4086                 float32x4x3_t v_dst;
4087                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
4088                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
4089                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
4090                 vst3q_f32(buf + j, v_dst);
4091
4092                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
4093                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
4094                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
4095                 vst3q_f32(buf + j + 12, v_dst);
4096             }
4097             #endif
4098             for( ; j < dn*3; j += 3, src += scn )
4099             {
4100                 buf[j] = src[0]*(1.f/255.f);
4101                 buf[j+1] = (float)(src[1]*(1.f/255.f));
4102                 buf[j+2] = (float)(src[2]*(1.f/255.f));
4103             }
4104             cvt(buf, buf, dn);
4105
4106             j = 0;
4107             #if CV_NEON
4108             for ( ; j <= (dn - 8) * 3; j += 24)
4109             {
4110                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
4111
4112                 uint8x8x3_t v_dst;
4113                 v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
4114                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
4115                 v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[1], v_coeff1), v_coeff2))),
4116                                                        vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[1], v_coeff1), v_coeff2)))));
4117                 v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[2], v_coeff3), v_coeff4))),
4118                                                        vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[2], v_coeff3), v_coeff4)))));
4119
4120                 vst3_u8(dst + j, v_dst);
4121             }
4122             #endif
4123
4124             for( ; j < dn*3; j += 3 )
4125             {
4126                 dst[j] = saturate_cast<uchar>(buf[j]*2.55f);
4127                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*0.72033898305084743f + 96.525423728813564f);
4128                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*0.9732824427480916f + 136.259541984732824f);
4129             }
4130         }
4131     }
4132
4133     int srccn;
4134     RGB2Luv_f cvt;
4135
4136     #if CV_NEON
4137     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
4138     uint8x8_t v_alpha;
4139     #endif
4140 };
4141
4142
4143 struct Luv2RGB_b
4144 {
4145     typedef uchar channel_type;
4146
4147     Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
4148                const float* _whitept, bool _srgb )
4149     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
4150     {
4151         #if CV_NEON
4152         v_scale_inv = vdupq_n_f32(100.f/255.f);
4153         v_coeff1 = vdupq_n_f32(1.388235294117647f);
4154         v_coeff2 = vdupq_n_f32(1.027450980392157f);
4155         v_134 = vdupq_n_f32(134.f);
4156         v_140 = vdupq_n_f32(140.f);
4157         v_scale = vdupq_n_f32(255.f);
4158         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
4159         #endif
4160     }
4161
4162     void operator()(const uchar* src, uchar* dst, int n) const
4163     {
4164         int i, j, dcn = dstcn;
4165         uchar alpha = ColorChannel<uchar>::max();
4166         float buf[3*BLOCK_SIZE];
4167
4168         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
4169         {
4170             int dn = std::min(n - i, (int)BLOCK_SIZE);
4171             j = 0;
4172
4173             #if CV_NEON
4174             for ( ; j <= (dn - 8) * 3; j += 24)
4175             {
4176                 uint8x8x3_t v_src = vld3_u8(src + j);
4177                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
4178                            v_t1 = vmovl_u8(v_src.val[1]),
4179                            v_t2 = vmovl_u8(v_src.val[2]);
4180
4181                 float32x4x3_t v_dst;
4182                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
4183                 v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_coeff1), v_134);
4184                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_coeff2), v_140);
4185                 vst3q_f32(buf + j, v_dst);
4186
4187                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
4188                 v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_coeff1), v_134);
4189                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140);
4190                 vst3q_f32(buf + j + 12, v_dst);
4191             }
4192             #endif
4193             for( ; j < dn*3; j += 3 )
4194             {
4195                 buf[j] = src[j]*(100.f/255.f);
4196                 buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f);
4197                 buf[j+2] = (float)(src[j+2]*1.027450980392157f - 140.f);
4198             }
4199             cvt(buf, buf, dn);
4200
4201             j = 0;
4202             #if CV_NEON
4203             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
4204             {
4205                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
4206                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
4207                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
4208                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
4209                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
4210                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
4211                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
4212
4213                 if (dcn == 4)
4214                 {
4215                     uint8x8x4_t v_dst;
4216                     v_dst.val[0] = v_dst0;
4217                     v_dst.val[1] = v_dst1;
4218                     v_dst.val[2] = v_dst2;
4219                     v_dst.val[3] = v_alpha;
4220                     vst4_u8(dst, v_dst);
4221                 }
4222                 else
4223                 {
4224                     uint8x8x3_t v_dst;
4225                     v_dst.val[0] = v_dst0;
4226                     v_dst.val[1] = v_dst1;
4227                     v_dst.val[2] = v_dst2;
4228                     vst3_u8(dst, v_dst);
4229                 }
4230             }
4231             #endif
4232
4233             for( ; j < dn*3; j += 3, dst += dcn )
4234             {
4235                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
4236                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
4237                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
4238                 if( dcn == 4 )
4239                     dst[3] = alpha;
4240             }
4241         }
4242     }
4243
4244     int dstcn;
4245     Luv2RGB_f cvt;
4246
4247     #if CV_NEON
4248     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
4249     uint8x8_t v_alpha;
4250     #endif
4251 };
4252
4253
4254 ///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
4255
// ITU-R BT.601 coefficients in fixed point: each constant is the real-valued
// coefficient multiplied by 2^ITUR_BT_601_SHIFT (2^20) and rounded.
const int ITUR_BT_601_SHIFT = 20;

// YUV -> RGB
const int ITUR_BT_601_CY  =  1220542;  //  1.164, gain applied to (Y - 16)
const int ITUR_BT_601_CUB =  2116026;  //  2.018, (U - 128) contribution to B
const int ITUR_BT_601_CUG =  -409993;  // -0.391, (U - 128) contribution to G
const int ITUR_BT_601_CVG =  -852492;  // -0.813, (V - 128) contribution to G
const int ITUR_BT_601_CVR =  1673527;  //  1.596, (V - 128) contribution to R

// RGB -> YUV420p
const int ITUR_BT_601_CRY =   269484;  //  0.257, R contribution to Y
const int ITUR_BT_601_CGY =   528482;  //  0.504, G contribution to Y
const int ITUR_BT_601_CBY =   102760;  //  0.098, B contribution to Y
const int ITUR_BT_601_CRU =  -155188;  // -0.148, R contribution to U
const int ITUR_BT_601_CGU =  -305135;  // -0.291, G contribution to U
const int ITUR_BT_601_CBU =   460324;  //  0.439, B contribution to U
const int ITUR_BT_601_CGV =  -385875;  // -0.368, G contribution to V
const int ITUR_BT_601_CBV =   -74448;  // -0.071, B contribution to V
4272
4273 template<int bIdx, int uIdx>
4274 struct YUV420sp2RGB888Invoker : ParallelLoopBody
4275 {
4276     Mat* dst;
4277     const uchar* my1, *muv;
4278     int width, stride;
4279
4280     YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
4281         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
4282
4283     void operator()(const Range& range) const
4284     {
4285         int rangeBegin = range.start * 2;
4286         int rangeEnd = range.end * 2;
4287
4288         //R = 1.164(Y - 16) + 1.596(V - 128)
4289         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
4290         //B = 1.164(Y - 16)                  + 2.018(U - 128)
4291
4292         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
4293         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
4294         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
4295
4296         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
4297
4298 #ifdef HAVE_TEGRA_OPTIMIZATION
4299         if(tegra::cvtYUV4202RGB(bIdx, uIdx, 3, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
4300             return;
4301 #endif
4302
4303         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
4304         {
4305             uchar* row1 = dst->ptr<uchar>(j);
4306             uchar* row2 = dst->ptr<uchar>(j + 1);
4307             const uchar* y2 = y1 + stride;
4308
4309             for (int i = 0; i < width; i += 2, row1 += 6, row2 += 6)
4310             {
4311                 int u = int(uv[i + 0 + uIdx]) - 128;
4312                 int v = int(uv[i + 1 - uIdx]) - 128;
4313
4314                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4315                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4316                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4317
4318                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
4319                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4320                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4321                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4322
4323                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
4324                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4325                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4326                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4327
4328                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
4329                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
4330                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
4331                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
4332
4333                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
4334                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
4335                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
4336                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
4337             }
4338         }
4339     }
4340 };
4341
4342 template<int bIdx, int uIdx>
4343 struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
4344 {
4345     Mat* dst;
4346     const uchar* my1, *muv;
4347     int width, stride;
4348
4349     YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
4350         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
4351
4352     void operator()(const Range& range) const
4353     {
4354         int rangeBegin = range.start * 2;
4355         int rangeEnd = range.end * 2;
4356
4357         //R = 1.164(Y - 16) + 1.596(V - 128)
4358         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
4359         //B = 1.164(Y - 16)                  + 2.018(U - 128)
4360
4361         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
4362         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
4363         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
4364
4365         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
4366
4367 #ifdef HAVE_TEGRA_OPTIMIZATION
4368         if(tegra::cvtYUV4202RGB(bIdx, uIdx, 4, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
4369             return;
4370 #endif
4371
4372         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
4373         {
4374             uchar* row1 = dst->ptr<uchar>(j);
4375             uchar* row2 = dst->ptr<uchar>(j + 1);
4376             const uchar* y2 = y1 + stride;
4377
4378             for (int i = 0; i < width; i += 2, row1 += 8, row2 += 8)
4379             {
4380                 int u = int(uv[i + 0 + uIdx]) - 128;
4381                 int v = int(uv[i + 1 - uIdx]) - 128;
4382
4383                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4384                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4385                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4386
4387                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
4388                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4389                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4390                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4391                 row1[3]      = uchar(0xff);
4392
4393                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
4394                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4395                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4396                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4397                 row1[7]      = uchar(0xff);
4398
4399                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
4400                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
4401                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
4402                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
4403                 row2[3]      = uchar(0xff);
4404
4405                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
4406                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
4407                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
4408                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
4409                 row2[7]      = uchar(0xff);
4410             }
4411         }
4412     }
4413 };
4414
4415 template<int bIdx>
4416 struct YUV420p2RGB888Invoker : ParallelLoopBody
4417 {
4418     Mat* dst;
4419     const uchar* my1, *mu, *mv;
4420     int width, stride;
4421     int ustepIdx, vstepIdx;
4422
4423     YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
4424         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
4425
4426     void operator()(const Range& range) const
4427     {
4428         const int rangeBegin = range.start * 2;
4429         const int rangeEnd = range.end * 2;
4430
4431         int uvsteps[2] = {width/2, stride - width/2};
4432         int usIdx = ustepIdx, vsIdx = vstepIdx;
4433
4434         const uchar* y1 = my1 + rangeBegin * stride;
4435         const uchar* u1 = mu + (range.start / 2) * stride;
4436         const uchar* v1 = mv + (range.start / 2) * stride;
4437
4438         if(range.start % 2 == 1)
4439         {
4440             u1 += uvsteps[(usIdx++) & 1];
4441             v1 += uvsteps[(vsIdx++) & 1];
4442         }
4443
4444         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
4445         {
4446             uchar* row1 = dst->ptr<uchar>(j);
4447             uchar* row2 = dst->ptr<uchar>(j + 1);
4448             const uchar* y2 = y1 + stride;
4449
4450             for (int i = 0; i < width / 2; i += 1, row1 += 6, row2 += 6)
4451             {
4452                 int u = int(u1[i]) - 128;
4453                 int v = int(v1[i]) - 128;
4454
4455                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4456                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4457                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4458
4459                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
4460                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4461                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4462                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4463
4464                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
4465                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4466                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4467                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4468
4469                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
4470                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
4471                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
4472                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
4473
4474                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
4475                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
4476                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
4477                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
4478             }
4479         }
4480     }
4481 };
4482
4483 template<int bIdx>
4484 struct YUV420p2RGBA8888Invoker : ParallelLoopBody
4485 {
4486     Mat* dst;
4487     const uchar* my1, *mu, *mv;
4488     int width, stride;
4489     int ustepIdx, vstepIdx;
4490
4491     YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
4492         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
4493
4494     void operator()(const Range& range) const
4495     {
4496         int rangeBegin = range.start * 2;
4497         int rangeEnd = range.end * 2;
4498
4499         int uvsteps[2] = {width/2, stride - width/2};
4500         int usIdx = ustepIdx, vsIdx = vstepIdx;
4501
4502         const uchar* y1 = my1 + rangeBegin * stride;
4503         const uchar* u1 = mu + (range.start / 2) * stride;
4504         const uchar* v1 = mv + (range.start / 2) * stride;
4505
4506         if(range.start % 2 == 1)
4507         {
4508             u1 += uvsteps[(usIdx++) & 1];
4509             v1 += uvsteps[(vsIdx++) & 1];
4510         }
4511
4512         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
4513         {
4514             uchar* row1 = dst->ptr<uchar>(j);
4515             uchar* row2 = dst->ptr<uchar>(j + 1);
4516             const uchar* y2 = y1 + stride;
4517
4518             for (int i = 0; i < width / 2; i += 1, row1 += 8, row2 += 8)
4519             {
4520                 int u = int(u1[i]) - 128;
4521                 int v = int(v1[i]) - 128;
4522
4523                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4524                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4525                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4526
4527                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
4528                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4529                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4530                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4531                 row1[3]      = uchar(0xff);
4532
4533                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
4534                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4535                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4536                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4537                 row1[7]      = uchar(0xff);
4538
4539                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
4540                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
4541                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
4542                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
4543                 row2[3]      = uchar(0xff);
4544
4545                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
4546                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
4547                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
4548                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
4549                 row2[7]      = uchar(0xff);
4550             }
4551         }
4552     }
4553 };
4554
4555 #define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240)
4556
4557 template<int bIdx, int uIdx>
4558 inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
4559 {
4560     YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
4561     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
4562         parallel_for_(Range(0, _dst.rows/2), converter);
4563     else
4564         converter(Range(0, _dst.rows/2));
4565 }
4566
4567 template<int bIdx, int uIdx>
4568 inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
4569 {
4570     YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
4571     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
4572         parallel_for_(Range(0, _dst.rows/2), converter);
4573     else
4574         converter(Range(0, _dst.rows/2));
4575 }
4576
4577 template<int bIdx>
4578 inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
4579 {
4580     YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
4581     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
4582         parallel_for_(Range(0, _dst.rows/2), converter);
4583     else
4584         converter(Range(0, _dst.rows/2));
4585 }
4586
4587 template<int bIdx>
4588 inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
4589 {
4590     YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
4591     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
4592         parallel_for_(Range(0, _dst.rows/2), converter);
4593     else
4594         converter(Range(0, _dst.rows/2));
4595 }
4596
4597 ///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
4598
4599 template<int bIdx>
4600 struct RGB888toYUV420pInvoker: public ParallelLoopBody
4601 {
4602     RGB888toYUV420pInvoker( const Mat& src, Mat* dst, const int uIdx )
4603         : src_(src),
4604           dst_(dst),
4605           uIdx_(uIdx) { }
4606
4607     void operator()(const Range& rowRange) const
4608     {
4609         const int w = src_.cols;
4610         const int h = src_.rows;
4611
4612         const int cn = src_.channels();
4613         for( int i = rowRange.start; i < rowRange.end; i++ )
4614         {
4615             const uchar* row0 = src_.ptr<uchar>(2 * i);
4616             const uchar* row1 = src_.ptr<uchar>(2 * i + 1);
4617
4618             uchar* y = dst_->ptr<uchar>(2*i);
4619             uchar* u = dst_->ptr<uchar>(h + i/2) + (i % 2) * (w/2);
4620             uchar* v = dst_->ptr<uchar>(h + (i + h/2)/2) + ((i + h/2) % 2) * (w/2);
4621             if( uIdx_ == 2 ) std::swap(u, v);
4622
4623             for( int j = 0, k = 0; j < w * cn; j += 2 * cn, k++ )
4624             {
4625                 int r00 = row0[2-bIdx + j];      int g00 = row0[1 + j];      int b00 = row0[bIdx + j];
4626                 int r01 = row0[2-bIdx + cn + j]; int g01 = row0[1 + cn + j]; int b01 = row0[bIdx + cn + j];
4627                 int r10 = row1[2-bIdx + j];      int g10 = row1[1 + j];      int b10 = row1[bIdx + j];
4628                 int r11 = row1[2-bIdx + cn + j]; int g11 = row1[1 + cn + j]; int b11 = row1[bIdx + cn + j];
4629
4630                 const int shifted16 = (16 << ITUR_BT_601_SHIFT);
4631                 const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
4632                 int y00 = ITUR_BT_601_CRY * r00 + ITUR_BT_601_CGY * g00 + ITUR_BT_601_CBY * b00 + halfShift + shifted16;
4633                 int y01 = ITUR_BT_601_CRY * r01 + ITUR_BT_601_CGY * g01 + ITUR_BT_601_CBY * b01 + halfShift + shifted16;
4634                 int y10 = ITUR_BT_601_CRY * r10 + ITUR_BT_601_CGY * g10 + ITUR_BT_601_CBY * b10 + halfShift + shifted16;
4635                 int y11 = ITUR_BT_601_CRY * r11 + ITUR_BT_601_CGY * g11 + ITUR_BT_601_CBY * b11 + halfShift + shifted16;
4636
4637                 y[2*k + 0]            = saturate_cast<uchar>(y00 >> ITUR_BT_601_SHIFT);
4638                 y[2*k + 1]            = saturate_cast<uchar>(y01 >> ITUR_BT_601_SHIFT);
4639                 y[2*k + dst_->step + 0] = saturate_cast<uchar>(y10 >> ITUR_BT_601_SHIFT);
4640                 y[2*k + dst_->step + 1] = saturate_cast<uchar>(y11 >> ITUR_BT_601_SHIFT);
4641
4642                 const int shifted128 = (128 << ITUR_BT_601_SHIFT);
4643                 int u00 = ITUR_BT_601_CRU * r00 + ITUR_BT_601_CGU * g00 + ITUR_BT_601_CBU * b00 + halfShift + shifted128;
4644                 int v00 = ITUR_BT_601_CBU * r00 + ITUR_BT_601_CGV * g00 + ITUR_BT_601_CBV * b00 + halfShift + shifted128;
4645
4646                 u[k] = saturate_cast<uchar>(u00 >> ITUR_BT_601_SHIFT);
4647                 v[k] = saturate_cast<uchar>(v00 >> ITUR_BT_601_SHIFT);
4648             }
4649         }
4650     }
4651
4652     static bool isFit( const Mat& src )
4653     {
4654         return (src.total() >= 320*240);
4655     }
4656
4657 private:
4658     RGB888toYUV420pInvoker& operator=(const RGB888toYUV420pInvoker&);
4659
4660     const Mat& src_;
4661     Mat* const dst_;
4662     const int uIdx_;
4663 };
4664
4665 template<int bIdx, int uIdx>
4666 static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
4667 {
4668     RGB888toYUV420pInvoker<bIdx> colorConverter(src, &dst, uIdx);
4669     if( RGB888toYUV420pInvoker<bIdx>::isFit(src) )
4670         parallel_for_(Range(0, src.rows/2), colorConverter);
4671     else
4672         colorConverter(Range(0, src.rows/2));
4673 }
4674
4675 ///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
4676
4677 template<int bIdx, int uIdx, int yIdx>
4678 struct YUV422toRGB888Invoker : ParallelLoopBody
4679 {
4680     Mat* dst;
4681     const uchar* src;
4682     int width, stride;
4683
4684     YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
4685         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
4686
4687     void operator()(const Range& range) const
4688     {
4689         int rangeBegin = range.start;
4690         int rangeEnd = range.end;
4691
4692         const int uidx = 1 - yIdx + uIdx * 2;
4693         const int vidx = (2 + uidx) % 4;
4694         const uchar* yuv_src = src + rangeBegin * stride;
4695
4696         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
4697         {
4698             uchar* row = dst->ptr<uchar>(j);
4699
4700             for (int i = 0; i < 2 * width; i += 4, row += 6)
4701             {
4702                 int u = int(yuv_src[i + uidx]) - 128;
4703                 int v = int(yuv_src[i + vidx]) - 128;
4704
4705                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4706                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4707                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4708
4709                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
4710                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4711                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4712                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4713
4714                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
4715                 row[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4716                 row[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4717                 row[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4718             }
4719         }
4720     }
4721 };
4722
4723 template<int bIdx, int uIdx, int yIdx>
4724 struct YUV422toRGBA8888Invoker : ParallelLoopBody
4725 {
4726     Mat* dst;
4727     const uchar* src;
4728     int width, stride;
4729
4730     YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
4731         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
4732
4733     void operator()(const Range& range) const
4734     {
4735         int rangeBegin = range.start;
4736         int rangeEnd = range.end;
4737
4738         const int uidx = 1 - yIdx + uIdx * 2;
4739         const int vidx = (2 + uidx) % 4;
4740         const uchar* yuv_src = src + rangeBegin * stride;
4741
4742         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
4743         {
4744             uchar* row = dst->ptr<uchar>(j);
4745
4746             for (int i = 0; i < 2 * width; i += 4, row += 8)
4747             {
4748                 int u = int(yuv_src[i + uidx]) - 128;
4749                 int v = int(yuv_src[i + vidx]) - 128;
4750
4751                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4752                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4753                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4754
4755                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
4756                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4757                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4758                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4759                 row[3]      = uchar(0xff);
4760
4761                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
4762                 row[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4763                 row[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4764                 row[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4765                 row[7]      = uchar(0xff);
4766             }
4767         }
4768     }
4769 };
4770
4771 #define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240)
4772
4773 template<int bIdx, int uIdx, int yIdx>
4774 inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
4775 {
4776     YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
4777     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
4778         parallel_for_(Range(0, _dst.rows), converter);
4779     else
4780         converter(Range(0, _dst.rows));
4781 }
4782
4783 template<int bIdx, int uIdx, int yIdx>
4784 inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
4785 {
4786     YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
4787     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
4788         parallel_for_(Range(0, _dst.rows), converter);
4789     else
4790         converter(Range(0, _dst.rows));
4791 }
4792
4793 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
4794
4795 template<typename _Tp>
4796 struct RGBA2mRGBA
4797 {
4798     typedef _Tp channel_type;
4799
4800     void operator()(const _Tp* src, _Tp* dst, int n) const
4801     {
4802         _Tp max_val  = ColorChannel<_Tp>::max();
4803         _Tp half_val = ColorChannel<_Tp>::half();
4804         for( int i = 0; i < n; i++ )
4805         {
4806             _Tp v0 = *src++;
4807             _Tp v1 = *src++;
4808             _Tp v2 = *src++;
4809             _Tp v3 = *src++;
4810
4811             *dst++ = (v0 * v3 + half_val) / max_val;
4812             *dst++ = (v1 * v3 + half_val) / max_val;
4813             *dst++ = (v2 * v3 + half_val) / max_val;
4814             *dst++ = v3;
4815         }
4816     }
4817 };
4818
4819
4820 template<typename _Tp>
4821 struct mRGBA2RGBA
4822 {
4823     typedef _Tp channel_type;
4824
4825     void operator()(const _Tp* src, _Tp* dst, int n) const
4826     {
4827         _Tp max_val = ColorChannel<_Tp>::max();
4828         for( int i = 0; i < n; i++ )
4829         {
4830             _Tp v0 = *src++;
4831             _Tp v1 = *src++;
4832             _Tp v2 = *src++;
4833             _Tp v3 = *src++;
4834             _Tp v3_half = v3 / 2;
4835
4836             *dst++ = (v3==0)? 0 : (v0 * max_val + v3_half) / v3;
4837             *dst++ = (v3==0)? 0 : (v1 * max_val + v3_half) / v3;
4838             *dst++ = (v3==0)? 0 : (v2 * max_val + v3_half) / v3;
4839             *dst++ = v3;
4840         }
4841     }
4842 };
4843
4844 #ifdef HAVE_OPENCL
4845
4846 static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
4847 {
4848     bool ok = false;
4849     UMat src = _src.getUMat(), dst;
4850     Size sz = src.size(), dstSz = sz;
4851     int scn = src.channels(), depth = src.depth(), bidx, uidx, yidx;
4852     int dims = 2, stripeSize = 1;
4853     ocl::Kernel k;
4854
4855     if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
4856         return false;
4857
4858     ocl::Device dev = ocl::Device::getDefault();
4859     int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
4860     int pxPerWIx = 1;
4861
4862     size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy };
4863     cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
4864                              depth, scn, pxPerWIy);
4865
4866     switch (code)
4867     {
4868     case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
4869     case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
4870     {
4871         CV_Assert(scn == 3 || scn == 4);
4872         dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
4873         bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
4874         k.create("RGB", ocl::imgproc::cvtcolor_oclsrc,
4875                  opts + format("-D dcn=%d -D bidx=0 -D %s", dcn,
4876                         reverse ? "REVERSE" : "ORDER"));
4877         break;
4878     }
4879     case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
4880     case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
4881     {
4882         dcn = code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA || code == COLOR_BGR5652RGBA || code == COLOR_BGR5552RGBA ? 4 : 3;
4883         CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);
4884         bidx = code == COLOR_BGR5652BGR || code == COLOR_BGR5552BGR ||
4885             code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA ? 0 : 2;
4886         int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
4887             code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
4888         k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc,
4889                  opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits));
4890         break;
4891     }
4892     case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
4893     case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
4894     {
4895         CV_Assert((scn == 3 || scn == 4) && depth == CV_8U );
4896         bidx = code == COLOR_BGR2BGR565 || code == COLOR_BGR2BGR555 ||
4897             code == COLOR_BGRA2BGR565 || code == COLOR_BGRA2BGR555 ? 0 : 2;
4898         int greenbits = code == COLOR_BGR2BGR565 || code == COLOR_RGB2BGR565 ||
4899             code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
4900         dcn = 2;
4901         k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc,
4902                  opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits));
4903         break;
4904     }
4905     case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
4906     {
4907         CV_Assert(scn == 2 && depth == CV_8U);
4908         dcn = 1;
4909         int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
4910         k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc,
4911                  opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits));
4912         break;
4913     }
4914     case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
4915     {
4916         CV_Assert(scn == 1 && depth == CV_8U);
4917         dcn = 2;
4918         int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
4919         k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc,
4920                  opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits));
4921         break;
4922     }
4923     case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
4924     case COLOR_RGB2GRAY: case COLOR_RGBA2GRAY:
4925     {
4926         CV_Assert(scn == 3 || scn == 4);
4927         bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
4928         dcn = 1;
4929         k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
4930                  opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
4931                                bidx, stripeSize));
4932         globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
4933         break;
4934     }
4935     case COLOR_GRAY2BGR:
4936     case COLOR_GRAY2BGRA:
4937     {
4938         CV_Assert(scn == 1);
4939         dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
4940         k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
4941                  opts + format("-D bidx=0 -D dcn=%d", dcn));
4942         break;
4943     }
4944     case COLOR_BGR2YUV:
4945     case COLOR_RGB2YUV:
4946     {
4947         CV_Assert(scn == 3 || scn == 4);
4948         bidx = code == COLOR_RGB2YUV ? 0 : 2;
4949         dcn = 3;
4950         k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
4951                  opts + format("-D dcn=3 -D bidx=%d", bidx));
4952         break;
4953     }
4954     case COLOR_YUV2BGR:
4955     case COLOR_YUV2RGB:
4956     {
4957         if(dcn < 0) dcn = 3;
4958         CV_Assert(dcn == 3 || dcn == 4);
4959         bidx = code == COLOR_YUV2RGB ? 0 : 2;
4960         k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
4961                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
4962         break;
4963     }
4964     case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV21:
4965     case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV21:
4966     {
4967         CV_Assert( scn == 1 );
4968         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
4969         dcn  = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ||
4970                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2RGBA_NV21 ? 4 : 3;
4971         bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ||
4972                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 0 : 2;
4973         uidx = code == COLOR_YUV2RGBA_NV21 || code == COLOR_YUV2RGB_NV21 ||
4974                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 1 : 0;
4975
4976         dstSz = Size(sz.width, sz.height * 2 / 3);
4977         globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
4978         k.create("YUV2RGB_NVx", ocl::imgproc::cvtcolor_oclsrc,
4979                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx));
4980         break;
4981     }
4982     case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
4983     case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
4984     {
4985         CV_Assert( scn == 1 );
4986         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
4987         dcn  = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2RGBA_YV12 ||
4988                code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2RGBA_IYUV ? 4 : 3;
4989         bidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
4990                code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2BGR_IYUV ? 0 : 2;
4991         uidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
4992                code == COLOR_YUV2RGBA_YV12 || code == COLOR_YUV2RGB_YV12 ? 1 : 0;
4993
4994         dstSz = Size(sz.width, sz.height * 2 / 3);
4995         globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
4996         k.create("YUV2RGB_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
4997                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
4998                  src.isContinuous() ? " -D SRC_CONT" : ""));
4999         break;
5000     }
5001     case COLOR_YUV2GRAY_420:
5002     {
5003         if (dcn <= 0) dcn = 1;
5004
5005         CV_Assert( dcn == 1 );
5006         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
5007
5008         dstSz = Size(sz.width, sz.height * 2 / 3);
5009         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5010         dst = _dst.getUMat();
5011
5012         src.rowRange(0, dstSz.height).copyTo(dst);
5013         return true;
5014     }
5015     case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
5016     case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
5017     {
5018         if (dcn <= 0) dcn = 1;
5019         bidx = code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ||
5020                code == COLOR_BGRA2YUV_IYUV || code == COLOR_BGR2YUV_IYUV ? 0 : 2;
5021         uidx = code == COLOR_RGBA2YUV_YV12 || code == COLOR_RGB2YUV_YV12 ||
5022                code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ? 1 : 0;
5023
5024         CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
5025         CV_Assert( dcn == 1 );
5026         CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
5027
5028         dstSz = Size(sz.width, sz.height / 2 * 3);
5029         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5030         dst = _dst.getUMat();
5031
5032         if (dev.isIntel() && src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
5033             dst.step % 4 == 0 && dst.offset % 4 == 0)
5034         {
5035             pxPerWIx = 2;
5036         }
5037         globalsize[0] = dstSz.width / (2 * pxPerWIx); globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy;
5038
5039         k.create("RGB2YUV_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
5040                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D PIX_PER_WI_X=%d", dcn, bidx, uidx, pxPerWIx));
5041         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
5042         return k.run(2, globalsize, NULL, false);
5043     }
5044     case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
5045     case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
5046     case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
5047     {
5048         if (dcn <= 0)
5049             dcn = (code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2RGBA_YUY2 ||
5050                    code==COLOR_YUV2BGRA_YUY2 || code==COLOR_YUV2RGBA_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 4 : 3;
5051
5052         bidx = (code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2BGRA_YUY2 ||
5053                 code==COLOR_YUV2BGR_YUY2 || code==COLOR_YUV2BGRA_YVYU || code==COLOR_YUV2BGR_YVYU) ? 0 : 2;
5054         yidx = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
5055         uidx = (code==COLOR_YUV2RGB_YVYU || code==COLOR_YUV2RGBA_YVYU ||
5056                 code==COLOR_YUV2BGR_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 2 : 0;
5057         uidx = 1 - yidx + uidx;
5058
5059         CV_Assert( dcn == 3 || dcn == 4 );
5060         CV_Assert( scn == 2 && depth == CV_8U );
5061
5062         k.create("YUV2RGB_422", ocl::imgproc::cvtcolor_oclsrc,
5063                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d", dcn, bidx, uidx, yidx));
5064         break;
5065     }
5066     case COLOR_BGR2YCrCb:
5067     case COLOR_RGB2YCrCb:
5068     {
5069         CV_Assert(scn == 3 || scn == 4);
5070         bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
5071         dcn = 3;
5072         k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
5073                  opts + format("-D dcn=3 -D bidx=%d", bidx));
5074         break;
5075     }
5076     case COLOR_YCrCb2BGR:
5077     case COLOR_YCrCb2RGB:
5078     {
5079         if( dcn <= 0 )
5080             dcn = 3;
5081         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
5082         bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
5083         k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc,
5084                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
5085         break;
5086     }
5087     case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
5088     {
5089         CV_Assert(scn == 3 || scn == 4);
5090         bidx = code == COLOR_BGR2XYZ ? 0 : 2;
5091
5092         UMat c;
5093         if (depth == CV_32F)
5094         {
5095             float coeffs[] =
5096             {
5097                 0.412453f, 0.357580f, 0.180423f,
5098                 0.212671f, 0.715160f, 0.072169f,
5099                 0.019334f, 0.119193f, 0.950227f
5100             };
5101             if (bidx == 0)
5102             {
5103                 std::swap(coeffs[0], coeffs[2]);
5104                 std::swap(coeffs[3], coeffs[5]);
5105                 std::swap(coeffs[6], coeffs[8]);
5106             }
5107             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
5108         }
5109         else
5110         {
5111             int coeffs[] =
5112             {
5113                 1689,    1465,    739,
5114                 871,     2929,    296,
5115                 79,      488,     3892
5116             };
5117             if (bidx == 0)
5118             {
5119                 std::swap(coeffs[0], coeffs[2]);
5120                 std::swap(coeffs[3], coeffs[5]);
5121                 std::swap(coeffs[6], coeffs[8]);
5122             }
5123             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
5124         }
5125
5126         _dst.create(dstSz, CV_MAKETYPE(depth, 3));
5127         dst = _dst.getUMat();
5128
5129         k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
5130                  opts + format("-D dcn=3 -D bidx=%d", bidx));
5131         if (k.empty())
5132             return false;
5133         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
5134         return k.run(2, globalsize, 0, false);
5135     }
5136     case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
5137     {
5138         if (dcn <= 0)
5139             dcn = 3;
5140         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
5141         bidx = code == COLOR_XYZ2BGR ? 0 : 2;
5142
5143         UMat c;
5144         if (depth == CV_32F)
5145         {
5146             float coeffs[] =
5147             {
5148                 3.240479f, -1.53715f, -0.498535f,
5149                 -0.969256f, 1.875991f, 0.041556f,
5150                 0.055648f, -0.204043f, 1.057311f
5151             };
5152             if (bidx == 0)
5153             {
5154                 std::swap(coeffs[0], coeffs[6]);
5155                 std::swap(coeffs[1], coeffs[7]);
5156                 std::swap(coeffs[2], coeffs[8]);
5157             }
5158             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
5159         }
5160         else
5161         {
5162             int coeffs[] =
5163             {
5164                 13273,  -6296,  -2042,
5165                 -3970,   7684,    170,
5166                   228,   -836,   4331
5167             };
5168             if (bidx == 0)
5169             {
5170                 std::swap(coeffs[0], coeffs[6]);
5171                 std::swap(coeffs[1], coeffs[7]);
5172                 std::swap(coeffs[2], coeffs[8]);
5173             }
5174             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
5175         }
5176
5177         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5178         dst = _dst.getUMat();
5179
5180         k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
5181                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
5182         if (k.empty())
5183             return false;
5184         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
5185         return k.run(2, globalsize, 0, false);
5186     }
5187     case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
5188     case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
5189     {
5190         CV_Assert((scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F));
5191         bidx = code == COLOR_BGR2HSV || code == COLOR_BGR2HLS ||
5192             code == COLOR_BGR2HSV_FULL || code == COLOR_BGR2HLS_FULL ? 0 : 2;
5193         int hrange = depth == CV_32F ? 360 : code == COLOR_BGR2HSV || code == COLOR_RGB2HSV ||
5194             code == COLOR_BGR2HLS || code == COLOR_RGB2HLS ? 180 : 256;
5195         bool is_hsv = code == COLOR_BGR2HSV || code == COLOR_RGB2HSV || code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
5196         String kernelName = String("RGB2") + (is_hsv ? "HSV" : "HLS");
5197         dcn = 3;
5198
5199         if (is_hsv && depth == CV_8U)
5200         {
5201             static UMat sdiv_data;
5202             static UMat hdiv_data180;
5203             static UMat hdiv_data256;
5204             static int sdiv_table[256];
5205             static int hdiv_table180[256];
5206             static int hdiv_table256[256];
5207             static volatile bool initialized180 = false, initialized256 = false;
5208             volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;
5209
5210             if (!initialized)
5211             {
5212                 int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
5213                 UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;
5214
5215                 sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
5216
5217                 int v = 255 << hsv_shift;
5218                 if (!initialized180 && !initialized256)
5219                 {
5220                     for(int i = 1; i < 256; i++ )
5221                         sdiv_table[i] = saturate_cast<int>(v/(1.*i));
5222                     Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data);
5223                 }
5224
5225                 v = hrange << hsv_shift;
5226                 for (int i = 1; i < 256; i++ )
5227                     hdiv_table[i] = saturate_cast<int>(v/(6.*i));
5228
5229                 Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data);
5230                 initialized = true;
5231             }
5232
5233             _dst.create(dstSz, CV_8UC3);
5234             dst = _dst.getUMat();
5235
5236             k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc,
5237                      opts + format("-D hrange=%d -D bidx=%d -D dcn=3",
5238                                    hrange, bidx));
5239             if (k.empty())
5240                 return false;
5241
5242             k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
5243                    ocl::KernelArg::PtrReadOnly(sdiv_data), hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
5244                                                                        ocl::KernelArg::PtrReadOnly(hdiv_data180));
5245
5246             return k.run(2, globalsize, NULL, false);
5247         }
5248         else
5249             k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
5250                      opts + format("-D hscale=%ff -D bidx=%d -D dcn=3",
5251                                    hrange*(1.f/360.f), bidx));
5252         break;
5253     }
5254     case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
5255     case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
5256     {
5257         if (dcn <= 0)
5258             dcn = 3;
5259         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
5260         bidx = code == COLOR_HSV2BGR || code == COLOR_HLS2BGR ||
5261             code == COLOR_HSV2BGR_FULL || code == COLOR_HLS2BGR_FULL ? 0 : 2;
5262         int hrange = depth == CV_32F ? 360 : code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
5263             code == COLOR_HLS2BGR || code == COLOR_HLS2RGB ? 180 : 255;
5264         bool is_hsv = code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
5265                 code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL;
5266
5267         String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
5268         k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
5269                  opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
5270                                dcn, bidx, hrange, 6.f/hrange));
5271         break;
5272     }
5273     case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
5274     {
5275         CV_Assert(scn == 4 && depth == CV_8U);
5276         dcn = 4;
5277
5278         k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc,
5279                  opts + "-D dcn=4 -D bidx=3");
5280         break;
5281     }
5282     case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
5283     case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
5284     {
5285         CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
5286
5287         bidx = code == CV_BGR2Lab || code == CV_LBGR2Lab || code == CV_BGR2Luv || code == CV_LBGR2Luv ? 0 : 2;
5288         bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_RGB2Luv || code == CV_BGR2Luv;
5289         bool lab = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_LBGR2Lab || code == CV_LRGB2Lab;
5290         float un, vn;
5291         dcn = 3;
5292
5293         k.create(format("BGR2%s", lab ? "Lab" : "Luv").c_str(),
5294                  ocl::imgproc::cvtcolor_oclsrc,
5295                  opts + format("-D dcn=%d -D bidx=%d%s",
5296                                dcn, bidx, srgb ? " -D SRGB" : ""));
5297         if (k.empty())
5298             return false;
5299
5300         initLabTabs();
5301
5302         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5303         dst = _dst.getUMat();
5304
5305         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
5306                 dstarg = ocl::KernelArg::WriteOnly(dst);
5307
5308         if (depth == CV_8U && lab)
5309         {
5310             static UMat usRGBGammaTab, ulinearGammaTab, uLabCbrtTab, ucoeffs;
5311
5312             if (srgb && usRGBGammaTab.empty())
5313                 Mat(1, 256, CV_16UC1, sRGBGammaTab_b).copyTo(usRGBGammaTab);
5314             else if (ulinearGammaTab.empty())
5315                 Mat(1, 256, CV_16UC1, linearGammaTab_b).copyTo(ulinearGammaTab);
5316             if (uLabCbrtTab.empty())
5317                 Mat(1, LAB_CBRT_TAB_SIZE_B, CV_16UC1, LabCbrtTab_b).copyTo(uLabCbrtTab);
5318
5319             {
5320                 int coeffs[9];
5321                 const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
5322                 const float scale[] =
5323                 {
5324                     (1 << lab_shift)/_whitept[0],
5325                     (float)(1 << lab_shift),
5326                     (1 << lab_shift)/_whitept[2]
5327                 };
5328
5329                 for (int i = 0; i < 3; i++ )
5330                 {
5331                     coeffs[i*3+(bidx^2)] = cvRound(_coeffs[i*3]*scale[i]);
5332                     coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
5333                     coeffs[i*3+bidx] = cvRound(_coeffs[i*3+2]*scale[i]);
5334
5335                     CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
5336                               coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
5337                 }
5338                 Mat(1, 9, CV_32SC1, coeffs).copyTo(ucoeffs);
5339             }
5340
5341             const int Lscale = (116*255+50)/100;
5342             const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
5343
5344             k.args(srcarg, dstarg,
5345                    ocl::KernelArg::PtrReadOnly(srgb ? usRGBGammaTab : ulinearGammaTab),
5346                    ocl::KernelArg::PtrReadOnly(uLabCbrtTab), ocl::KernelArg::PtrReadOnly(ucoeffs),
5347                    Lscale, Lshift);
5348         }
5349         else
5350         {
5351             static UMat usRGBGammaTab, ucoeffs, uLabCbrtTab;
5352
5353             if (srgb && usRGBGammaTab.empty())
5354                 Mat(1, GAMMA_TAB_SIZE * 4, CV_32FC1, sRGBGammaTab).copyTo(usRGBGammaTab);
5355             if (!lab && uLabCbrtTab.empty())
5356                 Mat(1, LAB_CBRT_TAB_SIZE * 4, CV_32FC1, LabCbrtTab).copyTo(uLabCbrtTab);
5357
5358             {
5359                 float coeffs[9];
5360                 const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
5361                 float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
5362
5363                 for (int i = 0; i < 3; i++)
5364                 {
5365                     int j = i * 3;
5366                     coeffs[j + (bidx ^ 2)] = _coeffs[j] * (lab ? scale[i] : 1);
5367                     coeffs[j + 1] = _coeffs[j + 1] * (lab ? scale[i] : 1);
5368                     coeffs[j + bidx] = _coeffs[j + 2] * (lab ? scale[i] : 1);
5369
5370                     CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
5371                                coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*(lab ? LabCbrtTabScale : 1) );
5372                 }
5373
5374                 float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
5375                 un = 13*4*_whitept[0]*d;
5376                 vn = 13*9*_whitept[1]*d;
5377
5378                 Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
5379             }
5380
5381             float _1_3 = 1.0f / 3.0f, _a = 16.0f / 116.0f;
5382             ocl::KernelArg ucoeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
5383
5384             if (lab)
5385             {
5386                 if (srgb)
5387                     k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
5388                            ucoeffsarg, _1_3, _a);
5389                 else
5390                     k.args(srcarg, dstarg, ucoeffsarg, _1_3, _a);
5391             }
5392             else
5393             {
5394                 ocl::KernelArg LabCbrtTabarg = ocl::KernelArg::PtrReadOnly(uLabCbrtTab);
5395                 if (srgb)
5396                     k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
5397                            LabCbrtTabarg, ucoeffsarg, un, vn);
5398                 else
5399                     k.args(srcarg, dstarg, LabCbrtTabarg, ucoeffsarg, un, vn);
5400             }
5401         }
5402
5403         return k.run(dims, globalsize, NULL, false);
5404     }
5405     case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
5406     case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
5407     {
5408         if( dcn <= 0 )
5409             dcn = 3;
5410         CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
5411
5412         bidx = code == CV_Lab2BGR || code == CV_Lab2LBGR || code == CV_Luv2BGR || code == CV_Luv2LBGR ? 0 : 2;
5413         bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Luv2BGR || code == CV_Luv2RGB;
5414         bool lab = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Lab2LBGR || code == CV_Lab2LRGB;
5415         float un, vn;
5416
5417         k.create(format("%s2BGR", lab ? "Lab" : "Luv").c_str(),
5418                  ocl::imgproc::cvtcolor_oclsrc,
5419                  opts + format("-D dcn=%d -D bidx=%d%s",
5420                                dcn, bidx, srgb ? " -D SRGB" : ""));
5421         if (k.empty())
5422             return false;
5423
5424         initLabTabs();
5425         static UMat ucoeffs, usRGBInvGammaTab;
5426
5427         if (srgb && usRGBInvGammaTab.empty())
5428             Mat(1, GAMMA_TAB_SIZE*4, CV_32FC1, sRGBInvGammaTab).copyTo(usRGBInvGammaTab);
5429
5430         {
5431             float coeffs[9];
5432             const float * const _coeffs = XYZ2sRGB_D65, * const _whitept = D65;
5433
5434             for( int i = 0; i < 3; i++ )
5435             {
5436                 coeffs[i+(bidx^2)*3] = _coeffs[i] * (lab ? _whitept[i] : 1);
5437                 coeffs[i+3] = _coeffs[i+3] * (lab ? _whitept[i] : 1);
5438                 coeffs[i+bidx*3] = _coeffs[i+6] * (lab ? _whitept[i] : 1);
5439             }
5440
5441             float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
5442             un = 4*_whitept[0]*d;
5443             vn = 9*_whitept[1]*d;
5444
5445             Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
5446         }
5447
5448         _dst.create(sz, CV_MAKETYPE(depth, dcn));
5449         dst = _dst.getUMat();
5450
5451         float lThresh = 0.008856f * 903.3f;
5452         float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
5453
5454         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
5455                 dstarg = ocl::KernelArg::WriteOnly(dst),
5456                 coeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
5457
5458         if (lab)
5459         {
5460             if (srgb)
5461                 k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
5462                        coeffsarg, lThresh, fThresh);
5463             else
5464                 k.args(srcarg, dstarg, coeffsarg, lThresh, fThresh);
5465         }
5466         else
5467         {
5468             if (srgb)
5469                 k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
5470                        coeffsarg, un, vn);
5471             else
5472                 k.args(srcarg, dstarg, coeffsarg, un, vn);
5473         }
5474
5475         return k.run(dims, globalsize, NULL, false);
5476     }
5477     default:
5478         break;
5479     }
5480
5481     if( !k.empty() )
5482     {
5483         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5484         dst = _dst.getUMat();
5485         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
5486         ok = k.run(dims, globalsize, NULL, false);
5487     }
5488     return ok;
5489 }
5490
5491 #endif
5492
5493 }//namespace cv
5494
5495 //////////////////////////////////////////////////////////////////////////////////////////
5496 //                                   The main function                                  //
5497 //////////////////////////////////////////////////////////////////////////////////////////
5498
5499 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
5500 {
5501     int stype = _src.type();
5502     int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
5503
5504     CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat() && !(depth == CV_8U && (code == CV_Luv2BGR || code == CV_Luv2RGB)),
5505                 ocl_cvtColor(_src, _dst, code, dcn) )
5506
5507     Mat src = _src.getMat(), dst;
5508     Size sz = src.size();
5509
5510     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );
5511
5512     switch( code )
5513     {
5514         case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
5515         case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
5516             CV_Assert( scn == 3 || scn == 4 );
5517             dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;
5518             bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;
5519
5520             _dst.create( sz, CV_MAKETYPE(depth, dcn));
5521             dst = _dst.getMat();
5522
5523 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
5524             CV_IPP_CHECK()
5525             {
5526                 if( code == CV_BGR2BGRA)
5527                 {
5528                     if ( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
5529                     {
5530                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5531                         return;
5532                     }
5533                     setIppErrorStatus();
5534                 }
5535                 else if( code == CV_BGRA2BGR )
5536                 {
5537                     if ( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
5538                     {
5539                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5540                         return;
5541                     }
5542                     setIppErrorStatus();
5543                 }
5544                 else if( code == CV_BGR2RGBA )
5545                 {
5546                     if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
5547                     {
5548                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5549                         return;
5550                     }
5551                     setIppErrorStatus();
5552                 }
5553                 else if( code == CV_RGBA2BGR )
5554                 {
5555                     if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
5556                     {
5557                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5558                         return;
5559                     }
5560                     setIppErrorStatus();
5561                 }
5562                 else if( code == CV_RGB2BGR )
5563                 {
5564                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
5565                     {
5566                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5567                         return;
5568                     }
5569                     setIppErrorStatus();
5570                 }
5571 #if IPP_VERSION_X100 >= 801
5572                 else if( code == CV_RGBA2BGRA )
5573                 {
5574                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
5575                     {
5576                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5577                         return;
5578                     }
5579                     setIppErrorStatus();
5580                 }
5581 #endif
5582             }
5583 #endif
5584
5585             if( depth == CV_8U )
5586             {
5587 #ifdef HAVE_TEGRA_OPTIMIZATION
5588                 if(!tegra::cvtBGR2RGB(src, dst, bidx))
5589 #endif
5590                     CvtColorLoop(src, dst, RGB2RGB<uchar>(scn, dcn, bidx));
5591             }
5592             else if( depth == CV_16U )
5593                 CvtColorLoop(src, dst, RGB2RGB<ushort>(scn, dcn, bidx));
5594             else
5595                 CvtColorLoop(src, dst, RGB2RGB<float>(scn, dcn, bidx));
5596             break;
5597
5598         case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
5599         case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
5600             CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
5601             _dst.create(sz, CV_8UC2);
5602             dst = _dst.getMat();
5603
5604 #if defined(HAVE_IPP) && 0 // breaks OCL accuracy tests
5605             CV_IPP_CHECK()
5606             {
5607                 CV_SUPPRESS_DEPRECATED_START
5608
5609                 if (code == CV_BGR2BGR565 && scn == 3)
5610                 {
5611                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R)))
5612                     {
5613                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5614                         return;
5615                     }
5616                     setIppErrorStatus();
5617                 }
5618                 else if (code == CV_BGRA2BGR565 && scn == 4)
5619                 {
5620                     if (CvtColorIPPLoopCopy(src, dst,
5621                                             IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
5622                                             (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 0, 1, 2, depth)))
5623                     {
5624                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5625                         return;
5626                     }
5627                     setIppErrorStatus();
5628                 }
5629                 else if (code == CV_RGB2BGR565 && scn == 3)
5630                 {
5631                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
5632                                                                                (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
5633                     {
5634                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5635                         return;
5636                     }
5637                     setIppErrorStatus();
5638                 }
5639                 else if (code == CV_RGBA2BGR565 && scn == 4)
5640                 {
5641                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
5642                                                                                (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
5643                     {
5644                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5645                         return;
5646                     }
5647                     setIppErrorStatus();
5648                 }
5649                 CV_SUPPRESS_DEPRECATED_END
5650             }
5651 #endif
5652
5653 #ifdef HAVE_TEGRA_OPTIMIZATION
5654             if(code == CV_BGR2BGR565 || code == CV_BGRA2BGR565 || code == CV_RGB2BGR565  || code == CV_RGBA2BGR565)
5655                 if(tegra::cvtRGB2RGB565(src, dst, code == CV_RGB2BGR565 || code == CV_RGBA2BGR565 ? 0 : 2))
5656                     break;
5657 #endif
5658
5659             CvtColorLoop(src, dst, RGB2RGB5x5(scn,
5660                       code == CV_BGR2BGR565 || code == CV_BGR2BGR555 ||
5661                       code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2,
5662                       code == CV_BGR2BGR565 || code == CV_RGB2BGR565 ||
5663                       code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5 // green bits
5664                                               ));
5665             break;
5666
5667         case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
5668         case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
5669             if(dcn <= 0) dcn = (code==CV_BGR5652BGRA || code==CV_BGR5552BGRA || code==CV_BGR5652RGBA || code==CV_BGR5552RGBA) ? 4 : 3;
5670             CV_Assert( (dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U );
5671             _dst.create(sz, CV_MAKETYPE(depth, dcn));
5672             dst = _dst.getMat();
5673
5674 #ifdef HAVE_IPP
5675             CV_IPP_CHECK()
5676             {
5677                 CV_SUPPRESS_DEPRECATED_START
5678                 if (code == CV_BGR5652BGR && dcn == 3)
5679                 {
5680                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R)))
5681                     {
5682                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5683                         return;
5684                     }
5685                     setIppErrorStatus();
5686                 }
5687                 else if (code == CV_BGR5652RGB && dcn == 3)
5688                 {
5689                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
5690                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
5691                     {
5692                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5693                         return;
5694                     }
5695                     setIppErrorStatus();
5696                 }
5697                 else if (code == CV_BGR5652BGRA && dcn == 4)
5698                 {
5699                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
5700                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
5701                     {
5702                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5703                         return;
5704                     }
5705                     setIppErrorStatus();
5706                 }
5707                 else if (code == CV_BGR5652RGBA && dcn == 4)
5708                 {
5709                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
5710                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
5711                     {
5712                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5713                         return;
5714                     }
5715                     setIppErrorStatus();
5716                 }
5717                 CV_SUPPRESS_DEPRECATED_END
5718             }
5719 #endif
5720
5721             CvtColorLoop(src, dst, RGB5x52RGB(dcn,
5722                       code == CV_BGR5652BGR || code == CV_BGR5552BGR ||
5723                       code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2, // blue idx
5724                       code == CV_BGR5652BGR || code == CV_BGR5652RGB ||
5725                       code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5 // green bits
5726                       ));
5727             break;
5728
5729         case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
5730             CV_Assert( scn == 3 || scn == 4 );
5731             _dst.create(sz, CV_MAKETYPE(depth, 1));
5732             dst = _dst.getMat();
5733
5734 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
5735             CV_IPP_CHECK()
5736             {
5737                 if( code == CV_BGR2GRAY && depth == CV_32F )
5738                 {
5739                     if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) )
5740                     {
5741                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5742                         return;
5743                     }
5744                     setIppErrorStatus();
5745                 }
5746                 else if( code == CV_RGB2GRAY && depth == CV_32F )
5747                 {
5748                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) )
5749                     {
5750                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5751                         return;
5752                     }
5753                     setIppErrorStatus();
5754                 }
5755                 else if( code == CV_BGRA2GRAY && depth == CV_32F )
5756                 {
5757                     if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) )
5758                     {
5759                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5760                         return;
5761                     }
5762                     setIppErrorStatus();
5763                 }
5764                 else if( code == CV_RGBA2GRAY && depth == CV_32F )
5765                 {
5766                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) )
5767                     {
5768                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5769                         return;
5770                     }
5771                     setIppErrorStatus();
5772                 }
5773             }
5774 #endif
5775
5776             bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
5777
5778             if( depth == CV_8U )
5779             {
5780 #ifdef HAVE_TEGRA_OPTIMIZATION
5781                 if(!tegra::cvtRGB2Gray(src, dst, bidx))
5782 #endif
5783                 CvtColorLoop(src, dst, RGB2Gray<uchar>(scn, bidx, 0));
5784             }
5785             else if( depth == CV_16U )
5786                 CvtColorLoop(src, dst, RGB2Gray<ushort>(scn, bidx, 0));
5787             else
5788                 CvtColorLoop(src, dst, RGB2Gray<float>(scn, bidx, 0));
5789             break;
5790
5791         case CV_BGR5652GRAY: case CV_BGR5552GRAY:
5792             CV_Assert( scn == 2 && depth == CV_8U );
5793             _dst.create(sz, CV_8UC1);
5794             dst = _dst.getMat();
5795
5796             CvtColorLoop(src, dst, RGB5x52Gray(code == CV_BGR5652GRAY ? 6 : 5));
5797             break;
5798
5799         case CV_GRAY2BGR: case CV_GRAY2BGRA:
5800             if( dcn <= 0 ) dcn = (code==CV_GRAY2BGRA) ? 4 : 3;
5801             CV_Assert( scn == 1 && (dcn == 3 || dcn == 4));
5802             _dst.create(sz, CV_MAKETYPE(depth, dcn));
5803             dst = _dst.getMat();
5804
5805 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
5806             CV_IPP_CHECK()
5807             {
5808                 if( code == CV_GRAY2BGR )
5809                 {
5810                     if( CvtColorIPPLoop(src, dst, IPPGray2BGRFunctor(ippiCopyP3C3RTab[depth])) )
5811                     {
5812                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5813                         return;
5814                     }
5815                     setIppErrorStatus();
5816                 }
5817                 else if( code == CV_GRAY2BGRA )
5818                 {
5819                     if( CvtColorIPPLoop(src, dst, IPPGray2BGRAFunctor(ippiCopyP3C3RTab[depth], ippiSwapChannelsC3C4RTab[depth], depth)) )
5820                     {
5821                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5822                         return;
5823                     }
5824                     setIppErrorStatus();
5825                 }
5826             }
5827 #endif
5828
5829
5830             if( depth == CV_8U )
5831             {
5832 #ifdef HAVE_TEGRA_OPTIMIZATION
5833                 if(!tegra::cvtGray2RGB(src, dst))
5834 #endif
5835                 CvtColorLoop(src, dst, Gray2RGB<uchar>(dcn));
5836             }
5837             else if( depth == CV_16U )
5838                 CvtColorLoop(src, dst, Gray2RGB<ushort>(dcn));
5839             else
5840                 CvtColorLoop(src, dst, Gray2RGB<float>(dcn));
5841             break;
5842
5843         case CV_GRAY2BGR565: case CV_GRAY2BGR555:
5844             CV_Assert( scn == 1 && depth == CV_8U );
5845             _dst.create(sz, CV_8UC2);
5846             dst = _dst.getMat();
5847
5848             CvtColorLoop(src, dst, Gray2RGB5x5(code == CV_GRAY2BGR565 ? 6 : 5));
5849             break;
5850
5851         case CV_BGR2YCrCb: case CV_RGB2YCrCb:
5852         case CV_BGR2YUV: case CV_RGB2YUV:
5853             {
5854             CV_Assert( scn == 3 || scn == 4 );
5855             bidx = code == CV_BGR2YCrCb || code == CV_BGR2YUV ? 0 : 2;
5856             static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
5857             static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
5858             const float* coeffs_f = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_f;
5859             const int* coeffs_i = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_i;
5860
5861             _dst.create(sz, CV_MAKETYPE(depth, 3));
5862             dst = _dst.getMat();
5863
5864 #if defined HAVE_IPP && 0
5865             CV_IPP_CHECK()
5866             {
5867                 if (code == CV_RGB2YUV && scn == 3 && depth == CV_8U)
5868                 {
5869                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R)))
5870                     {
5871                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5872                         return;
5873                     }
5874                     setIppErrorStatus();
5875                 }
5876                 else if (code == CV_BGR2YUV && scn == 3 && depth == CV_8U)
5877                 {
5878                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
5879                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
5880                     {
5881                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5882                         return;
5883                     }
5884                     setIppErrorStatus();
5885                 }
5886                 else if (code == CV_RGB2YUV && scn == 4 && depth == CV_8U)
5887                 {
5888                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
5889                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth)))
5890                     {
5891                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5892                         return;
5893                     }
5894                     setIppErrorStatus();
5895                 }
5896                 else if (code == CV_BGR2YUV && scn == 4 && depth == CV_8U)
5897                 {
5898                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
5899                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
5900                     {
5901                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5902                         return;
5903                     }
5904                     setIppErrorStatus();
5905                 }
5906             }
5907 #endif
5908
5909             if( depth == CV_8U )
5910             {
5911 #ifdef HAVE_TEGRA_OPTIMIZATION
5912                 if((code == CV_RGB2YCrCb || code == CV_BGR2YCrCb) && tegra::cvtRGB2YCrCb(src, dst, bidx))
5913                     break;
5914 #endif
5915                 CvtColorLoop(src, dst, RGB2YCrCb_i<uchar>(scn, bidx, coeffs_i));
5916             }
5917             else if( depth == CV_16U )
5918                 CvtColorLoop(src, dst, RGB2YCrCb_i<ushort>(scn, bidx, coeffs_i));
5919             else
5920                 CvtColorLoop(src, dst, RGB2YCrCb_f<float>(scn, bidx, coeffs_f));
5921             }
5922             break;
5923
5924         case CV_YCrCb2BGR: case CV_YCrCb2RGB:
5925         case CV_YUV2BGR: case CV_YUV2RGB:
5926             {
5927             if( dcn <= 0 ) dcn = 3;
5928             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
5929             bidx = code == CV_YCrCb2BGR || code == CV_YUV2BGR ? 0 : 2;
5930             static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
5931             static const int yuv_i[] = { 33292, -6472, -9519, 18678 };
5932             const float* coeffs_f = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_f;
5933             const int* coeffs_i = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_i;
5934
5935             _dst.create(sz, CV_MAKETYPE(depth, dcn));
5936             dst = _dst.getMat();
5937
5938 #if defined HAVE_IPP && 0
5939             CV_IPP_CHECK()
5940             {
5941                 if (code == CV_YUV2RGB && dcn == 3 && depth == CV_8U)
5942                 {
5943                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R)))
5944                     {
5945                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5946                         return;
5947                     }
5948                     setIppErrorStatus();
5949                 }
5950                 else if (code == CV_YUV2BGR && dcn == 3 && depth == CV_8U)
5951                 {
5952                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
5953                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
5954                     {
5955                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5956                         return;
5957                     }
5958                     setIppErrorStatus();
5959                 }
5960                 else if (code == CV_YUV2RGB && dcn == 4 && depth == CV_8U)
5961                 {
5962                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
5963                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
5964                     {
5965                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5966                         return;
5967                     }
5968                     setIppErrorStatus();
5969                 }
5970                 else if (code == CV_YUV2BGR && dcn == 4 && depth == CV_8U)
5971                 {
5972                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
5973                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
5974                     {
5975                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5976                         return;
5977                     }
5978                     setIppErrorStatus();
5979                 }
5980             }
5981 #endif
5982
5983             if( depth == CV_8U )
5984                 CvtColorLoop(src, dst, YCrCb2RGB_i<uchar>(dcn, bidx, coeffs_i));
5985             else if( depth == CV_16U )
5986                 CvtColorLoop(src, dst, YCrCb2RGB_i<ushort>(dcn, bidx, coeffs_i));
5987             else
5988                 CvtColorLoop(src, dst, YCrCb2RGB_f<float>(dcn, bidx, coeffs_f));
5989             }
5990             break;
5991
5992         case CV_BGR2XYZ: case CV_RGB2XYZ:
5993             CV_Assert( scn == 3 || scn == 4 );
5994             bidx = code == CV_BGR2XYZ ? 0 : 2;
5995
5996             _dst.create(sz, CV_MAKETYPE(depth, 3));
5997             dst = _dst.getMat();
5998
5999 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
6000             CV_IPP_CHECK()
6001             {
6002                 if( code == CV_BGR2XYZ && scn == 3 && depth != CV_32F )
6003                 {
6004                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
6005                     {
6006                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6007                         return;
6008                     }
6009                     setIppErrorStatus();
6010                 }
6011                 else if( code == CV_BGR2XYZ && scn == 4 && depth != CV_32F )
6012                 {
6013                     if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
6014                     {
6015                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6016                         return;
6017                     }
6018                     setIppErrorStatus();
6019                 }
6020                 else if( code == CV_RGB2XYZ && scn == 3 && depth != CV_32F )
6021                 {
6022                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2XYZTab[depth])) )
6023                     {
6024                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6025                         return;
6026                     }
6027                     setIppErrorStatus();
6028                 }
6029                 else if( code == CV_RGB2XYZ && scn == 4 && depth != CV_32F )
6030                 {
6031                     if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 0, 1, 2, depth)) )
6032                     {
6033                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6034                         return;
6035                     }
6036                     setIppErrorStatus();
6037                 }
6038             }
6039 #endif
6040
6041             if( depth == CV_8U )
6042                 CvtColorLoop(src, dst, RGB2XYZ_i<uchar>(scn, bidx, 0));
6043             else if( depth == CV_16U )
6044                 CvtColorLoop(src, dst, RGB2XYZ_i<ushort>(scn, bidx, 0));
6045             else
6046                 CvtColorLoop(src, dst, RGB2XYZ_f<float>(scn, bidx, 0));
6047             break;
6048
6049         case CV_XYZ2BGR: case CV_XYZ2RGB:
6050             if( dcn <= 0 ) dcn = 3;
6051             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
6052             bidx = code == CV_XYZ2BGR ? 0 : 2;
6053
6054             _dst.create(sz, CV_MAKETYPE(depth, dcn));
6055             dst = _dst.getMat();
6056
6057 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
6058             CV_IPP_CHECK()
6059             {
6060                 if( code == CV_XYZ2BGR && dcn == 3 && depth != CV_32F )
6061                 {
6062                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6063                     {
6064                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6065                         return;
6066                     }
6067                     setIppErrorStatus();
6068                 }
6069                 else if( code == CV_XYZ2BGR && dcn == 4 && depth != CV_32F )
6070                 {
6071                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6072                     {
6073                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6074                         return;
6075                     }
6076                     setIppErrorStatus();
6077                 }
6078                 if( code == CV_XYZ2RGB && dcn == 3 && depth != CV_32F )
6079                 {
6080                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiXYZ2RGBTab[depth])) )
6081                     {
6082                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6083                         return;
6084                     }
6085                     setIppErrorStatus();
6086                 }
6087                 else if( code == CV_XYZ2RGB && dcn == 4 && depth != CV_32F )
6088                 {
6089                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6090                     {
6091                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6092                         return;
6093                     }
6094                     setIppErrorStatus();
6095                 }
6096             }
6097 #endif
6098
6099             if( depth == CV_8U )
6100                 CvtColorLoop(src, dst, XYZ2RGB_i<uchar>(dcn, bidx, 0));
6101             else if( depth == CV_16U )
6102                 CvtColorLoop(src, dst, XYZ2RGB_i<ushort>(dcn, bidx, 0));
6103             else
6104                 CvtColorLoop(src, dst, XYZ2RGB_f<float>(dcn, bidx, 0));
6105             break;
6106
6107         case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
6108         case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
6109             {
6110             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
6111             bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||
6112                 code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;
6113             int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||
6114                 code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 256;
6115
6116             _dst.create(sz, CV_MAKETYPE(depth, 3));
6117             dst = _dst.getMat();
6118
6119 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
6120             CV_IPP_CHECK()
6121             {
6122                 if( depth == CV_8U || depth == CV_16U )
6123                 {
6124 #if 0 // breaks OCL accuracy tests
6125                     if( code == CV_BGR2HSV_FULL && scn == 3 )
6126                     {
6127                         if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
6128                         {
6129                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6130                             return;
6131                         }
6132                         setIppErrorStatus();
6133                     }
6134                     else if( code == CV_BGR2HSV_FULL && scn == 4 )
6135                     {
6136                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
6137                         {
6138                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6139                             return;
6140                         }
6141                         setIppErrorStatus();
6142                     }
6143                     else if( code == CV_RGB2HSV_FULL && scn == 4 )
6144                     {
6145                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) )
6146                         {
6147                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6148                             return;
6149                         }
6150                         setIppErrorStatus();
6151                     } else
6152 #endif
6153                     if( code == CV_RGB2HSV_FULL && scn == 3 && depth == CV_16U )
6154                     {
6155                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HSVTab[depth])) )
6156                         {
6157                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6158                             return;
6159                         }
6160                         setIppErrorStatus();
6161                     }
6162                     else if( code == CV_BGR2HLS_FULL && scn == 3 )
6163                     {
6164                         if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
6165                         {
6166                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6167                             return;
6168                         }
6169                         setIppErrorStatus();
6170                     }
6171                     else if( code == CV_BGR2HLS_FULL && scn == 4 )
6172                     {
6173                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
6174                         {
6175                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6176                             return;
6177                         }
6178                         setIppErrorStatus();
6179                     }
6180                     else if( code == CV_RGB2HLS_FULL && scn == 3 )
6181                     {
6182                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HLSTab[depth])) )
6183                         {
6184                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6185                             return;
6186                         }
6187                         setIppErrorStatus();
6188                     }
6189                     else if( code == CV_RGB2HLS_FULL && scn == 4 )
6190                     {
6191                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) )
6192                         {
6193                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6194                             return;
6195                         }
6196                         setIppErrorStatus();
6197                     }
6198                 }
6199             }
6200 #endif
6201
6202             if( code == CV_BGR2HSV || code == CV_RGB2HSV ||
6203                 code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL )
6204             {
6205 #ifdef HAVE_TEGRA_OPTIMIZATION
6206                 if(tegra::cvtRGB2HSV(src, dst, bidx, hrange))
6207                     break;
6208 #endif
6209                 if( depth == CV_8U )
6210                     CvtColorLoop(src, dst, RGB2HSV_b(scn, bidx, hrange));
6211                 else
6212                     CvtColorLoop(src, dst, RGB2HSV_f(scn, bidx, (float)hrange));
6213             }
6214             else
6215             {
6216                 if( depth == CV_8U )
6217                     CvtColorLoop(src, dst, RGB2HLS_b(scn, bidx, hrange));
6218                 else
6219                     CvtColorLoop(src, dst, RGB2HLS_f(scn, bidx, (float)hrange));
6220             }
6221             }
6222             break;
6223
6224         case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
6225         case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
6226             {
6227             if( dcn <= 0 ) dcn = 3;
6228             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
6229             bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||
6230                 code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;
6231             int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||
6232                 code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;
6233
6234             _dst.create(sz, CV_MAKETYPE(depth, dcn));
6235             dst = _dst.getMat();
6236
6237 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
6238             CV_IPP_CHECK()
6239             {
6240                 if( depth == CV_8U || depth == CV_16U )
6241                 {
6242                     if( code == CV_HSV2BGR_FULL && dcn == 3 )
6243                     {
6244                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6245                         {
6246                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6247                             return;
6248                         }
6249                         setIppErrorStatus();
6250                     }
6251                     else if( code == CV_HSV2BGR_FULL && dcn == 4 )
6252                     {
6253                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6254                         {
6255                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6256                             return;
6257                         }
6258                         setIppErrorStatus();
6259                     }
6260                     else if( code == CV_HSV2RGB_FULL && dcn == 3 )
6261                     {
6262                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHSV2RGBTab[depth])) )
6263                         {
6264                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6265                             return;
6266                         }
6267                         setIppErrorStatus();
6268                     }
6269                     else if( code == CV_HSV2RGB_FULL && dcn == 4 )
6270                     {
6271                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6272                         {
6273                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6274                             return;
6275                         }
6276                         setIppErrorStatus();
6277                     }
6278                     else if( code == CV_HLS2BGR_FULL && dcn == 3 )
6279                     {
6280                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6281                         {
6282                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6283                             return;
6284                         }
6285                         setIppErrorStatus();
6286                     }
6287                     else if( code == CV_HLS2BGR_FULL && dcn == 4 )
6288                     {
6289                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6290                         {
6291                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6292                             return;
6293                         }
6294                         setIppErrorStatus();
6295                     }
6296                     else if( code == CV_HLS2RGB_FULL && dcn == 3 )
6297                     {
6298                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHLS2RGBTab[depth])) )
6299                         {
6300                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6301                             return;
6302                         }
6303                         setIppErrorStatus();
6304                     }
6305                     else if( code == CV_HLS2RGB_FULL && dcn == 4 )
6306                     {
6307                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6308                         {
6309                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6310                             return;
6311                         }
6312                         setIppErrorStatus();
6313                     }
6314                 }
6315             }
6316 #endif
6317
6318             if( code == CV_HSV2BGR || code == CV_HSV2RGB ||
6319                 code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL )
6320             {
6321                 if( depth == CV_8U )
6322                     CvtColorLoop(src, dst, HSV2RGB_b(dcn, bidx, hrange));
6323                 else
6324                     CvtColorLoop(src, dst, HSV2RGB_f(dcn, bidx, (float)hrange));
6325             }
6326             else
6327             {
6328                 if( depth == CV_8U )
6329                     CvtColorLoop(src, dst, HLS2RGB_b(dcn, bidx, hrange));
6330                 else
6331                     CvtColorLoop(src, dst, HLS2RGB_f(dcn, bidx, (float)hrange));
6332             }
6333             }
6334             break;
6335
6336         case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
6337         case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
6338             {
6339             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
6340             bidx = code == CV_BGR2Lab || code == CV_BGR2Luv ||
6341                    code == CV_LBGR2Lab || code == CV_LBGR2Luv ? 0 : 2;
6342             bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab ||
6343                         code == CV_BGR2Luv || code == CV_RGB2Luv;
6344
6345             _dst.create(sz, CV_MAKETYPE(depth, 3));
6346             dst = _dst.getMat();
6347
6348 #if defined HAVE_IPP && 0
6349             CV_IPP_CHECK()
6350             {
6351                 if (code == CV_LBGR2Lab && scn == 3 && depth == CV_8U)
6352                 {
6353                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToLab_8u_C3R)))
6354                     {
6355                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6356                         return;
6357                     }
6358                     setIppErrorStatus();
6359                 }
6360                 else if (code == CV_LBGR2Lab && scn == 4 && depth == CV_8U)
6361                 {
6362                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
6363                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 0, 1, 2, depth)))
6364                     {
6365                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6366                         return;
6367                     }
6368                     setIppErrorStatus();
6369                 }
6370                 else
6371                 if (code == CV_LRGB2Lab && scn == 3 && depth == CV_8U) // slower than OpenCV
6372                 {
6373                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
6374                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
6375                     {
6376                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6377                         return;
6378                     }
6379                     setIppErrorStatus();
6380                 }
6381                 else if (code == CV_LRGB2Lab && scn == 4 && depth == CV_8U) // slower than OpenCV
6382                 {
6383                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
6384                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
6385                     {
6386                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6387                         return;
6388                     }
6389                     setIppErrorStatus();
6390                 }
6391                 else if (code == CV_LRGB2Luv && scn == 3)
6392                 {
6393                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGBToLUVTab[depth])))
6394                     {
6395                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6396                         return;
6397                     }
6398                     setIppErrorStatus();
6399                 }
6400                 else if (code == CV_LRGB2Luv && scn == 4)
6401                 {
6402                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
6403                                                                            ippiRGBToLUVTab[depth], 0, 1, 2, depth)))
6404                     {
6405                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6406                         return;
6407                     }
6408                     setIppErrorStatus();
6409                 }
6410                 else if (code == CV_LBGR2Luv && scn == 3)
6411                 {
6412                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
6413                                                                            ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
6414                     {
6415                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6416                         return;
6417                     }
6418                     setIppErrorStatus();
6419                 }
6420                 else if (code == CV_LBGR2Luv && scn == 4)
6421                 {
6422                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
6423                                                                            ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
6424                     {
6425                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6426                         return;
6427                     }
6428                     setIppErrorStatus();
6429                 }
6430             }
6431 #endif
6432
6433             if( code == CV_BGR2Lab || code == CV_RGB2Lab ||
6434                 code == CV_LBGR2Lab || code == CV_LRGB2Lab )
6435             {
6436                 if( depth == CV_8U )
6437                     CvtColorLoop(src, dst, RGB2Lab_b(scn, bidx, 0, 0, srgb));
6438                 else
6439                     CvtColorLoop(src, dst, RGB2Lab_f(scn, bidx, 0, 0, srgb));
6440             }
6441             else
6442             {
6443                 if( depth == CV_8U )
6444                     CvtColorLoop(src, dst, RGB2Luv_b(scn, bidx, 0, 0, srgb));
6445                 else
6446                     CvtColorLoop(src, dst, RGB2Luv_f(scn, bidx, 0, 0, srgb));
6447             }
6448             }
6449             break;
6450
6451         case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
6452         case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
6453             {
6454             if( dcn <= 0 ) dcn = 3;
6455             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
6456             bidx = code == CV_Lab2BGR || code == CV_Luv2BGR ||
6457                    code == CV_Lab2LBGR || code == CV_Luv2LBGR ? 0 : 2;
6458             bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB ||
6459                     code == CV_Luv2BGR || code == CV_Luv2RGB;
6460
6461             _dst.create(sz, CV_MAKETYPE(depth, dcn));
6462             dst = _dst.getMat();
6463
6464 #if defined HAVE_IPP && 0
6465             CV_IPP_CHECK()
6466             {
6467                 if( code == CV_Lab2LBGR && dcn == 3 && depth == CV_8U)
6468                 {
6469                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R)) )
6470                     {
6471                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6472                         return;
6473                     }
6474                     setIppErrorStatus();
6475                 }
6476                 else if( code == CV_Lab2LBGR && dcn == 4 && depth == CV_8U )
6477                 {
6478                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
6479                                         ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6480                     {
6481                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6482                         return;
6483                     }
6484                     setIppErrorStatus();
6485                 }
6486                 if( code == CV_Lab2LRGB && dcn == 3 && depth == CV_8U )
6487                 {
6488                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
6489                                                                                ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6490                     {
6491                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6492                         return;
6493                     }
6494                     setIppErrorStatus();
6495                 }
6496                 else if( code == CV_Lab2LRGB && dcn == 4 && depth == CV_8U )
6497                 {
6498                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
6499                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6500                     {
6501                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6502                         return;
6503                     }
6504                     setIppErrorStatus();
6505                 }
6506                 if( code == CV_Luv2LRGB && dcn == 3 )
6507                 {
6508                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiLUVToRGBTab[depth])) )
6509                         return;
6510                 }
6511                 else if( code == CV_Luv2LRGB && dcn == 4 )
6512                 {
6513                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
6514                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6515                     {
6516                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6517                         return;
6518                     }
6519                 }
6520                 if( code == CV_Luv2LBGR && dcn == 3 )
6521                 {
6522                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
6523                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6524                     {
6525                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6526                         return;
6527                     }
6528                 }
6529                 else if( code == CV_Luv2LBGR && dcn == 4 )
6530                 {
6531                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
6532                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6533                     {
6534                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6535                         return;
6536                     }
6537                 }
6538             }
6539 #endif
6540
6541             if( code == CV_Lab2BGR || code == CV_Lab2RGB ||
6542                 code == CV_Lab2LBGR || code == CV_Lab2LRGB )
6543             {
6544                 if( depth == CV_8U )
6545                     CvtColorLoop(src, dst, Lab2RGB_b(dcn, bidx, 0, 0, srgb));
6546                 else
6547                     CvtColorLoop(src, dst, Lab2RGB_f(dcn, bidx, 0, 0, srgb));
6548             }
6549             else
6550             {
6551                 if( depth == CV_8U )
6552                     CvtColorLoop(src, dst, Luv2RGB_b(dcn, bidx, 0, 0, srgb));
6553                 else
6554                     CvtColorLoop(src, dst, Luv2RGB_f(dcn, bidx, 0, 0, srgb));
6555             }
6556             }
6557             break;
6558
6559         case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
6560         case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
6561         case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:
6562         case CV_BayerBG2BGR_EA: case CV_BayerGB2BGR_EA: case CV_BayerRG2BGR_EA: case CV_BayerGR2BGR_EA:
6563             demosaicing(src, _dst, code, dcn);
6564             break;
6565
6566         case CV_YUV2BGR_NV21:  case CV_YUV2RGB_NV21:  case CV_YUV2BGR_NV12:  case CV_YUV2RGB_NV12:
6567         case CV_YUV2BGRA_NV21: case CV_YUV2RGBA_NV21: case CV_YUV2BGRA_NV12: case CV_YUV2RGBA_NV12:
6568             {
6569                 // http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
6570                 // http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
6571
6572                 if (dcn <= 0) dcn = (code==CV_YUV420sp2BGRA || code==CV_YUV420sp2RGBA || code==CV_YUV2BGRA_NV12 || code==CV_YUV2RGBA_NV12) ? 4 : 3;
6573                 const int bIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2BGR_NV12 || code==CV_YUV2BGRA_NV12) ? 0 : 2;
6574                 const int uIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2RGB_NV21 || code==CV_YUV2RGBA_NV21) ? 1 : 0;
6575
6576                 CV_Assert( dcn == 3 || dcn == 4 );
6577                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6578
6579                 Size dstSz(sz.width, sz.height * 2 / 3);
6580                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6581                 dst = _dst.getMat();
6582
6583                 int srcstep = (int)src.step;
6584                 const uchar* y = src.ptr();
6585                 const uchar* uv = y + srcstep * dstSz.height;
6586
6587                 switch(dcn*100 + bIdx * 10 + uIdx)
6588                 {
6589                     case 300: cvtYUV420sp2RGB<0, 0> (dst, srcstep, y, uv); break;
6590                     case 301: cvtYUV420sp2RGB<0, 1> (dst, srcstep, y, uv); break;
6591                     case 320: cvtYUV420sp2RGB<2, 0> (dst, srcstep, y, uv); break;
6592                     case 321: cvtYUV420sp2RGB<2, 1> (dst, srcstep, y, uv); break;
6593                     case 400: cvtYUV420sp2RGBA<0, 0>(dst, srcstep, y, uv); break;
6594                     case 401: cvtYUV420sp2RGBA<0, 1>(dst, srcstep, y, uv); break;
6595                     case 420: cvtYUV420sp2RGBA<2, 0>(dst, srcstep, y, uv); break;
6596                     case 421: cvtYUV420sp2RGBA<2, 1>(dst, srcstep, y, uv); break;
6597                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
6598                 };
6599             }
6600             break;
6601         case CV_YUV2BGR_YV12: case CV_YUV2RGB_YV12: case CV_YUV2BGRA_YV12: case CV_YUV2RGBA_YV12:
6602         case CV_YUV2BGR_IYUV: case CV_YUV2RGB_IYUV: case CV_YUV2BGRA_IYUV: case CV_YUV2RGBA_IYUV:
6603             {
6604                 //http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
6605                 //http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxM Y plane followed by (N/2)x(M/2) U and V planes.
6606
6607                 if (dcn <= 0) dcn = (code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12 || code==CV_YUV2RGBA_IYUV || code==CV_YUV2BGRA_IYUV) ? 4 : 3;
6608                 const int bIdx = (code==CV_YUV2BGR_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2BGR_IYUV || code==CV_YUV2BGRA_IYUV) ? 0 : 2;
6609                 const int uIdx  = (code==CV_YUV2BGR_YV12 || code==CV_YUV2RGB_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12) ? 1 : 0;
6610
6611                 CV_Assert( dcn == 3 || dcn == 4 );
6612                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6613
6614                 Size dstSz(sz.width, sz.height * 2 / 3);
6615                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6616                 dst = _dst.getMat();
6617
6618                 int srcstep = (int)src.step;
6619                 const uchar* y = src.ptr();
6620                 const uchar* u = y + srcstep * dstSz.height;
6621                 const uchar* v = y + srcstep * (dstSz.height + dstSz.height/4) + (dstSz.width/2) * ((dstSz.height % 4)/2);
6622
6623                 int ustepIdx = 0;
6624                 int vstepIdx = dstSz.height % 4 == 2 ? 1 : 0;
6625
6626                 if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }
6627
6628                 switch(dcn*10 + bIdx)
6629                 {
6630                     case 30: cvtYUV420p2RGB<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
6631                     case 32: cvtYUV420p2RGB<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
6632                     case 40: cvtYUV420p2RGBA<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
6633                     case 42: cvtYUV420p2RGBA<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
6634                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
6635                 };
6636             }
6637             break;
6638         case CV_YUV2GRAY_420:
6639             {
6640                 if (dcn <= 0) dcn = 1;
6641
6642                 CV_Assert( dcn == 1 );
6643                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6644
6645                 Size dstSz(sz.width, sz.height * 2 / 3);
6646                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6647                 dst = _dst.getMat();
6648 #if defined HAVE_IPP
6649                 CV_IPP_CHECK()
6650                 {
6651                     if (ippStsNoErr == ippiCopy_8u_C1R(src.data, (int)src.step, dst.data, (int)dst.step,
6652                             ippiSize(dstSz.width, dstSz.height)))
6653                     {
6654                         CV_IMPL_ADD(CV_IMPL_IPP);
6655                         return;
6656                     }
6657                     setIppErrorStatus();
6658                 }
6659 #endif
6660                 src(Range(0, dstSz.height), Range::all()).copyTo(dst);
6661             }
6662             break;
6663         case CV_RGB2YUV_YV12: case CV_BGR2YUV_YV12: case CV_RGBA2YUV_YV12: case CV_BGRA2YUV_YV12:
6664         case CV_RGB2YUV_IYUV: case CV_BGR2YUV_IYUV: case CV_RGBA2YUV_IYUV: case CV_BGRA2YUV_IYUV:
6665             {
6666                 if (dcn <= 0) dcn = 1;
6667                 const int bIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_BGR2YUV_YV12 || code == CV_BGRA2YUV_YV12) ? 0 : 2;
6668                 const int uIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_RGB2YUV_IYUV || code == CV_RGBA2YUV_IYUV) ? 1 : 2;
6669
6670                 CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
6671                 CV_Assert( dcn == 1 );
6672                 CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
6673
6674                 Size dstSz(sz.width, sz.height / 2 * 3);
6675                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6676                 dst = _dst.getMat();
6677
6678                 switch(bIdx + uIdx*10)
6679                 {
6680                     case 10: cvtRGBtoYUV420p<0, 1>(src, dst); break;
6681                     case 12: cvtRGBtoYUV420p<2, 1>(src, dst); break;
6682                     case 20: cvtRGBtoYUV420p<0, 2>(src, dst); break;
6683                     case 22: cvtRGBtoYUV420p<2, 2>(src, dst); break;
6684                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
6685                 };
6686             }
6687             break;
6688         case CV_YUV2RGB_UYVY: case CV_YUV2BGR_UYVY: case CV_YUV2RGBA_UYVY: case CV_YUV2BGRA_UYVY:
6689         case CV_YUV2RGB_YUY2: case CV_YUV2BGR_YUY2: case CV_YUV2RGB_YVYU: case CV_YUV2BGR_YVYU:
6690         case CV_YUV2RGBA_YUY2: case CV_YUV2BGRA_YUY2: case CV_YUV2RGBA_YVYU: case CV_YUV2BGRA_YVYU:
6691             {
6692                 //http://www.fourcc.org/yuv.php#UYVY
6693                 //http://www.fourcc.org/yuv.php#YUY2
6694                 //http://www.fourcc.org/yuv.php#YVYU
6695
6696                 if (dcn <= 0) dcn = (code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2RGBA_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 4 : 3;
6697                 const int bIdx = (code==CV_YUV2BGR_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2BGR_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2BGR_YVYU || code==CV_YUV2BGRA_YVYU) ? 0 : 2;
6698                 const int ycn  = (code==CV_YUV2RGB_UYVY || code==CV_YUV2BGR_UYVY || code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY) ? 1 : 0;
6699                 const int uIdx = (code==CV_YUV2RGB_YVYU || code==CV_YUV2BGR_YVYU || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 1 : 0;
6700
6701                 CV_Assert( dcn == 3 || dcn == 4 );
6702                 CV_Assert( scn == 2 && depth == CV_8U );
6703
6704                 _dst.create(sz, CV_8UC(dcn));
6705                 dst = _dst.getMat();
6706
6707                 switch(dcn*1000 + bIdx*100 + uIdx*10 + ycn)
6708                 {
6709                     case 3000: cvtYUV422toRGB<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6710                     case 3001: cvtYUV422toRGB<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6711                     case 3010: cvtYUV422toRGB<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6712                     case 3011: cvtYUV422toRGB<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6713                     case 3200: cvtYUV422toRGB<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6714                     case 3201: cvtYUV422toRGB<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6715                     case 3210: cvtYUV422toRGB<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6716                     case 3211: cvtYUV422toRGB<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6717                     case 4000: cvtYUV422toRGBA<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6718                     case 4001: cvtYUV422toRGBA<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6719                     case 4010: cvtYUV422toRGBA<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6720                     case 4011: cvtYUV422toRGBA<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6721                     case 4200: cvtYUV422toRGBA<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6722                     case 4201: cvtYUV422toRGBA<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6723                     case 4210: cvtYUV422toRGBA<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6724                     case 4211: cvtYUV422toRGBA<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6725                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
6726                 };
6727             }
6728             break;
6729         case CV_YUV2GRAY_UYVY: case CV_YUV2GRAY_YUY2:
6730             {
6731                 if (dcn <= 0) dcn = 1;
6732
6733                 CV_Assert( dcn == 1 );
6734                 CV_Assert( scn == 2 && depth == CV_8U );
6735
6736                 extractChannel(_src, _dst, code == CV_YUV2GRAY_UYVY ? 1 : 0);
6737             }
6738             break;
6739         case CV_RGBA2mRGBA:
6740             {
6741                 if (dcn <= 0) dcn = 4;
6742                 CV_Assert( scn == 4 && dcn == 4 );
6743
6744                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
6745                 dst = _dst.getMat();
6746
6747                 if( depth == CV_8U )
6748                 {
6749 #if defined(HAVE_IPP)
6750                     CV_IPP_CHECK()
6751                     {
6752                         if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R)))
6753                         {
6754                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6755                             return;
6756                         }
6757                         setIppErrorStatus();
6758                     }
6759 #endif
6760                     CvtColorLoop(src, dst, RGBA2mRGBA<uchar>());
6761                 }
6762                 else
6763                 {
6764                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
6765                 }
6766             }
6767             break;
6768         case CV_mRGBA2RGBA:
6769             {
6770                 if (dcn <= 0) dcn = 4;
6771                 CV_Assert( scn == 4 && dcn == 4 );
6772
6773                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
6774                 dst = _dst.getMat();
6775
6776                 if( depth == CV_8U )
6777                     CvtColorLoop(src, dst, mRGBA2RGBA<uchar>());
6778                 else
6779                 {
6780                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
6781                 }
6782             }
6783             break;
6784         default:
6785             CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
6786     }
6787 }
6788
6789 CV_IMPL void  // C-style entry point that forwards a colour conversion to cv::cvtColor; returns nothing.
6790 cvCvtColor( const CvArr* srcarr, CvArr* dstarr, int code )  // srcarr: input array; dstarr: output array; code: conversion code handed unchanged to cv::cvtColor.
6791 {
6792     cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;  // dst0 keeps the Mat obtained from dstarr; dst is the Mat actually passed to cvtColor.
6793     CV_Assert( src.depth() == dst.depth() );  // Input and output must have the same depth before converting.
6794
6795     cv::cvtColor(src, dst, code, dst.channels());  // The channel count of the caller's output array is passed as the fourth argument.
6796     CV_Assert( dst.data == dst0.data );  // dst must still point at dst0's data; NOTE(review): presumably catches cvtColor reallocating the output when dstarr has the wrong size/type - confirm.
6797 }
6798
6799
6800 /* End of file. */