modules/imgproc/src/color.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 /********************************* COPYRIGHT NOTICE *******************************\
  44   The function for RGB to Lab conversion is based on the MATLAB script
  45   RGB2Lab.m translated by Mark Ruzon from C code by Yossi Rubner, 23 September 1997.
  46   See the page [http://vision.stanford.edu/~ruzon/software/rgblab.html]
  47 \**********************************************************************************/
  48
  49 /********************************* COPYRIGHT NOTICE *******************************\
  50   Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
  51   from MD-Mathematische Dienste GmbH. Below is the copyright notice:
  52
  53     IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
  54     By downloading, copying, installing or using the software you agree
  55     to this license. If you do not agree to this license, do not download,
  56     install, copy or use the software.
  57
  58     Contributors License Agreement:
  59
  60       Copyright (c) 2002,
  61       MD-Mathematische Dienste GmbH
  62       Im Defdahl 5-10
  63       44141 Dortmund
  64       Germany
  65       www.md-it.de
  66
  67     Redistribution and use in source and binary forms,
  68     with or without modification, are permitted provided
  69     that the following conditions are met:
  70
  71     Redistributions of source code must retain
  72     the above copyright notice, this list of conditions and the following disclaimer.
  73     Redistributions in binary form must reproduce the above copyright notice,
  74     this list of conditions and the following disclaimer in the documentation
  75     and/or other materials provided with the distribution.
  76     The name of Contributor may not be used to endorse or promote products
  77     derived from this software without specific prior written permission.
  78
  79     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  80     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  81     THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  82     PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
  83     FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  84     DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  85     OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  86     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  87     STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  88     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  89     THE POSSIBILITY OF SUCH DAMAGE.
  90 \**********************************************************************************/
  91
  92 #include "precomp.hpp"
  93 #include "opencl_kernels_imgproc.hpp"
  94 #include <limits>
  95
// Fixed-point descale: adds 1 << (n-1) (half of 2^n) to x and then shifts right by n bits,
// i.e. for non-negative x it divides by 2^n rounding to nearest instead of truncating.
// n must be at least 1, otherwise the inner shift count (n-1) is negative.
#define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
  97
#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
// Per-type constants that the ippiSwapChannels_*_C3C4Rf wrappers further down pass as
// the trailing argument of the IPP 3->4 channel swap primitives.
// NOTE(review): per the IPP API this is presumably the value written to the added
// channel - confirm against the IPP reference.
#define MAX_IPP8u   255
#define MAX_IPP16u  65535
#define MAX_IPP32f  1.0
// Calls ippInit() once while this translation unit's statics are initialized.
// The returned status is stored in sts; nothing in the visible code reads it.
static IppStatus sts = ippInit();
#endif
 104
 105 namespace cv
 106 {
 107
 108 // computes cubic spline coefficients for a function: (xi=i, yi=f[i]), i=0..n
 109 template<typename _Tp> static void splineBuild(const _Tp* f, int n, _Tp* tab)
 110 {
 111     _Tp cn = 0;
 112     int i;
 113     tab[0] = tab[1] = (_Tp)0;
 114
 115     for(i = 1; i < n-1; i++)
 116     {
 117         _Tp t = 3*(f[i+1] - 2*f[i] + f[i-1]);
 118         _Tp l = 1/(4 - tab[(i-1)*4]);
 119         tab[i*4] = l; tab[i*4+1] = (t - tab[(i-1)*4+1])*l;
 120     }
 121
 122     for(i = n-1; i >= 0; i--)
 123     {
 124         _Tp c = tab[i*4+1] - tab[i*4]*cn;
 125         _Tp b = f[i+1] - f[i] - (cn + c*2)*(_Tp)0.3333333333333333;
 126         _Tp d = (cn - c)*(_Tp)0.3333333333333333;
 127         tab[i*4] = f[i]; tab[i*4+1] = b;
 128         tab[i*4+2] = c; tab[i*4+3] = d;
 129         cn = c;
 130     }
 131 }
 132
// interpolates value of a function at x, 0 <= x <= n using a cubic spline.
//   x   - evaluation point; the segment index int(x) is clamped to [0, n-1], so an x
//         outside [0, n] is evaluated with the first or last segment's polynomial
//   tab - coefficient table with 4 values per segment, in the {a, b, c, d} order that
//         splineBuild writes
//   n   - number of segments
// Returns a + b*t + c*t^2 + d*t^3 evaluated in Horner form, where t = x - segment index.
template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab, int n)
{
    // don't touch this function without urgent need - some versions of gcc fail to inline it correctly
    int ix = std::min(std::max(int(x), 0), n-1);
    x -= ix;
    tab += ix*4;
    return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
}
 142
 143
 144 template<typename _Tp> struct ColorChannel
 145 {
 146     typedef float worktype_f;
 147     static _Tp max() { return std::numeric_limits<_Tp>::max(); }
 148     static _Tp half() { return (_Tp)(max()/2 + 1); }
 149 };
 150
 151 template<> struct ColorChannel<float>
 152 {
 153     typedef float worktype_f;
 154     static float max() { return 1.f; }
 155     static float half() { return 0.5f; }
 156 };
 157
 158 /*template<> struct ColorChannel<double>
 159 {
 160     typedef double worktype_f;
 161     static double max() { return 1.; }
 162     static double half() { return 0.5; }
 163 };*/
 164
 165
 166 ///////////////////////////// Top-level template function ////////////////////////////////
 167
 168 template <typename Cvt>
 169 class CvtColorLoop_Invoker : public ParallelLoopBody
 170 {
 171     typedef typename Cvt::channel_type _Tp;
 172 public:
 173
 174     CvtColorLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt) :
 175         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt)
 176     {
 177     }
 178
 179     virtual void operator()(const Range& range) const
 180     {
 181         const uchar* yS = src.ptr<uchar>(range.start);
 182         uchar* yD = dst.ptr<uchar>(range.start);
 183
 184         for( int i = range.start; i < range.end; ++i, yS += src.step, yD += dst.step )
 185             cvt((const _Tp*)yS, (_Tp*)yD, src.cols);
 186     }
 187
 188 private:
 189     const Mat& src;
 190     Mat& dst;
 191     const Cvt& cvt;
 192
 193     const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
 194 };
 195
 196 template <typename Cvt>
 197 void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
 198 {
 199     parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
 200 }
 201
 202 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 203
// Type-erased signatures (pixel pointers passed as void*) that the IPP primitives used in
// this file are cast to, so primitives for different pixel types can share one table.
// Argument meaning, as used by the functors in this file:
//   (src, srcStep, dst, dstStep, roi[, extra])
// Reorder: extra = channel order array.
typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *);
// General: no extra argument.
typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize);
// Color to gray: extra = array of float channel weights.
typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *);
 207
 208 template <typename Cvt>
 209 class CvtColorIPPLoop_Invoker :
 210         public ParallelLoopBody
 211 {
 212 public:
 213
 214     CvtColorIPPLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt, bool *_ok) :
 215         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt), ok(_ok)
 216     {
 217         *ok = true;
 218     }
 219
 220     virtual void operator()(const Range& range) const
 221     {
 222         const void *yS = src.ptr<uchar>(range.start);
 223         void *yD = dst.ptr<uchar>(range.start);
 224         if( !cvt(yS, (int)src.step[0], yD, (int)dst.step[0], src.cols, range.end - range.start) )
 225             *ok = false;
 226         else
 227         {
 228             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
 229         }
 230     }
 231
 232 private:
 233     const Mat& src;
 234     Mat& dst;
 235     const Cvt& cvt;
 236     bool *ok;
 237
 238     const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
 239 };
 240
 241 template <typename Cvt>
 242 bool CvtColorIPPLoop(const Mat& src, Mat& dst, const Cvt& cvt)
 243 {
 244     bool ok;
 245     parallel_for_(Range(0, src.rows), CvtColorIPPLoop_Invoker<Cvt>(src, dst, cvt, &ok), src.total()/(double)(1<<16) );
 246     return ok;
 247 }
 248
 249 template <typename Cvt>
 250 bool CvtColorIPPLoopCopy(Mat& src, Mat& dst, const Cvt& cvt)
 251 {
 252     Mat temp;
 253     Mat &source = src;
 254     if( src.data == dst.data )
 255     {
 256         src.copyTo(temp);
 257         source = temp;
 258     }
 259     bool ok;
 260     parallel_for_(Range(0, source.rows), CvtColorIPPLoop_Invoker<Cvt>(source, dst, cvt, &ok),
 261                   source.total()/(double)(1<<16) );
 262     return ok;
 263 }
 264
// Adapter that gives ippiSwapChannels_8u_C3C4R the six-argument shape of ippiReorderFunc:
// forwards every argument unchanged and supplies MAX_IPP8u as the primitive's trailing
// value argument.
static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return ippiSwapChannels_8u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u);
}
 270
// Adapter that gives ippiSwapChannels_16u_C3C4R the six-argument shape of ippiReorderFunc:
// forwards every argument unchanged and supplies MAX_IPP16u as the primitive's trailing
// value argument.
static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return ippiSwapChannels_16u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u);
}
 276
// Adapter that gives ippiSwapChannels_32f_C3C4R the six-argument shape of ippiReorderFunc:
// forwards every argument unchanged and supplies MAX_IPP32f as the primitive's trailing
// value argument.
static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep,
         IppiSize roiSize, const int *dstOrder)
{
    return ippiSwapChannels_32f_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f);
}
 282
// Dispatch tables of IPP channel reorder/copy primitives, cast to a common signature.
// Every table has 8 slots; only slots 0 (8u), 2 (16u) and 5 (32f) are populated and a
// null slot means no primitive is listed for that index.
// NOTE(review): the slot indices look like OpenCV depth codes - confirm against the
// callers, which are outside this part of the file.

// 3 channels -> 4 channels with reordering (through the local *_C3C4Rf adapters).
static ippiReorderFunc ippiSwapChannelsC3C4RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0
};

// ippiCopy_*_AC4C3R primitives (4-channel source, 3-channel destination per IPP naming).
static ippiGeneralFunc ippiCopyAC4C3RTab[] =
{
    (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0,
    0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0
};

// 4 channels -> 3 channels with reordering.
static ippiReorderFunc ippiSwapChannelsC4C3RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0
};

// 3 channels -> 3 channels with reordering.
static ippiReorderFunc ippiSwapChannelsC3RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0
};
 306
// 4 channels -> 4 channels with reordering; only compiled when IPP_VERSION_X100 >= 801.
// Same 8-slot layout as the other tables: slots 0 (8u), 2 (16u) and 5 (32f) are populated.
#if IPP_VERSION_X100 >= 801
static ippiReorderFunc ippiSwapChannelsC4RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0
};
#endif
 314
// Dispatch tables of IPP gray-conversion primitives, cast to a common signature.
// Every table has 8 slots; only slots 0 (8u), 2 (16u) and 5 (32f) are populated.
// NOTE(review): the slot indices look like OpenCV depth codes - confirm against the
// callers, which are outside this part of the file.

// Color -> gray with caller-supplied weights, 3-channel source.
static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
{
    (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
    0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
};

// Color -> gray with caller-supplied weights, 4-channel (AC4) source.
static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
{
    (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
    0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
};

// RGB -> gray without a weights argument, 3-channel source.
static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
{
    (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
    0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
};

// RGB -> gray without a weights argument, 4-channel (AC4) source.
static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
{
    (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
    0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
};

// ippiCopy_*_P3C3R primitives (three source planes -> one 3-channel image per IPP naming).
static ippiGeneralFunc ippiCopyP3C3RTab[] =
{
    (ippiGeneralFunc)ippiCopy_8u_P3C3R, 0, (ippiGeneralFunc)ippiCopy_16u_P3C3R, 0,
    0, (ippiGeneralFunc)ippiCopy_32f_P3C3R, 0, 0
};
 344
// Dispatch tables of IPP 3-channel color space conversion primitives, cast to
// ippiGeneralFunc. Every table has 8 slots; slot 0 holds the 8u primitive, slot 2 the
// 16u one and slot 5 the 32f one where listed; all other slots are null.
// NOTE(review): the slot indices look like OpenCV depth codes - confirm against the
// callers, which are outside this part of the file.

// RGB -> XYZ
static ippiGeneralFunc ippiRGB2XYZTab[] =
{
    (ippiGeneralFunc)ippiRGBToXYZ_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToXYZ_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiRGBToXYZ_32f_C3R, 0, 0
};

// XYZ -> RGB
static ippiGeneralFunc ippiXYZ2RGBTab[] =
{
    (ippiGeneralFunc)ippiXYZToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiXYZToRGB_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiXYZToRGB_32f_C3R, 0, 0
};

// RGB -> HSV (no 32f entry)
static ippiGeneralFunc ippiRGB2HSVTab[] =
{
    (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0,
    0, 0, 0, 0
};

// HSV -> RGB (no 32f entry)
static ippiGeneralFunc ippiHSV2RGBTab[] =
{
    (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0,
    0, 0, 0, 0
};

// RGB -> HLS
static ippiGeneralFunc ippiRGB2HLSTab[] =
{
    (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0
};

// HLS -> RGB
static ippiGeneralFunc ippiHLS2RGBTab[] =
{
    (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0
};
 380
// RGB <-> LUV dispatch tables. The trailing "&& 0" makes this condition always false,
// so these tables are currently never compiled.
#if !defined(HAVE_IPP_ICV_ONLY) && 0
static ippiGeneralFunc ippiRGBToLUVTab[] =
{
    (ippiGeneralFunc)ippiRGBToLUV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToLUV_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiRGBToLUV_32f_C3R, 0, 0
};

static ippiGeneralFunc ippiLUVToRGBTab[] =
{
    (ippiGeneralFunc)ippiLUVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiLUVToRGB_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiLUVToRGB_32f_C3R, 0, 0
};
#endif
 394
 395 struct IPPGeneralFunctor
 396 {
 397     IPPGeneralFunctor(ippiGeneralFunc _func) : func(_func){}
 398     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 399     {
 400         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false;
 401     }
 402 private:
 403     ippiGeneralFunc func;
 404 };
 405
 406 struct IPPReorderFunctor
 407 {
 408     IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : func(_func)
 409     {
 410         order[0] = _order0;
 411         order[1] = _order1;
 412         order[2] = _order2;
 413         order[3] = 3;
 414     }
 415     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 416     {
 417         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false;
 418     }
 419 private:
 420     ippiReorderFunc func;
 421     int order[4];
 422 };
 423
 424 struct IPPColor2GrayFunctor
 425 {
 426     IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
 427         func(_func)
 428     {
 429         coeffs[0] = 0.114f;
 430         coeffs[1] = 0.587f;
 431         coeffs[2] = 0.299f;
 432     }
 433     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 434     {
 435         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false;
 436     }
 437 private:
 438     ippiColor2GrayFunc func;
 439     Ipp32f coeffs[3];
 440 };
 441
 442 struct IPPGray2BGRFunctor
 443 {
 444     IPPGray2BGRFunctor(ippiGeneralFunc _func) :
 445         func(_func)
 446     {
 447     }
 448
 449     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 450     {
 451         if (func == 0)
 452             return false;
 453
 454         const void* srcarray[3] = { src, src, src };
 455         return func(srcarray, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0;
 456     }
 457 private:
 458     ippiGeneralFunc func;
 459 };
 460
 461 struct IPPGray2BGRAFunctor
 462 {
 463     IPPGray2BGRAFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _depth) :
 464         func1(_func1), func2(_func2), depth(_depth)
 465     {
 466     }
 467
 468     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 469     {
 470         if (func1 == 0 || func2 == 0)
 471             return false;
 472
 473         const void* srcarray[3] = { src, src, src };
 474         Mat temp(rows, cols, CV_MAKETYPE(depth, 3));
 475         if(func1(srcarray, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
 476             return false;
 477         int order[4] = {0, 1, 2, 3};
 478         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
 479     }
 480 private:
 481     ippiGeneralFunc func1;
 482     ippiReorderFunc func2;
 483     int depth;
 484 };
 485
 486 struct IPPReorderGeneralFunctor
 487 {
 488     IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
 489         func1(_func1), func2(_func2), depth(_depth)
 490     {
 491         order[0] = _order0;
 492         order[1] = _order1;
 493         order[2] = _order2;
 494         order[3] = 3;
 495     }
 496     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 497     {
 498         if (func1 == 0 || func2 == 0)
 499             return false;
 500
 501         Mat temp;
 502         temp.create(rows, cols, CV_MAKETYPE(depth, 3));
 503         if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0)
 504             return false;
 505         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
 506     }
 507 private:
 508     ippiReorderFunc func1;
 509     ippiGeneralFunc func2;
 510     int order[4];
 511     int depth;
 512 };
 513
 514 struct IPPGeneralReorderFunctor
 515 {
 516     IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
 517         func1(_func1), func2(_func2), depth(_depth)
 518     {
 519         order[0] = _order0;
 520         order[1] = _order1;
 521         order[2] = _order2;
 522         order[3] = 3;
 523     }
 524     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
 525     {
 526         if (func1 == 0 || func2 == 0)
 527             return false;
 528
 529         Mat temp;
 530         temp.create(rows, cols, CV_MAKETYPE(depth, 3));
 531         if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
 532             return false;
 533         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
 534     }
 535 private:
 536     ippiGeneralFunc func1;
 537     ippiReorderFunc func2;
 538     int order[4];
 539     int depth;
 540 };
 541
 542 #endif
 543
 544 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
 545
 546 template<typename _Tp> struct RGB2RGB
 547 {
 548     typedef _Tp channel_type;
 549
 550     RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}
 551     void operator()(const _Tp* src, _Tp* dst, int n) const
 552     {
 553         int scn = srccn, dcn = dstcn, bidx = blueIdx;
 554         if( dcn == 3 )
 555         {
 556             n *= 3;
 557             for( int i = 0; i < n; i += 3, src += scn )
 558             {
 559                 _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
 560                 dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
 561             }
 562         }
 563         else if( scn == 3 )
 564         {
 565             n *= 3;
 566             _Tp alpha = ColorChannel<_Tp>::max();
 567             for( int i = 0; i < n; i += 3, dst += 4 )
 568             {
 569                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
 570                 dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
 571             }
 572         }
 573         else
 574         {
 575             n *= 4;
 576             for( int i = 0; i < n; i += 4 )
 577             {
 578                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
 579                 dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
 580             }
 581         }
 582     }
 583
 584     int srccn, dstcn, blueIdx;
 585 };
 586
 587 #if CV_NEON
 588
// NEON specialization of RGB2RGB for 8-bit channels (compiled only when CV_NEON is set).
// It handles the same three cases as the generic template (-> 3 channels, 3 -> 4
// channels, 4 -> 4 channels). Each case runs three loops over the remaining data:
// 16 pixels per iteration with 128-bit de-interleaving loads/stores, then 8 pixels per
// iteration with 64-bit ones, then a scalar loop for the remaining pixels.
// In every loop `i` counts elements on the side that is indexed with `+ i`, while the
// other pointer is advanced directly by its own per-iteration stride.
template<> struct RGB2RGB<uchar>
{
    typedef uchar channel_type;

    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
        srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
    {
        // alpha fill vectors for the 3 -> 4 channel case: 16 lanes and 8 lanes of max()
        v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
        v_alpha2 = vget_low_u8(v_alpha);
    }

    // Converts n pixels from src to dst.
    void operator()(const uchar * src, uchar * dst, int n) const
    {
        int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
        if (dcn == 3)
        {
            // destination has 3 channels; i indexes dst elements
            n *= 3;
            if (scn == 3)
            {
                // 3 -> 3: 16 pixels (48 bytes in, 48 bytes out) per iteration
                for ( ; i <= n - 48; i += 48, src += 48 )
                {
                    uint8x16x3_t v_src = vld3q_u8(src), v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3q_u8(dst + i, v_dst);
                }
                // 8 pixels (24 bytes) per iteration
                for ( ; i <= n - 24; i += 24, src += 24 )
                {
                    uint8x8x3_t v_src = vld3_u8(src), v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3_u8(dst + i, v_dst);
                }
                // scalar tail
                for ( ; i < n; i += 3, src += 3 )
                {
                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
                }
            }
            else
            {
                // 4 -> 3: 16 pixels (64 bytes in, 48 bytes out) per iteration;
                // the fourth source channel is dropped
                for ( ; i <= n - 48; i += 48, src += 64 )
                {
                    uint8x16x4_t v_src = vld4q_u8(src);
                    uint8x16x3_t v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3q_u8(dst + i, v_dst);
                }
                // 8 pixels (32 bytes in, 24 bytes out) per iteration
                for ( ; i <= n - 24; i += 24, src += 32 )
                {
                    uint8x8x4_t v_src = vld4_u8(src);
                    uint8x8x3_t v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3_u8(dst + i, v_dst);
                }
                // scalar tail
                for ( ; i < n; i += 3, src += 4 )
                {
                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
                }
            }
        }
        else if (scn == 3)
        {
            // 3 -> 4: i indexes src elements; the fourth channel is filled with max()
            n *= 3;
            // 16 pixels (48 bytes in, 64 bytes out) per iteration
            for ( ; i <= n - 48; i += 48, dst += 64 )
            {
                uint8x16x3_t v_src = vld3q_u8(src + i);
                uint8x16x4_t v_dst;
                v_dst.val[bidx] = v_src.val[0];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[bidx ^ 2] = v_src.val[2];
                v_dst.val[3] = v_alpha;
                vst4q_u8(dst, v_dst);
            }
            // 8 pixels (24 bytes in, 32 bytes out) per iteration
            for ( ; i <= n - 24; i += 24, dst += 32 )
            {
                uint8x8x3_t v_src = vld3_u8(src + i);
                uint8x8x4_t v_dst;
                v_dst.val[bidx] = v_src.val[0];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[bidx ^ 2] = v_src.val[2];
                v_dst.val[3] = v_alpha2;
                vst4_u8(dst, v_dst);
            }
            // scalar tail
            uchar alpha = ColorChannel<uchar>::max();
            for (; i < n; i += 3, dst += 4 )
            {
                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
            }
        }
        else
        {
            // 4 -> 4: channels 0 and 2 are swapped unconditionally (bidx is not used);
            // i indexes both src and dst
            n *= 4;
            // 16 pixels (64 bytes) per iteration
            for ( ; i <= n - 64; i += 64 )
            {
                uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
                v_dst.val[0] = v_src.val[2];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[2] = v_src.val[0];
                v_dst.val[3] = v_src.val[3];
                vst4q_u8(dst + i, v_dst);
            }
            // 8 pixels (32 bytes) per iteration
            for ( ; i <= n - 32; i += 32 )
            {
                uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
                v_dst.val[0] = v_src.val[2];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[2] = v_src.val[0];
                v_dst.val[3] = v_src.val[3];
                vst4_u8(dst + i, v_dst);
            }
            // scalar tail
            for ( ; i < n; i += 4)
            {
                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
                dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
            }
        }
    }

    int srccn, dstcn, blueIdx;

    // alpha fill values: 16-lane and 8-lane vectors of ColorChannel<uchar>::max()
    uint8x16_t v_alpha;
    uint8x8_t v_alpha2;
};
 721
 722 #endif
 723
 724 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
 725
// Row functor that unpacks 16-bit packed pixels into 3- or 4-channel 8-bit pixels.
//   dstcn     - destination channel count; a fourth channel is written only when it is 4
//   blueIdx   - destination index (0 or 2) that receives the low 5 bits of the packed
//               value; the high colour field goes to index blueIdx ^ 2
//   greenBits - 6 selects the 5-6-5 layout; any other value is treated as 5-5-5 with
//               bit 15 used for the fourth channel
// Each field is placed in the top bits of its output byte; the remaining low bits are
// left at zero.
// operator()(src, dst, n) converts n pixels; src is read as an array of ushort.
// With CV_NEON the bulk of the row is processed 16 pixels at a time, and the scalar
// loop handles the remainder (or the whole row without NEON).
struct RGB5x52RGB
{
    typedef uchar channel_type;

    RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
        : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits)
    {
        #if CV_NEON
        // vector copies of the masks and constants used by the scalar code below
        v_n3 = vdupq_n_u16(~3);
        v_n7 = vdupq_n_u16(~7);
        v_255 = vdupq_n_u8(255);
        v_0 = vdupq_n_u8(0);
        v_mask = vdupq_n_u16(0x8000);
        #endif
    }

    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
        if( greenBits == 6 )
        {
            // 5-6-5: bits 0..4, 5..10 and 11..15
            #if CV_NEON
            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
            {
                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
                // vmovn keeps the low 8 bits of each 16-bit lane, like the (uchar) casts below
                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3)));
                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7)));
                if (dcn == 3)
                {
                    uint8x16x3_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    vst3q_u8(dst, v_dst);
                }
                else
                {
                    uint8x16x4_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    v_dst.val[3] = v_255;
                    vst4q_u8(dst, v_dst);
                }
            }
            #endif
            for( ; i < n; i++, dst += dcn )
            {
                unsigned t = ((const ushort*)src)[i];
                dst[bidx] = (uchar)(t << 3);              // bits 0..4  -> top 5 bits
                dst[1] = (uchar)((t >> 3) & ~3);          // bits 5..10 -> top 6 bits
                dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);   // bits 11..15 -> top 5 bits
                if( dcn == 4 )
                    dst[3] = 255;                         // 5-6-5 has no spare bit: always 255
            }
        }
        else
        {
            // 5-5-5: bits 0..4, 5..9 and 10..14; bit 15 drives the fourth channel
            #if CV_NEON
            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
            {
                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7)));
                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7)));
                if (dcn == 3)
                {
                    uint8x16x3_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    vst3q_u8(dst, v_dst);
                }
                else
                {
                    uint8x16x4_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    // saturating narrow turns 0x8000 into 0xFF and 0 into 0, giving a
                    // per-lane select mask: 255 where bit 15 is set, 0 elsewhere
                    v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)),
                                                        vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0);
                    vst4q_u8(dst, v_dst);
                }
            }
            #endif
            for( ; i < n; i++, dst += dcn )
            {
                unsigned t = ((const ushort*)src)[i];
                dst[bidx] = (uchar)(t << 3);              // bits 0..4   -> top 5 bits
                dst[1] = (uchar)((t >> 2) & ~7);          // bits 5..9   -> top 5 bits
                dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);   // bits 10..14 -> top 5 bits
                if( dcn == 4 )
                    dst[3] = t & 0x8000 ? 255 : 0;        // bit 15 -> 255 or 0
            }
        }
    }

    int dstcn, blueIdx, greenBits;
    #if CV_NEON
    // constants preloaded by the constructor for the NEON loops
    uint16x8_t v_n3, v_n7, v_mask;
    uint8x16_t v_255, v_0;
    #endif
};
 834
 835
 836 struct RGB2RGB5x5
 837 {
 838     typedef uchar channel_type;
 839
 840     RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
 841         : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits)
 842     {
 843         #if CV_NEON
 844         v_n3 = vdup_n_u8(~3);
 845         v_n7 = vdup_n_u8(~7);
 846         v_mask = vdupq_n_u16(0x8000);
 847         v_0 = vdupq_n_u16(0);
 848         v_full = vdupq_n_u16(0xffff);
 849         #endif
 850     }
 851
 852     void operator()(const uchar* src, uchar* dst, int n) const
 853     {
 854         int scn = srccn, bidx = blueIdx, i = 0;
 855         if (greenBits == 6)
 856         {
 857             if (scn == 3)
 858             {
 859                 #if CV_NEON
 860                 for ( ; i <= n - 8; i += 8, src += 24 )
 861                 {
 862                     uint8x8x3_t v_src = vld3_u8(src);
 863                     uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
 864                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
 865                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
 866                     vst1q_u16((ushort *)dst + i, v_dst);
 867                 }
 868                 #endif
 869                 for ( ; i < n; i++, src += 3 )
 870                     ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
 871             }
 872             else
 873             {
 874                 #if CV_NEON
 875                 for ( ; i <= n - 8; i += 8, src += 32 )
 876                 {
 877                     uint8x8x4_t v_src = vld4_u8(src);
 878                     uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
 879                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
 880                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
 881                     vst1q_u16((ushort *)dst + i, v_dst);
 882                 }
 883                 #endif
 884                 for ( ; i < n; i++, src += 4 )
 885                     ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
 886             }
 887         }
 888         else if (scn == 3)
 889         {
 890             #if CV_NEON
 891             for ( ; i <= n - 8; i += 8, src += 24 )
 892             {
 893                 uint8x8x3_t v_src = vld3_u8(src);
 894                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
 895                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
 896                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7));
 897                 vst1q_u16((ushort *)dst + i, v_dst);
 898             }
 899             #endif
 900             for ( ; i < n; i++, src += 3 )
 901                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
 902         }
 903         else
 904         {
 905             #if CV_NEON
 906             for ( ; i <= n - 8; i += 8, src += 32 )
 907             {
 908                 uint8x8x4_t v_src = vld4_u8(src);
 909                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
 910                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
 911                 v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7),
 912                                                    vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0)));
 913                 vst1q_u16((ushort *)dst + i, v_dst);
 914             }
 915             #endif
 916             for ( ; i < n; i++, src += 4 )
 917                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
 918                     ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
 919         }
 920     }
 921
 922     int srccn, blueIdx, greenBits;
 923     #if CV_NEON
 924     uint8x8_t v_n3, v_n7;
 925     uint16x8_t v_mask, v_0, v_full;
 926     #endif
 927 };
 928
 929 ///////////////////////////////// Color to/from Grayscale ////////////////////////////////
 930
 931 template<typename _Tp>
 932 struct Gray2RGB
 933 {
 934     typedef _Tp channel_type;
 935
 936     Gray2RGB(int _dstcn) : dstcn(_dstcn) {}
 937     void operator()(const _Tp* src, _Tp* dst, int n) const
 938     {
 939         if( dstcn == 3 )
 940             for( int i = 0; i < n; i++, dst += 3 )
 941             {
 942                 dst[0] = dst[1] = dst[2] = src[i];
 943             }
 944         else
 945         {
 946             _Tp alpha = ColorChannel<_Tp>::max();
 947             for( int i = 0; i < n; i++, dst += 4 )
 948             {
 949                 dst[0] = dst[1] = dst[2] = src[i];
 950                 dst[3] = alpha;
 951             }
 952         }
 953     }
 954
 955     int dstcn;
 956 };
 957
 958
 959 struct Gray2RGB5x5
 960 {
 961     typedef uchar channel_type;
 962
 963     Gray2RGB5x5(int _greenBits) : greenBits(_greenBits)
 964     {
 965         #if CV_NEON
 966         v_n7 = vdup_n_u8(~7);
 967         v_n3 = vdup_n_u8(~3);
 968         #endif
 969     }
 970
 971     void operator()(const uchar* src, uchar* dst, int n) const
 972     {
 973         int i = 0;
 974         if( greenBits == 6 )
 975         {
 976             #if CV_NEON
 977             for ( ; i <= n - 8; i += 8 )
 978             {
 979                 uint8x8_t v_src = vld1_u8(src + i);
 980                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3));
 981                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3));
 982                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8));
 983                 vst1q_u16((ushort *)dst + i, v_dst);
 984             }
 985             #endif
 986             for ( ; i < n; i++ )
 987             {
 988                 int t = src[i];
 989                 ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
 990             }
 991         }
 992         else
 993         {
 994             #if CV_NEON
 995             for ( ; i <= n - 8; i += 8 )
 996             {
 997                 uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3));
 998                 uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10));
 999                 vst1q_u16((ushort *)dst + i, v_dst);
1000             }
1001             #endif
1002             for( ; i < n; i++ )
1003             {
1004                 int t = src[i] >> 3;
1005                 ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
1006             }
1007         }
1008     }
1009     int greenBits;
1010
1011     #if CV_NEON
1012     uint8x8_t v_n7, v_n3;
1013     #endif
1014 };
1015
1016
// Remove any macro definitions of these names so they can be declared as enumerators.
#undef B2Y
#undef G2Y
#undef R2Y

// Fixed-point constants shared by the integer conversion kernels.
enum
{
    // Number of fractional bits in the integer luma arithmetic.
    yuv_shift = 14,

    // Luma weights 0.299 / 0.587 / 0.114 scaled by 2^yuv_shift and rounded;
    // the three values add up to exactly 1 << yuv_shift.
    R2Y = 4899,
    G2Y = 9617,
    B2Y = 1868,

    // NOTE(review): presumably the fractional-bit count for the XYZ kernels; not used in this part of the file.
    xyz_shift = 12,

    // NOTE(review): presumably a per-chunk processing length; confirm against its users.
    BLOCK_SIZE = 256
};
1030
1031
1032 struct RGB5x52Gray
1033 {
1034     typedef uchar channel_type;
1035
1036     RGB5x52Gray(int _greenBits) : greenBits(_greenBits)
1037     {
1038         #if CV_NEON
1039         v_b2y = vdup_n_u16(B2Y);
1040         v_g2y = vdup_n_u16(G2Y);
1041         v_r2y = vdup_n_u16(R2Y);
1042         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
1043         v_f8 = vdupq_n_u16(0xf8);
1044         v_fc = vdupq_n_u16(0xfc);
1045         #endif
1046     }
1047
1048     void operator()(const uchar* src, uchar* dst, int n) const
1049     {
1050         int i = 0;
1051         if( greenBits == 6 )
1052         {
1053             #if CV_NEON
1054             for ( ; i <= n - 8; i += 8)
1055             {
1056                 uint16x8_t v_src = vld1q_u16((ushort *)src + i);
1057                 uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
1058                            v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc),
1059                            v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8);
1060
1061                 uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
1062                                               vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
1063                 uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
1064                                               vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
1065                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
1066                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
1067
1068                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
1069             }
1070             #endif
1071             for ( ; i < n; i++)
1072             {
1073                 int t = ((ushort*)src)[i];
1074                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
1075                                            ((t >> 3) & 0xfc)*G2Y +
1076                                            ((t >> 8) & 0xf8)*R2Y, yuv_shift);
1077             }
1078         }
1079         else
1080         {
1081             #if CV_NEON
1082             for ( ; i <= n - 8; i += 8)
1083             {
1084                 uint16x8_t v_src = vld1q_u16((ushort *)src + i);
1085                 uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
1086                            v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8),
1087                            v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8);
1088
1089                 uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
1090                                               vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
1091                 uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
1092                                               vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
1093                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
1094                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
1095
1096                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
1097             }
1098             #endif
1099             for ( ; i < n; i++)
1100             {
1101                 int t = ((ushort*)src)[i];
1102                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
1103                                            ((t >> 2) & 0xf8)*G2Y +
1104                                            ((t >> 7) & 0xf8)*R2Y, yuv_shift);
1105             }
1106         }
1107     }
1108     int greenBits;
1109
1110     #if CV_NEON
1111     uint16x4_t v_b2y, v_g2y, v_r2y;
1112     uint32x4_t v_delta;
1113     uint16x8_t v_f8, v_fc;
1114     #endif
1115 };
1116
1117
1118 template<typename _Tp> struct RGB2Gray
1119 {
1120     typedef _Tp channel_type;
1121
1122     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1123     {
1124         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1125         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1126         if(blueIdx == 0)
1127             std::swap(coeffs[0], coeffs[2]);
1128     }
1129
1130     void operator()(const _Tp* src, _Tp* dst, int n) const
1131     {
1132         int scn = srccn;
1133         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1134         for(int i = 0; i < n; i++, src += scn)
1135             dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
1136     }
1137     int srccn;
1138     float coeffs[3];
1139 };
1140
1141 template<> struct RGB2Gray<uchar>
1142 {
1143     typedef uchar channel_type;
1144
1145     RGB2Gray(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn)
1146     {
1147         const int coeffs0[] = { R2Y, G2Y, B2Y };
1148         if(!coeffs) coeffs = coeffs0;
1149
1150         int b = 0, g = 0, r = (1 << (yuv_shift-1));
1151         int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx];
1152
1153         for( int i = 0; i < 256; i++, b += db, g += dg, r += dr )
1154         {
1155             tab[i] = b;
1156             tab[i+256] = g;
1157             tab[i+512] = r;
1158         }
1159     }
1160     void operator()(const uchar* src, uchar* dst, int n) const
1161     {
1162         int scn = srccn;
1163         const int* _tab = tab;
1164         for(int i = 0; i < n; i++, src += scn)
1165             dst[i] = (uchar)((_tab[src[0]] + _tab[src[1]+256] + _tab[src[2]+512]) >> yuv_shift);
1166     }
1167     int srccn;
1168     int tab[256*3];
1169 };
1170
1171 #if CV_NEON
1172
1173 template <>
1174 struct RGB2Gray<ushort>
1175 {
1176     typedef ushort channel_type;
1177
1178     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
1179         srccn(_srccn)
1180     {
1181         static const int coeffs0[] = { R2Y, G2Y, B2Y };
1182         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1183         if( blueIdx == 0 )
1184             std::swap(coeffs[0], coeffs[2]);
1185
1186         v_cb = vdup_n_u16(coeffs[0]);
1187         v_cg = vdup_n_u16(coeffs[1]);
1188         v_cr = vdup_n_u16(coeffs[2]);
1189         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
1190     }
1191
1192     void operator()(const ushort* src, ushort* dst, int n) const
1193     {
1194         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
1195
1196         for ( ; i <= n - 8; i += 8, src += scn * 8)
1197         {
1198             uint16x8_t v_b, v_r, v_g;
1199             if (scn == 3)
1200             {
1201                 uint16x8x3_t v_src = vld3q_u16(src);
1202                 v_b = v_src.val[0];
1203                 v_g = v_src.val[1];
1204                 v_r = v_src.val[2];
1205             }
1206             else
1207             {
1208                 uint16x8x4_t v_src = vld4q_u16(src);
1209                 v_b = v_src.val[0];
1210                 v_g = v_src.val[1];
1211                 v_r = v_src.val[2];
1212             }
1213
1214             uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
1215                                            vmull_u16(vget_low_u16(v_b), v_cb),
1216                                                      vget_low_u16(v_g), v_cg),
1217                                                      vget_low_u16(v_r), v_cr);
1218             uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
1219                                            vmull_u16(vget_high_u16(v_b), v_cb),
1220                                                      vget_high_u16(v_g), v_cg),
1221                                                      vget_high_u16(v_r), v_cr);
1222
1223             uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
1224             uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));
1225
1226             vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
1227         }
1228
1229         for ( ; i <= n - 4; i += 4, src += scn * 4)
1230         {
1231             uint16x4_t v_b, v_r, v_g;
1232             if (scn == 3)
1233             {
1234                 uint16x4x3_t v_src = vld3_u16(src);
1235                 v_b = v_src.val[0];
1236                 v_g = v_src.val[1];
1237                 v_r = v_src.val[2];
1238             }
1239             else
1240             {
1241                 uint16x4x4_t v_src = vld4_u16(src);
1242                 v_b = v_src.val[0];
1243                 v_g = v_src.val[1];
1244                 v_r = v_src.val[2];
1245             }
1246
1247             uint32x4_t v_dst = vmlal_u16(vmlal_u16(
1248                                          vmull_u16(v_b, v_cb),
1249                                                    v_g, v_cg),
1250                                                    v_r, v_cr);
1251
1252             vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
1253         }
1254
1255         for( ; i < n; i++, src += scn)
1256             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1257     }
1258
1259     int srccn, coeffs[3];
1260     uint16x4_t v_cb, v_cg, v_cr;
1261     uint32x4_t v_delta;
1262 };
1263
1264 template <>
1265 struct RGB2Gray<float>
1266 {
1267     typedef float channel_type;
1268
1269     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1270     {
1271         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1272         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1273         if(blueIdx == 0)
1274             std::swap(coeffs[0], coeffs[2]);
1275
1276         v_cb = vdupq_n_f32(coeffs[0]);
1277         v_cg = vdupq_n_f32(coeffs[1]);
1278         v_cr = vdupq_n_f32(coeffs[2]);
1279     }
1280
1281     void operator()(const float * src, float * dst, int n) const
1282     {
1283         int scn = srccn, i = 0;
1284         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1285
1286         if (scn == 3)
1287         {
1288             for ( ; i <= n - 8; i += 8, src += scn * 8)
1289             {
1290                 float32x4x3_t v_src = vld3q_f32(src);
1291                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1292
1293                 v_src = vld3q_f32(src + scn * 4);
1294                 vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1295             }
1296
1297             for ( ; i <= n - 4; i += 4, src += scn * 4)
1298             {
1299                 float32x4x3_t v_src = vld3q_f32(src);
1300                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1301             }
1302         }
1303         else
1304         {
1305             for ( ; i <= n - 8; i += 8, src += scn * 8)
1306             {
1307                 float32x4x4_t v_src = vld4q_f32(src);
1308                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1309
1310                 v_src = vld4q_f32(src + scn * 4);
1311                 vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1312             }
1313
1314             for ( ; i <= n - 4; i += 4, src += scn * 4)
1315             {
1316                 float32x4x4_t v_src = vld4q_f32(src);
1317                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1318             }
1319         }
1320
1321         for ( ; i < n; i++, src += scn)
1322             dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
1323     }
1324
1325     int srccn;
1326     float coeffs[3];
1327     float32x4_t v_cb, v_cg, v_cr;
1328 };
1329
1330 #else
1331
1332 template<> struct RGB2Gray<ushort>
1333 {
1334     typedef ushort channel_type;
1335
1336     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
1337     {
1338         static const int coeffs0[] = { R2Y, G2Y, B2Y };
1339         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1340         if( blueIdx == 0 )
1341             std::swap(coeffs[0], coeffs[2]);
1342     }
1343
1344     void operator()(const ushort* src, ushort* dst, int n) const
1345     {
1346         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1347         for(int i = 0; i < n; i++, src += scn)
1348             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1349     }
1350     int srccn;
1351     int coeffs[3];
1352 };
1353
1354 #endif
1355
1356 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
1357
1358 template<typename _Tp> struct RGB2YCrCb_f
1359 {
1360     typedef _Tp channel_type;
1361
1362     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : srccn(_srccn), blueIdx(_blueIdx)
1363     {
1364         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1365         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1366         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
1367     }
1368
1369     void operator()(const _Tp* src, _Tp* dst, int n) const
1370     {
1371         int scn = srccn, bidx = blueIdx;
1372         const _Tp delta = ColorChannel<_Tp>::half();
1373         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1374         n *= 3;
1375         for(int i = 0; i < n; i += 3, src += scn)
1376         {
1377             _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
1378             _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta);
1379             _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta);
1380             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1381         }
1382     }
1383     int srccn, blueIdx;
1384     float coeffs[5];
1385 };
1386
1387 #if CV_NEON
1388
1389 template <>
1390 struct RGB2YCrCb_f<float>
1391 {
1392     typedef float channel_type;
1393
1394     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
1395         srccn(_srccn), blueIdx(_blueIdx)
1396     {
1397         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1398         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1399         if(blueIdx==0)
1400             std::swap(coeffs[0], coeffs[2]);
1401
1402         v_c0 = vdupq_n_f32(coeffs[0]);
1403         v_c1 = vdupq_n_f32(coeffs[1]);
1404         v_c2 = vdupq_n_f32(coeffs[2]);
1405         v_c3 = vdupq_n_f32(coeffs[3]);
1406         v_c4 = vdupq_n_f32(coeffs[4]);
1407         v_delta = vdupq_n_f32(ColorChannel<float>::half());
1408     }
1409
1410     void operator()(const float * src, float * dst, int n) const
1411     {
1412         int scn = srccn, bidx = blueIdx, i = 0;
1413         const float delta = ColorChannel<float>::half();
1414         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1415         n *= 3;
1416
1417         if (scn == 3)
1418             for ( ; i <= n - 12; i += 12, src += 12)
1419             {
1420                 float32x4x3_t v_src = vld3q_f32(src), v_dst;
1421                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
1422                 v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
1423                 v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
1424
1425                 vst3q_f32(dst + i, v_dst);
1426             }
1427         else
1428             for ( ; i <= n - 12; i += 12, src += 16)
1429             {
1430                 float32x4x4_t v_src = vld4q_f32(src);
1431                 float32x4x3_t v_dst;
1432                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
1433                 v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
1434                 v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
1435
1436                 vst3q_f32(dst + i, v_dst);
1437             }
1438
1439         for ( ; i < n; i += 3, src += scn)
1440         {
1441             float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
1442             float Cr = (src[bidx^2] - Y)*C3 + delta;
1443             float Cb = (src[bidx] - Y)*C4 + delta;
1444             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1445         }
1446     }
1447     int srccn, blueIdx;
1448     float coeffs[5];
1449     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
1450 };
1451
1452 #endif
1453
1454 template<typename _Tp> struct RGB2YCrCb_i
1455 {
1456     typedef _Tp channel_type;
1457
1458     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1459         : srccn(_srccn), blueIdx(_blueIdx)
1460     {
1461         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1462         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1463         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
1464     }
1465     void operator()(const _Tp* src, _Tp* dst, int n) const
1466     {
1467         int scn = srccn, bidx = blueIdx;
1468         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1469         int delta = ColorChannel<_Tp>::half()*(1 << yuv_shift);
1470         n *= 3;
1471         for(int i = 0; i < n; i += 3, src += scn)
1472         {
1473             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
1474             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
1475             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
1476             dst[i] = saturate_cast<_Tp>(Y);
1477             dst[i+1] = saturate_cast<_Tp>(Cr);
1478             dst[i+2] = saturate_cast<_Tp>(Cb);
1479         }
1480     }
1481     int srccn, blueIdx;
1482     int coeffs[5];
1483 };
1484
1485 #if CV_NEON
1486
1487 template <>
1488 struct RGB2YCrCb_i<uchar>
1489 {
1490     typedef uchar channel_type;
1491
1492     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1493         : srccn(_srccn), blueIdx(_blueIdx)
1494     {
1495         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1496         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1497         if (blueIdx==0)
1498             std::swap(coeffs[0], coeffs[2]);
1499
1500         v_c0 = vdup_n_s16(coeffs[0]);
1501         v_c1 = vdup_n_s16(coeffs[1]);
1502         v_c2 = vdup_n_s16(coeffs[2]);
1503         v_c3 = vdupq_n_s32(coeffs[3]);
1504         v_c4 = vdupq_n_s32(coeffs[4]);
1505         v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift));
1506         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
1507     }
1508
1509     void operator()(const uchar * src, uchar * dst, int n) const
1510     {
1511         int scn = srccn, bidx = blueIdx, i = 0;
1512         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1513         int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
1514         n *= 3;
1515
1516         for ( ; i <= n - 24; i += 24, src += scn * 8)
1517         {
1518             uint8x8x3_t v_dst;
1519             int16x8x3_t v_src16;
1520
1521             if (scn == 3)
1522             {
1523                 uint8x8x3_t v_src = vld3_u8(src);
1524                 v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
1525                 v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
1526                 v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
1527             }
1528             else
1529             {
1530                 uint8x8x4_t v_src = vld4_u8(src);
1531                 v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
1532                 v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
1533                 v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
1534             }
1535
1536             int16x4x3_t v_src0;
1537             v_src0.val[0] = vget_low_s16(v_src16.val[0]);
1538             v_src0.val[1] = vget_low_s16(v_src16.val[1]);
1539             v_src0.val[2] = vget_low_s16(v_src16.val[2]);
1540
1541             int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1542             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
1543             int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3);
1544             v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
1545             int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4);
1546             v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
1547
1548             v_src0.val[0] = vget_high_s16(v_src16.val[0]);
1549             v_src0.val[1] = vget_high_s16(v_src16.val[1]);
1550             v_src0.val[2] = vget_high_s16(v_src16.val[2]);
1551
1552             int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1553             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
1554             int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3);
1555             v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
1556             int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4);
1557             v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
1558
1559             v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
1560             v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
1561             v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
1562
1563             vst3_u8(dst + i, v_dst);
1564         }
1565
1566         for ( ; i < n; i += 3, src += scn)
1567         {
1568             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
1569             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
1570             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
1571             dst[i] = saturate_cast<uchar>(Y);
1572             dst[i+1] = saturate_cast<uchar>(Cr);
1573             dst[i+2] = saturate_cast<uchar>(Cb);
1574         }
1575     }
1576     int srccn, blueIdx, coeffs[5];
1577     int16x4_t v_c0, v_c1, v_c2;
1578     int32x4_t v_c3, v_c4, v_delta, v_delta2;
1579 };
1580
1581 template <>
1582 struct RGB2YCrCb_i<ushort>
1583 {
1584     typedef ushort channel_type;
1585
1586     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1587         : srccn(_srccn), blueIdx(_blueIdx)
1588     {
1589         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1590         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1591         if (blueIdx==0)
1592             std::swap(coeffs[0], coeffs[2]);
1593
1594         v_c0 = vdupq_n_s32(coeffs[0]);
1595         v_c1 = vdupq_n_s32(coeffs[1]);
1596         v_c2 = vdupq_n_s32(coeffs[2]);
1597         v_c3 = vdupq_n_s32(coeffs[3]);
1598         v_c4 = vdupq_n_s32(coeffs[4]);
1599         v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift));
1600         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
1601     }
1602
1603     void operator()(const ushort * src, ushort * dst, int n) const
1604     {
1605         int scn = srccn, bidx = blueIdx, i = 0;
1606         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1607         int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
1608         n *= 3;
1609
1610         for ( ; i <= n - 24; i += 24, src += scn * 8)
1611         {
1612             uint16x8x3_t v_src, v_dst;
1613             int32x4x3_t v_src0;
1614
1615             if (scn == 3)
1616                 v_src = vld3q_u16(src);
1617             else
1618             {
1619                 uint16x8x4_t v_src_ = vld4q_u16(src);
1620                 v_src.val[0] = v_src_.val[0];
1621                 v_src.val[1] = v_src_.val[1];
1622                 v_src.val[2] = v_src_.val[2];
1623             }
1624
1625             v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0])));
1626             v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1])));
1627             v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
1628
1629             int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1630             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
1631             int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3);
1632             v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
1633             int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4);
1634             v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
1635
1636             v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
1637             v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
1638             v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
1639
1640             int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1641             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
1642             int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3);
1643             v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
1644             int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4);
1645             v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
1646
1647             v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
1648             v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
1649             v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
1650
1651             vst3q_u16(dst + i, v_dst);
1652         }
1653
1654         for ( ; i <= n - 12; i += 12, src += scn * 4)
1655         {
1656             uint16x4x3_t v_dst;
1657             int32x4x3_t v_src0;
1658
1659             if (scn == 3)
1660             {
1661                 uint16x4x3_t v_src = vld3_u16(src);
1662                 v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
1663                 v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
1664                 v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
1665             }
1666             else
1667             {
1668                 uint16x4x4_t v_src = vld4_u16(src);
1669                 v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
1670                 v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
1671                 v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
1672             }
1673
1674             int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1675             v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift);
1676             int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3);
1677             v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift);
1678             int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4);
1679             v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);
1680
1681             v_dst.val[0] = vqmovun_s32(v_Y);
1682             v_dst.val[1] = vqmovun_s32(v_Cr);
1683             v_dst.val[2] = vqmovun_s32(v_Cb);
1684
1685             vst3_u16(dst + i, v_dst);
1686         }
1687
1688         for ( ; i < n; i += 3, src += scn)
1689         {
1690             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
1691             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
1692             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
1693             dst[i] = saturate_cast<ushort>(Y);
1694             dst[i+1] = saturate_cast<ushort>(Cr);
1695             dst[i+2] = saturate_cast<ushort>(Cb);
1696         }
1697     }
1698     int srccn, blueIdx, coeffs[5];
1699     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
1700 };
1701
1702 #endif
1703
1704 template<typename _Tp> struct YCrCb2RGB_f
1705 {
1706     typedef _Tp channel_type;
1707
1708     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
1709         : dstcn(_dstcn), blueIdx(_blueIdx)
1710     {
1711         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
1712         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1713     }
1714     void operator()(const _Tp* src, _Tp* dst, int n) const
1715     {
1716         int dcn = dstcn, bidx = blueIdx;
1717         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
1718         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
1719         n *= 3;
1720         for(int i = 0; i < n; i += 3, dst += dcn)
1721         {
1722             _Tp Y = src[i];
1723             _Tp Cr = src[i+1];
1724             _Tp Cb = src[i+2];
1725
1726             _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
1727             _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
1728             _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0);
1729
1730             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
1731             if( dcn == 4 )
1732                 dst[3] = alpha;
1733         }
1734     }
1735     int dstcn, blueIdx;
1736     float coeffs[4];
1737 };
1738
1739 #if CV_NEON
1740
1741 template <>
1742 struct YCrCb2RGB_f<float>
1743 {
1744     typedef float channel_type;
1745
1746     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
1747         : dstcn(_dstcn), blueIdx(_blueIdx)
1748     {
1749         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
1750         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1751
1752         v_c0 = vdupq_n_f32(coeffs[0]);
1753         v_c1 = vdupq_n_f32(coeffs[1]);
1754         v_c2 = vdupq_n_f32(coeffs[2]);
1755         v_c3 = vdupq_n_f32(coeffs[3]);
1756         v_delta = vdupq_n_f32(ColorChannel<float>::half());
1757         v_alpha = vdupq_n_f32(ColorChannel<float>::max());
1758     }
1759
1760     void operator()(const float* src, float* dst, int n) const
1761     {
1762         int dcn = dstcn, bidx = blueIdx, i = 0;
1763         const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
1764         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
1765         n *= 3;
1766
1767         if (dcn == 3)
1768             for ( ; i <= n - 12; i += 12, dst += 12)
1769             {
1770                 float32x4x3_t v_src = vld3q_f32(src + i), v_dst;
1771                 float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
1772
1773                 v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
1774                 v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
1775                 v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
1776
1777                 vst3q_f32(dst, v_dst);
1778             }
1779         else
1780             for ( ; i <= n - 12; i += 12, dst += 16)
1781             {
1782                 float32x4x3_t v_src = vld3q_f32(src + i);
1783                 float32x4x4_t v_dst;
1784                 float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
1785
1786                 v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
1787                 v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
1788                 v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
1789                 v_dst.val[3] = v_alpha;
1790
1791                 vst4q_f32(dst, v_dst);
1792             }
1793
1794         for ( ; i < n; i += 3, dst += dcn)
1795         {
1796             float Y = src[i], Cr = src[i+1], Cb = src[i+2];
1797
1798             float b = Y + (Cb - delta)*C3;
1799             float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
1800             float r = Y + (Cr - delta)*C0;
1801
1802             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
1803             if( dcn == 4 )
1804                 dst[3] = alpha;
1805         }
1806     }
1807     int dstcn, blueIdx;
1808     float coeffs[4];
1809     float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
1810 };
1811
1812 #endif
1813
1814 template<typename _Tp> struct YCrCb2RGB_i
1815 {
1816     typedef _Tp channel_type;
1817
1818     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
1819         : dstcn(_dstcn), blueIdx(_blueIdx)
1820     {
1821         static const int coeffs0[] = {22987, -11698, -5636, 29049};
1822         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1823     }
1824
1825     void operator()(const _Tp* src, _Tp* dst, int n) const
1826     {
1827         int dcn = dstcn, bidx = blueIdx;
1828         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
1829         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
1830         n *= 3;
1831         for(int i = 0; i < n; i += 3, dst += dcn)
1832         {
1833             _Tp Y = src[i];
1834             _Tp Cr = src[i+1];
1835             _Tp Cb = src[i+2];
1836
1837             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
1838             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
1839             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
1840
1841             dst[bidx] = saturate_cast<_Tp>(b);
1842             dst[1] = saturate_cast<_Tp>(g);
1843             dst[bidx^2] = saturate_cast<_Tp>(r);
1844             if( dcn == 4 )
1845                 dst[3] = alpha;
1846         }
1847     }
1848     int dstcn, blueIdx;
1849     int coeffs[4];
1850 };
1851
1852 #if CV_NEON
1853
1854 template <>
1855 struct YCrCb2RGB_i<uchar>
1856 {
1857     typedef uchar channel_type;
1858
1859     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
1860         : dstcn(_dstcn), blueIdx(_blueIdx)
1861     {
1862         static const int coeffs0[] = {22987, -11698, -5636, 29049};
1863         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1864
1865         v_c0 = vdupq_n_s32(coeffs[0]);
1866         v_c1 = vdupq_n_s32(coeffs[1]);
1867         v_c2 = vdupq_n_s32(coeffs[2]);
1868         v_c3 = vdupq_n_s32(coeffs[3]);
1869         v_delta = vdup_n_s16(ColorChannel<uchar>::half());
1870         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
1871         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
1872     }
1873
1874     void operator()(const uchar* src, uchar* dst, int n) const
1875     {
1876         int dcn = dstcn, bidx = blueIdx, i = 0;
1877         const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
1878         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
1879         n *= 3;
1880
1881         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
1882         {
1883             uint8x8x3_t v_src = vld3_u8(src + i);
1884             int16x8x3_t v_src16;
1885             v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
1886             v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
1887             v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
1888
1889             int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
1890                       v_Cr = vget_low_s16(v_src16.val[1]),
1891                       v_Cb = vget_low_s16(v_src16.val[2]);
1892
1893             int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
1894             v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
1895             int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
1896             v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
1897             int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
1898             v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
1899
1900             v_Y = vget_high_s16(v_src16.val[0]);
1901             v_Cr = vget_high_s16(v_src16.val[1]);
1902             v_Cb = vget_high_s16(v_src16.val[2]);
1903
1904             int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
1905             v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
1906             int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
1907             v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
1908             int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
1909             v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
1910
1911             uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1)));
1912             uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1)));
1913             uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1)));
1914
1915             if (dcn == 3)
1916             {
1917                 uint8x8x3_t v_dst;
1918                 v_dst.val[bidx] = v_b;
1919                 v_dst.val[1] = v_g;
1920                 v_dst.val[bidx^2] = v_r;
1921                 vst3_u8(dst, v_dst);
1922             }
1923             else
1924             {
1925                 uint8x8x4_t v_dst;
1926                 v_dst.val[bidx] = v_b;
1927                 v_dst.val[1] = v_g;
1928                 v_dst.val[bidx^2] = v_r;
1929                 v_dst.val[3] = v_alpha;
1930                 vst4_u8(dst, v_dst);
1931             }
1932         }
1933
1934         for ( ; i < n; i += 3, dst += dcn)
1935         {
1936             uchar Y = src[i];
1937             uchar Cr = src[i+1];
1938             uchar Cb = src[i+2];
1939
1940             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
1941             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
1942             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
1943
1944             dst[bidx] = saturate_cast<uchar>(b);
1945             dst[1] = saturate_cast<uchar>(g);
1946             dst[bidx^2] = saturate_cast<uchar>(r);
1947             if( dcn == 4 )
1948                 dst[3] = alpha;
1949         }
1950     }
1951     int dstcn, blueIdx;
1952     int coeffs[4];
1953
1954     int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2;
1955     int16x4_t v_delta;
1956     uint8x8_t v_alpha;
1957 };
1958
1959 template <>
1960 struct YCrCb2RGB_i<ushort>
1961 {
1962     typedef ushort channel_type;
1963
1964     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
1965         : dstcn(_dstcn), blueIdx(_blueIdx)
1966     {
1967         static const int coeffs0[] = {22987, -11698, -5636, 29049};
1968         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
1969
1970         v_c0 = vdupq_n_s32(coeffs[0]);
1971         v_c1 = vdupq_n_s32(coeffs[1]);
1972         v_c2 = vdupq_n_s32(coeffs[2]);
1973         v_c3 = vdupq_n_s32(coeffs[3]);
1974         v_delta = vdupq_n_s32(ColorChannel<ushort>::half());
1975         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
1976         v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
1977         v_alpha2 = vget_low_u16(v_alpha);
1978     }
1979
1980     void operator()(const ushort* src, ushort* dst, int n) const
1981     {
1982         int dcn = dstcn, bidx = blueIdx, i = 0;
1983         const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
1984         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
1985         n *= 3;
1986
1987         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
1988         {
1989             uint16x8x3_t v_src = vld3q_u16(src + i);
1990
1991             int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
1992                       v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
1993                       v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
1994
1995             int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
1996             v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
1997             int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
1998             v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
1999             int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
2000             v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
2001
2002             v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))),
2003             v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))),
2004             v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
2005
2006             int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
2007             v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
2008             int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
2009             v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
2010             int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
2011             v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
2012
2013             uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1));
2014             uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1));
2015             uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1));
2016
2017             if (dcn == 3)
2018             {
2019                 uint16x8x3_t v_dst;
2020                 v_dst.val[bidx] = v_b;
2021                 v_dst.val[1] = v_g;
2022                 v_dst.val[bidx^2] = v_r;
2023                 vst3q_u16(dst, v_dst);
2024             }
2025             else
2026             {
2027                 uint16x8x4_t v_dst;
2028                 v_dst.val[bidx] = v_b;
2029                 v_dst.val[1] = v_g;
2030                 v_dst.val[bidx^2] = v_r;
2031                 v_dst.val[3] = v_alpha;
2032                 vst4q_u16(dst, v_dst);
2033             }
2034         }
2035
2036         for ( ; i <= n - 12; i += 12, dst += dcn * 4)
2037         {
2038             uint16x4x3_t v_src = vld3_u16(src + i);
2039
2040             int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
2041                       v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
2042                       v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
2043
2044             int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
2045             v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y);
2046             int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
2047             v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y);
2048             int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0);
2049             v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y);
2050
2051             uint16x4_t v_bd = vqmovun_s32(v_b);
2052             uint16x4_t v_gd = vqmovun_s32(v_g);
2053             uint16x4_t v_rd = vqmovun_s32(v_r);
2054
2055             if (dcn == 3)
2056             {
2057                 uint16x4x3_t v_dst;
2058                 v_dst.val[bidx] = v_bd;
2059                 v_dst.val[1] = v_gd;
2060                 v_dst.val[bidx^2] = v_rd;
2061                 vst3_u16(dst, v_dst);
2062             }
2063             else
2064             {
2065                 uint16x4x4_t v_dst;
2066                 v_dst.val[bidx] = v_bd;
2067                 v_dst.val[1] = v_gd;
2068                 v_dst.val[bidx^2] = v_rd;
2069                 v_dst.val[3] = v_alpha2;
2070                 vst4_u16(dst, v_dst);
2071             }
2072         }
2073
2074         for ( ; i < n; i += 3, dst += dcn)
2075         {
2076             ushort Y = src[i];
2077             ushort Cr = src[i+1];
2078             ushort Cb = src[i+2];
2079
2080             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
2081             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
2082             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
2083
2084             dst[bidx] = saturate_cast<ushort>(b);
2085             dst[1] = saturate_cast<ushort>(g);
2086             dst[bidx^2] = saturate_cast<ushort>(r);
2087             if( dcn == 4 )
2088                 dst[3] = alpha;
2089         }
2090     }
2091     int dstcn, blueIdx;
2092     int coeffs[4];
2093
2094     int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta;
2095     uint16x8_t v_alpha;
2096     uint16x4_t v_alpha2;
2097 };
2098
2099 #endif
2100
2101 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
2102
// Default 3x3 coefficient matrix for the RGB -> XYZ converters, stored row by
// row: each row holds the weights applied to the three source channels to
// produce one output channel (X, then Y, then Z).
static const float sRGB2XYZ_D65[] =
{
    /* X */ 0.412453f, 0.357580f, 0.180423f,
    /* Y */ 0.212671f, 0.715160f, 0.072169f,
    /* Z */ 0.019334f, 0.119193f, 0.950227f
};
2109
// 3x3 coefficient matrix stored row by row.
// NOTE(review): presumably the default for the XYZ -> RGB converters (the
// inverse of sRGB2XYZ_D65); its users are outside this part of the file.
static const float XYZ2sRGB_D65[] =
{
    /* row 0 */ 3.240479f, -1.53715f, -0.498535f,
    /* row 1 */ -0.969256f, 1.875991f, 0.041556f,
    /* row 2 */ 0.055648f, -0.204043f, 1.057311f
};
2116
2117 template<typename _Tp> struct RGB2XYZ_f
2118 {
2119     typedef _Tp channel_type;
2120
2121     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2122     {
2123         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
2124         if(blueIdx == 0)
2125         {
2126             std::swap(coeffs[0], coeffs[2]);
2127             std::swap(coeffs[3], coeffs[5]);
2128             std::swap(coeffs[6], coeffs[8]);
2129         }
2130     }
2131     void operator()(const _Tp* src, _Tp* dst, int n) const
2132     {
2133         int scn = srccn;
2134         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2135               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2136               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2137
2138         n *= 3;
2139         for(int i = 0; i < n; i += 3, src += scn)
2140         {
2141             _Tp X = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
2142             _Tp Y = saturate_cast<_Tp>(src[0]*C3 + src[1]*C4 + src[2]*C5);
2143             _Tp Z = saturate_cast<_Tp>(src[0]*C6 + src[1]*C7 + src[2]*C8);
2144             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
2145         }
2146     }
2147     int srccn;
2148     float coeffs[9];
2149 };
2150
2151 #if CV_NEON
2152
2153 template <>
2154 struct RGB2XYZ_f<float>
2155 {
2156     typedef float channel_type;
2157
2158     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2159     {
2160         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
2161         if(blueIdx == 0)
2162         {
2163             std::swap(coeffs[0], coeffs[2]);
2164             std::swap(coeffs[3], coeffs[5]);
2165             std::swap(coeffs[6], coeffs[8]);
2166         }
2167
2168         v_c0 = vdupq_n_f32(coeffs[0]);
2169         v_c1 = vdupq_n_f32(coeffs[1]);
2170         v_c2 = vdupq_n_f32(coeffs[2]);
2171         v_c3 = vdupq_n_f32(coeffs[3]);
2172         v_c4 = vdupq_n_f32(coeffs[4]);
2173         v_c5 = vdupq_n_f32(coeffs[5]);
2174         v_c6 = vdupq_n_f32(coeffs[6]);
2175         v_c7 = vdupq_n_f32(coeffs[7]);
2176         v_c8 = vdupq_n_f32(coeffs[8]);
2177     }
2178
2179     void operator()(const float* src, float* dst, int n) const
2180     {
2181         int scn = srccn, i = 0;
2182         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2183               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2184               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2185
2186         n *= 3;
2187
2188         if (scn == 3)
2189             for ( ; i <= n - 12; i += 12, src += 12)
2190             {
2191                 float32x4x3_t v_src = vld3q_f32(src), v_dst;
2192                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
2193                 v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
2194                 v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
2195                 vst3q_f32(dst + i, v_dst);
2196             }
2197         else
2198             for ( ; i <= n - 12; i += 12, src += 16)
2199             {
2200                 float32x4x4_t v_src = vld4q_f32(src);
2201                 float32x4x3_t v_dst;
2202                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
2203                 v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
2204                 v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
2205                 vst3q_f32(dst + i, v_dst);
2206             }
2207
2208         for ( ; i < n; i += 3, src += scn)
2209         {
2210             float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
2211             float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
2212             float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
2213             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
2214         }
2215     }
2216
2217     int srccn;
2218     float coeffs[9];
2219     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
2220 };
2221
2222 #endif
2223
2224 template<typename _Tp> struct RGB2XYZ_i
2225 {
2226     typedef _Tp channel_type;
2227
2228     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2229     {
2230         static const int coeffs0[] =
2231         {
2232             1689,    1465,    739,
2233             871,     2929,    296,
2234             79,      488,     3892
2235         };
2236         for( int i = 0; i < 9; i++ )
2237             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2238         if(blueIdx == 0)
2239         {
2240             std::swap(coeffs[0], coeffs[2]);
2241             std::swap(coeffs[3], coeffs[5]);
2242             std::swap(coeffs[6], coeffs[8]);
2243         }
2244     }
2245     void operator()(const _Tp* src, _Tp* dst, int n) const
2246     {
2247         int scn = srccn;
2248         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2249             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2250             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2251         n *= 3;
2252         for(int i = 0; i < n; i += 3, src += scn)
2253         {
2254             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
2255             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
2256             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
2257             dst[i] = saturate_cast<_Tp>(X); dst[i+1] = saturate_cast<_Tp>(Y);
2258             dst[i+2] = saturate_cast<_Tp>(Z);
2259         }
2260     }
2261     int srccn;
2262     int coeffs[9];
2263 };
2264
2265 #if CV_NEON
2266
2267 template <>
2268 struct RGB2XYZ_i<uchar>
2269 {
2270     typedef uchar channel_type;
2271
2272     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2273     {
2274         static const int coeffs0[] =
2275         {
2276             1689,    1465,    739,
2277             871,     2929,    296,
2278             79,      488,     3892
2279         };
2280         for( int i = 0; i < 9; i++ )
2281             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2282         if(blueIdx == 0)
2283         {
2284             std::swap(coeffs[0], coeffs[2]);
2285             std::swap(coeffs[3], coeffs[5]);
2286             std::swap(coeffs[6], coeffs[8]);
2287         }
2288
2289         v_c0 = vdup_n_u16(coeffs[0]);
2290         v_c1 = vdup_n_u16(coeffs[1]);
2291         v_c2 = vdup_n_u16(coeffs[2]);
2292         v_c3 = vdup_n_u16(coeffs[3]);
2293         v_c4 = vdup_n_u16(coeffs[4]);
2294         v_c5 = vdup_n_u16(coeffs[5]);
2295         v_c6 = vdup_n_u16(coeffs[6]);
2296         v_c7 = vdup_n_u16(coeffs[7]);
2297         v_c8 = vdup_n_u16(coeffs[8]);
2298         v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
2299     }
2300     void operator()(const uchar * src, uchar * dst, int n) const
2301     {
2302         int scn = srccn, i = 0;
2303         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2304             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2305             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2306         n *= 3;
2307
2308         for ( ; i <= n - 24; i += 24, src += scn * 8)
2309         {
2310             uint8x8x3_t v_dst;
2311             uint16x8x3_t v_src16;
2312
2313             if (scn == 3)
2314             {
2315                 uint8x8x3_t v_src = vld3_u8(src);
2316                 v_src16.val[0] = vmovl_u8(v_src.val[0]);
2317                 v_src16.val[1] = vmovl_u8(v_src.val[1]);
2318                 v_src16.val[2] = vmovl_u8(v_src.val[2]);
2319             }
2320             else
2321             {
2322                 uint8x8x4_t v_src = vld4_u8(src);
2323                 v_src16.val[0] = vmovl_u8(v_src.val[0]);
2324                 v_src16.val[1] = vmovl_u8(v_src.val[1]);
2325                 v_src16.val[2] = vmovl_u8(v_src.val[2]);
2326             }
2327
2328             uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]),
2329                        v_s1 = vget_low_u16(v_src16.val[1]),
2330                        v_s2 = vget_low_u16(v_src16.val[2]);
2331
2332             uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2333             uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2334             uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2335             v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
2336             v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
2337             v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
2338
2339             v_s0 = vget_high_u16(v_src16.val[0]),
2340             v_s1 = vget_high_u16(v_src16.val[1]),
2341             v_s2 = vget_high_u16(v_src16.val[2]);
2342
2343             uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2344             uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2345             uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2346             v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
2347             v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
2348             v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
2349
2350             v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
2351             v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
2352             v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));
2353
2354             vst3_u8(dst + i, v_dst);
2355         }
2356
2357         for ( ; i < n; i += 3, src += scn)
2358         {
2359             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
2360             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
2361             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
2362             dst[i] = saturate_cast<uchar>(X);
2363             dst[i+1] = saturate_cast<uchar>(Y);
2364             dst[i+2] = saturate_cast<uchar>(Z);
2365         }
2366     }
2367
2368     int srccn, coeffs[9];
2369     uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
2370     uint32x4_t v_delta;
2371 };
2372
2373 template <>
2374 struct RGB2XYZ_i<ushort>
2375 {
2376     typedef ushort channel_type;
2377
2378     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
2379     {
2380         static const int coeffs0[] =
2381         {
2382             1689,    1465,    739,
2383             871,     2929,    296,
2384             79,      488,     3892
2385         };
2386         for( int i = 0; i < 9; i++ )
2387             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2388         if(blueIdx == 0)
2389         {
2390             std::swap(coeffs[0], coeffs[2]);
2391             std::swap(coeffs[3], coeffs[5]);
2392             std::swap(coeffs[6], coeffs[8]);
2393         }
2394
2395         v_c0 = vdup_n_u16(coeffs[0]);
2396         v_c1 = vdup_n_u16(coeffs[1]);
2397         v_c2 = vdup_n_u16(coeffs[2]);
2398         v_c3 = vdup_n_u16(coeffs[3]);
2399         v_c4 = vdup_n_u16(coeffs[4]);
2400         v_c5 = vdup_n_u16(coeffs[5]);
2401         v_c6 = vdup_n_u16(coeffs[6]);
2402         v_c7 = vdup_n_u16(coeffs[7]);
2403         v_c8 = vdup_n_u16(coeffs[8]);
2404         v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
2405     }
2406
2407     void operator()(const ushort * src, ushort * dst, int n) const
2408     {
2409         int scn = srccn, i = 0;
2410         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2411             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2412             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2413         n *= 3;
2414
2415         for ( ; i <= n - 24; i += 24, src += scn * 8)
2416         {
2417             uint16x8x3_t v_src, v_dst;
2418
2419             if (scn == 3)
2420                 v_src = vld3q_u16(src);
2421             else
2422             {
2423                 uint16x8x4_t v_src4 = vld4q_u16(src);
2424                 v_src.val[0] = v_src4.val[0];
2425                 v_src.val[1] = v_src4.val[1];
2426                 v_src.val[2] = v_src4.val[2];
2427             }
2428
2429             uint16x4_t v_s0 = vget_low_u16(v_src.val[0]),
2430                        v_s1 = vget_low_u16(v_src.val[1]),
2431                        v_s2 = vget_low_u16(v_src.val[2]);
2432
2433             uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2434             uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2435             uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2436             v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
2437             v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
2438             v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
2439
2440             v_s0 = vget_high_u16(v_src.val[0]),
2441             v_s1 = vget_high_u16(v_src.val[1]),
2442             v_s2 = vget_high_u16(v_src.val[2]);
2443
2444             uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2445             uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2446             uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2447             v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
2448             v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
2449             v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
2450
2451             v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
2452             v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
2453             v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));
2454
2455             vst3q_u16(dst + i, v_dst);
2456         }
2457
2458         for ( ; i <= n - 12; i += 12, src += scn * 4)
2459         {
2460             uint16x4x3_t v_dst;
2461             uint16x4_t v_s0, v_s1, v_s2;
2462
2463             if (scn == 3)
2464             {
2465                 uint16x4x3_t v_src = vld3_u16(src);
2466                 v_s0 = v_src.val[0];
2467                 v_s1 = v_src.val[1];
2468                 v_s2 = v_src.val[2];
2469             }
2470             else
2471             {
2472                 uint16x4x4_t v_src = vld4_u16(src);
2473                 v_s0 = v_src.val[0];
2474                 v_s1 = v_src.val[1];
2475                 v_s2 = v_src.val[2];
2476             }
2477
2478             uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2479             uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2480             uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2481
2482             v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift));
2483             v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift));
2484             v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift));
2485
2486             vst3_u16(dst + i, v_dst);
2487         }
2488
2489         for ( ; i < n; i += 3, src += scn)
2490         {
2491             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
2492             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
2493             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
2494             dst[i] = saturate_cast<ushort>(X);
2495             dst[i+1] = saturate_cast<ushort>(Y);
2496             dst[i+2] = saturate_cast<ushort>(Z);
2497         }
2498     }
2499
2500     int srccn, coeffs[9];
2501     uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
2502     uint32x4_t v_delta;
2503 };
2504
2505 #endif
2506
2507 template<typename _Tp> struct XYZ2RGB_f
2508 {
2509     typedef _Tp channel_type;
2510
2511     XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
2512     : dstcn(_dstcn), blueIdx(_blueIdx)
2513     {
2514         memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
2515         if(blueIdx == 0)
2516         {
2517             std::swap(coeffs[0], coeffs[6]);
2518             std::swap(coeffs[1], coeffs[7]);
2519             std::swap(coeffs[2], coeffs[8]);
2520         }
2521     }
2522
2523     void operator()(const _Tp* src, _Tp* dst, int n) const
2524     {
2525         int dcn = dstcn;
2526         _Tp alpha = ColorChannel<_Tp>::max();
2527         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2528               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2529               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2530         n *= 3;
2531         for(int i = 0; i < n; i += 3, dst += dcn)
2532         {
2533             _Tp B = saturate_cast<_Tp>(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2);
2534             _Tp G = saturate_cast<_Tp>(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5);
2535             _Tp R = saturate_cast<_Tp>(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8);
2536             dst[0] = B; dst[1] = G; dst[2] = R;
2537             if( dcn == 4 )
2538                 dst[3] = alpha;
2539         }
2540     }
2541     int dstcn, blueIdx;
2542     float coeffs[9];
2543 };
2544
2545
2546 template<typename _Tp> struct XYZ2RGB_i
2547 {
2548     typedef _Tp channel_type;
2549
2550     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2551     : dstcn(_dstcn), blueIdx(_blueIdx)
2552     {
2553         static const int coeffs0[] =
2554         {
2555             13273,  -6296,  -2042,
2556             -3970,   7684,    170,
2557               228,   -836,   4331
2558         };
2559         for(int i = 0; i < 9; i++)
2560             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2561
2562         if(blueIdx == 0)
2563         {
2564             std::swap(coeffs[0], coeffs[6]);
2565             std::swap(coeffs[1], coeffs[7]);
2566             std::swap(coeffs[2], coeffs[8]);
2567         }
2568     }
2569     void operator()(const _Tp* src, _Tp* dst, int n) const
2570     {
2571         int dcn = dstcn;
2572         _Tp alpha = ColorChannel<_Tp>::max();
2573         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2574             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2575             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2576         n *= 3;
2577         for(int i = 0; i < n; i += 3, dst += dcn)
2578         {
2579             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
2580             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
2581             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
2582             dst[0] = saturate_cast<_Tp>(B); dst[1] = saturate_cast<_Tp>(G);
2583             dst[2] = saturate_cast<_Tp>(R);
2584             if( dcn == 4 )
2585                 dst[3] = alpha;
2586         }
2587     }
2588     int dstcn, blueIdx;
2589     int coeffs[9];
2590 };
2591
2592 #if CV_NEON
2593
2594 template <>
2595 struct XYZ2RGB_i<uchar>
2596 {
2597     typedef uchar channel_type;
2598
2599     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2600     : dstcn(_dstcn), blueIdx(_blueIdx)
2601     {
2602         static const int coeffs0[] =
2603         {
2604             13273,  -6296,  -2042,
2605             -3970,   7684,    170,
2606               228,   -836,   4331
2607         };
2608         for(int i = 0; i < 9; i++)
2609             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2610
2611         if(blueIdx == 0)
2612         {
2613             std::swap(coeffs[0], coeffs[6]);
2614             std::swap(coeffs[1], coeffs[7]);
2615             std::swap(coeffs[2], coeffs[8]);
2616         }
2617
2618         v_c0 = vdup_n_s16(coeffs[0]);
2619         v_c1 = vdup_n_s16(coeffs[1]);
2620         v_c2 = vdup_n_s16(coeffs[2]);
2621         v_c3 = vdup_n_s16(coeffs[3]);
2622         v_c4 = vdup_n_s16(coeffs[4]);
2623         v_c5 = vdup_n_s16(coeffs[5]);
2624         v_c6 = vdup_n_s16(coeffs[6]);
2625         v_c7 = vdup_n_s16(coeffs[7]);
2626         v_c8 = vdup_n_s16(coeffs[8]);
2627         v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
2628         v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max()));
2629     }
2630
2631     void operator()(const uchar* src, uchar* dst, int n) const
2632     {
2633         int dcn = dstcn, i = 0;
2634         uchar alpha = ColorChannel<uchar>::max();
2635         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2636             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2637             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2638         n *= 3;
2639
2640         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
2641         {
2642             uint8x8x3_t v_src = vld3_u8(src + i);
2643             int16x8x3_t v_src16;
2644             v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
2645             v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
2646             v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
2647
2648             int16x4_t v_s0 = vget_low_s16(v_src16.val[0]),
2649                        v_s1 = vget_low_s16(v_src16.val[1]),
2650                        v_s2 = vget_low_s16(v_src16.val[2]);
2651
2652             int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2653             int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2654             int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2655             v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
2656             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
2657             v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
2658
2659             v_s0 = vget_high_s16(v_src16.val[0]),
2660             v_s1 = vget_high_s16(v_src16.val[1]),
2661             v_s2 = vget_high_s16(v_src16.val[2]);
2662
2663             int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2664             int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2665             int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2666             v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
2667             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
2668             v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
2669
2670             uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1)));
2671             uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
2672             uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1)));
2673
2674             if (dcn == 3)
2675             {
2676                 uint8x8x3_t v_dst;
2677                 v_dst.val[0] = v_b;
2678                 v_dst.val[1] = v_g;
2679                 v_dst.val[2] = v_r;
2680                 vst3_u8(dst, v_dst);
2681             }
2682             else
2683             {
2684                 uint8x8x4_t v_dst;
2685                 v_dst.val[0] = v_b;
2686                 v_dst.val[1] = v_g;
2687                 v_dst.val[2] = v_r;
2688                 v_dst.val[3] = v_alpha;
2689                 vst4_u8(dst, v_dst);
2690             }
2691         }
2692
2693         for ( ; i < n; i += 3, dst += dcn)
2694         {
2695             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
2696             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
2697             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
2698             dst[0] = saturate_cast<uchar>(B); dst[1] = saturate_cast<uchar>(G);
2699             dst[2] = saturate_cast<uchar>(R);
2700             if( dcn == 4 )
2701                 dst[3] = alpha;
2702         }
2703     }
2704     int dstcn, blueIdx;
2705     int coeffs[9];
2706
2707     int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
2708     uint8x8_t v_alpha;
2709     int32x4_t v_delta;
2710 };
2711
2712 template <>
2713 struct XYZ2RGB_i<ushort>
2714 {
2715     typedef ushort channel_type;
2716
2717     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2718     : dstcn(_dstcn), blueIdx(_blueIdx)
2719     {
2720         static const int coeffs0[] =
2721         {
2722             13273,  -6296,  -2042,
2723             -3970,   7684,    170,
2724               228,   -836,   4331
2725         };
2726         for(int i = 0; i < 9; i++)
2727             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
2728
2729         if(blueIdx == 0)
2730         {
2731             std::swap(coeffs[0], coeffs[6]);
2732             std::swap(coeffs[1], coeffs[7]);
2733             std::swap(coeffs[2], coeffs[8]);
2734         }
2735
2736         v_c0 = vdupq_n_s32(coeffs[0]);
2737         v_c1 = vdupq_n_s32(coeffs[1]);
2738         v_c2 = vdupq_n_s32(coeffs[2]);
2739         v_c3 = vdupq_n_s32(coeffs[3]);
2740         v_c4 = vdupq_n_s32(coeffs[4]);
2741         v_c5 = vdupq_n_s32(coeffs[5]);
2742         v_c6 = vdupq_n_s32(coeffs[6]);
2743         v_c7 = vdupq_n_s32(coeffs[7]);
2744         v_c8 = vdupq_n_s32(coeffs[8]);
2745         v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
2746         v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
2747         v_alpha2 = vget_low_u16(v_alpha);
2748     }
2749
2750     void operator()(const ushort* src, ushort* dst, int n) const
2751     {
2752         int dcn = dstcn, i = 0;
2753         ushort alpha = ColorChannel<ushort>::max();
2754         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
2755             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
2756             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
2757         n *= 3;
2758
2759         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
2760         {
2761             uint16x8x3_t v_src = vld3q_u16(src + i);
2762             int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
2763                       v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
2764                       v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
2765
2766             int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2767             int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2768             int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2769             v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
2770             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
2771             v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
2772
2773             v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
2774             v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
2775             v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
2776
2777             int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2778             int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2779             int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2780             v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
2781             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
2782             v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
2783
2784             uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
2785             uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
2786             uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
2787
2788             if (dcn == 3)
2789             {
2790                 uint16x8x3_t v_dst;
2791                 v_dst.val[0] = v_b;
2792                 v_dst.val[1] = v_g;
2793                 v_dst.val[2] = v_r;
2794                 vst3q_u16(dst, v_dst);
2795             }
2796             else
2797             {
2798                 uint16x8x4_t v_dst;
2799                 v_dst.val[0] = v_b;
2800                 v_dst.val[1] = v_g;
2801                 v_dst.val[2] = v_r;
2802                 v_dst.val[3] = v_alpha;
2803                 vst4q_u16(dst, v_dst);
2804             }
2805         }
2806
2807         for ( ; i <= n - 12; i += 12, dst += dcn * 4)
2808         {
2809             uint16x4x3_t v_src = vld3_u16(src + i);
2810             int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
2811                       v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
2812                       v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
2813
2814             int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
2815             int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
2816             int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
2817             v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
2818             v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
2819             v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
2820
2821             uint16x4_t v_b = vqmovun_s32(v_X);
2822             uint16x4_t v_g = vqmovun_s32(v_Y);
2823             uint16x4_t v_r = vqmovun_s32(v_Z);
2824
2825             if (dcn == 3)
2826             {
2827                 uint16x4x3_t v_dst;
2828                 v_dst.val[0] = v_b;
2829                 v_dst.val[1] = v_g;
2830                 v_dst.val[2] = v_r;
2831                 vst3_u16(dst, v_dst);
2832             }
2833             else
2834             {
2835                 uint16x4x4_t v_dst;
2836                 v_dst.val[0] = v_b;
2837                 v_dst.val[1] = v_g;
2838                 v_dst.val[2] = v_r;
2839                 v_dst.val[3] = v_alpha2;
2840                 vst4_u16(dst, v_dst);
2841             }
2842         }
2843
2844         for ( ; i < n; i += 3, dst += dcn)
2845         {
2846             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
2847             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
2848             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
2849             dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
2850             dst[2] = saturate_cast<ushort>(R);
2851             if( dcn == 4 )
2852                 dst[3] = alpha;
2853         }
2854     }
2855     int dstcn, blueIdx;
2856     int coeffs[9];
2857
2858     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
2859     uint16x4_t v_alpha2;
2860     uint16x8_t v_alpha;
2861 };
2862
2863 #endif
2864
2865 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
2866
2867
2868 struct RGB2HSV_b
2869 {
2870     typedef uchar channel_type;
2871
2872     RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
2873     : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
2874     {
2875         CV_Assert( hrange == 180 || hrange == 256 );
2876     }
2877
2878     void operator()(const uchar* src, uchar* dst, int n) const
2879     {
2880         int i, bidx = blueIdx, scn = srccn;
2881         const int hsv_shift = 12;
2882
2883         static int sdiv_table[256];
2884         static int hdiv_table180[256];
2885         static int hdiv_table256[256];
2886         static volatile bool initialized = false;
2887
2888         int hr = hrange;
2889         const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
2890         n *= 3;
2891
2892         if( !initialized )
2893         {
2894             sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
2895             for( i = 1; i < 256; i++ )
2896             {
2897                 sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
2898                 hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
2899                 hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
2900             }
2901             initialized = true;
2902         }
2903
2904         for( i = 0; i < n; i += 3, src += scn )
2905         {
2906             int b = src[bidx], g = src[1], r = src[bidx^2];
2907             int h, s, v = b;
2908             int vmin = b, diff;
2909             int vr, vg;
2910
2911             CV_CALC_MAX_8U( v, g );
2912             CV_CALC_MAX_8U( v, r );
2913             CV_CALC_MIN_8U( vmin, g );
2914             CV_CALC_MIN_8U( vmin, r );
2915
2916             diff = v - vmin;
2917             vr = v == r ? -1 : 0;
2918             vg = v == g ? -1 : 0;
2919
2920             s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
2921             h = (vr & (g - b)) +
2922                 (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
2923             h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
2924             h += h < 0 ? hr : 0;
2925
2926             dst[i] = saturate_cast<uchar>(h);
2927             dst[i+1] = (uchar)s;
2928             dst[i+2] = (uchar)v;
2929         }
2930     }
2931
2932     int srccn, blueIdx, hrange;
2933 };
2934
2935
// Floating-point RGB -> HSV functor.
struct RGB2HSV_f
{
    typedef float channel_type;

    // _srccn   - distance, in channels, between consecutive source pixels.
    // _blueIdx - index of the blue channel inside a pixel; red is read from
    //            index (_blueIdx ^ 2) and green from index 1.
    // _hrange  - value the hue is scaled to (hue in degrees * _hrange / 360).
    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    // Converts n pixels from src into n packed (h, s, v) float triplets in dst.
    void operator()(const float* src, float* dst, int n) const
    {
        const int bidx = blueIdx, ridx = blueIdx^2, scn = srccn;
        const float hscale = hrange*(1.f/360.f);

        for( int px = 0; px < n; px++, src += scn, dst += 3 )
        {
            const float b = src[bidx], g = src[1], r = src[ridx];

            // v = max(r, g, b), vmin = min(r, g, b)
            float v = r, vmin = r;
            if( v < g ) v = g;
            if( v < b ) v = b;
            if( vmin > g ) vmin = g;
            if( vmin > b ) vmin = b;

            const float range = v - vmin;
            // FLT_EPSILON keeps both divisions finite when v or range is zero.
            const float s = range/(float)(fabs(v) + FLT_EPSILON);
            const float k = (float)(60./(range + FLT_EPSILON));

            // Hue in degrees, chosen by which channel holds the maximum.
            float h;
            if( v == r )
                h = (g - b)*k;
            else if( v == g )
                h = (b - r)*k + 120.f;
            else
                h = (r - g)*k + 240.f;

            if( h < 0 )
                h += 360.f;

            dst[0] = h*hscale;
            dst[1] = s;
            dst[2] = v;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
2983
2984
2985 struct HSV2RGB_f
2986 {
2987     typedef float channel_type;
2988
2989     HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
2990     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
2991
2992     void operator()(const float* src, float* dst, int n) const
2993     {
2994         int i, bidx = blueIdx, dcn = dstcn;
2995         float _hscale = hscale;
2996         float alpha = ColorChannel<float>::max();
2997         n *= 3;
2998
2999         for( i = 0; i < n; i += 3, dst += dcn )
3000         {
3001             float h = src[i], s = src[i+1], v = src[i+2];
3002             float b, g, r;
3003
3004             if( s == 0 )
3005                 b = g = r = v;
3006             else
3007             {
3008                 static const int sector_data[][3]=
3009                     {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
3010                 float tab[4];
3011                 int sector;
3012                 h *= _hscale;
3013                 if( h < 0 )
3014                     do h += 6; while( h < 0 );
3015                 else if( h >= 6 )
3016                     do h -= 6; while( h >= 6 );
3017                 sector = cvFloor(h);
3018                 h -= sector;
3019                 if( (unsigned)sector >= 6u )
3020                 {
3021                     sector = 0;
3022                     h = 0.f;
3023                 }
3024
3025                 tab[0] = v;
3026                 tab[1] = v*(1.f - s);
3027                 tab[2] = v*(1.f - s*h);
3028                 tab[3] = v*(1.f - s*(1.f - h));
3029
3030                 b = tab[sector_data[sector][0]];
3031                 g = tab[sector_data[sector][1]];
3032                 r = tab[sector_data[sector][2]];
3033             }
3034
3035             dst[bidx] = b;
3036             dst[1] = g;
3037             dst[bidx^2] = r;
3038             if( dcn == 4 )
3039                 dst[3] = alpha;
3040         }
3041     }
3042
3043     int dstcn, blueIdx;
3044     float hscale;
3045 };
3046
3047
3048 struct HSV2RGB_b
3049 {
3050     typedef uchar channel_type;
3051
3052     HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
3053     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
3054     {
3055         #if CV_NEON
3056         v_scale_inv = vdupq_n_f32(1.f/255.f);
3057         v_scale = vdupq_n_f32(255.f);
3058         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
3059         #endif
3060     }
3061
3062     void operator()(const uchar* src, uchar* dst, int n) const
3063     {
3064         int i, j, dcn = dstcn;
3065         uchar alpha = ColorChannel<uchar>::max();
3066         float buf[3*BLOCK_SIZE];
3067
3068         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
3069         {
3070             int dn = std::min(n - i, (int)BLOCK_SIZE);
3071             j = 0;
3072
3073             #if CV_NEON
3074             for ( ; j <= (dn - 8) * 3; j += 24)
3075             {
3076                 uint8x8x3_t v_src = vld3_u8(src + j);
3077                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
3078                            v_t1 = vmovl_u8(v_src.val[1]),
3079                            v_t2 = vmovl_u8(v_src.val[2]);
3080
3081                 float32x4x3_t v_dst;
3082                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
3083                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
3084                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
3085                 vst3q_f32(buf + j, v_dst);
3086
3087                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
3088                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
3089                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
3090                 vst3q_f32(buf + j + 12, v_dst);
3091             }
3092             #endif
3093
3094             for( ; j < dn*3; j += 3 )
3095             {
3096                 buf[j] = src[j];
3097                 buf[j+1] = src[j+1]*(1.f/255.f);
3098                 buf[j+2] = src[j+2]*(1.f/255.f);
3099             }
3100             cvt(buf, buf, dn);
3101
3102             j = 0;
3103             #if CV_NEON
3104             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
3105             {
3106                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
3107                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
3108                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
3109                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
3110                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
3111                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
3112                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
3113
3114                 if (dcn == 4)
3115                 {
3116                     uint8x8x4_t v_dst;
3117                     v_dst.val[0] = v_dst0;
3118                     v_dst.val[1] = v_dst1;
3119                     v_dst.val[2] = v_dst2;
3120                     v_dst.val[3] = v_alpha;
3121                     vst4_u8(dst, v_dst);
3122                 }
3123                 else
3124                 {
3125                     uint8x8x3_t v_dst;
3126                     v_dst.val[0] = v_dst0;
3127                     v_dst.val[1] = v_dst1;
3128                     v_dst.val[2] = v_dst2;
3129                     vst3_u8(dst, v_dst);
3130                 }
3131             }
3132             #endif
3133
3134             for( ; j < dn*3; j += 3, dst += dcn )
3135             {
3136                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
3137                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
3138                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
3139                 if( dcn == 4 )
3140                     dst[3] = alpha;
3141             }
3142         }
3143     }
3144
3145     int dstcn;
3146     HSV2RGB_f cvt;
3147     #if CV_NEON
3148     float32x4_t v_scale, v_scale_inv;
3149     uint8x8_t v_alpha;
3150     #endif
3151 };
3152
3153
3154 ///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
3155
// Floating-point RGB/BGR(A) -> HLS converter.
// Each output pixel is the triplet (H, L, S): H is the hue in degrees
// multiplied by hrange/360, L and S are computed from the channel extrema.
struct RGB2HLS_f
{
    typedef float channel_type;

    // _srccn   - number of floats per source pixel (the stride between pixels)
    // _blueIdx - index of the blue channel inside a source pixel; red is read
    //            from index _blueIdx^2, green always from index 1
    // _hrange  - value a full 360-degree hue turn is mapped to
    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    // Converts n pixels from src into n packed (H, L, S) triplets in dst.
    // A pixel is read completely before its result is written.
    void operator()(const float* src, float* dst, int n) const
    {
        const int step = srccn;
        const int bluePos = blueIdx, redPos = blueIdx ^ 2;
        const float hueScale = hrange*(1.f/360.f);

        for( int px = 0; px < n; ++px, src += step, dst += 3 )
        {
            const float blue = src[bluePos], green = src[1], red = src[redPos];

            // Largest and smallest of the three channels.
            float hi = red, lo = red;
            if( hi < green ) hi = green;
            if( hi < blue ) hi = blue;
            if( lo > green ) lo = green;
            if( lo > blue ) lo = blue;

            const float spread = hi - lo;
            const float lightness = (hi + lo)*0.5f;
            float hue = 0.f, saturation = 0.f;

            // Achromatic pixels (no spread) keep hue = saturation = 0.
            if( spread > FLT_EPSILON )
            {
                if( lightness < 0.5f )
                    saturation = spread/(hi + lo);
                else
                    saturation = spread/(2 - hi - lo);

                const float degPerUnit = 60.f/spread;

                // The dominant channel selects the 120-degree hue sector.
                if( hi == red )
                    hue = (green - blue)*degPerUnit;
                else if( hi == green )
                    hue = (blue - red)*degPerUnit + 120.f;
                else
                    hue = (red - green)*degPerUnit + 240.f;

                if( hue < 0.f )
                    hue += 360.f;
            }

            dst[0] = hue*hueScale;
            dst[1] = lightness;
            dst[2] = saturation;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
3208
3209
3210 struct RGB2HLS_b
3211 {
3212     typedef uchar channel_type;
3213
3214     RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
3215     : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange)
3216     {
3217         #if CV_NEON
3218         v_scale_inv = vdupq_n_f32(1.f/255.f);
3219         v_scale = vdupq_n_f32(255.f);
3220         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
3221         #endif
3222     }
3223
3224     void operator()(const uchar* src, uchar* dst, int n) const
3225     {
3226         int i, j, scn = srccn;
3227         float buf[3*BLOCK_SIZE];
3228
3229         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
3230         {
3231             int dn = std::min(n - i, (int)BLOCK_SIZE);
3232             j = 0;
3233
3234             #if CV_NEON
3235             for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
3236             {
3237                 uint16x8_t v_t0, v_t1, v_t2;
3238
3239                 if (scn == 3)
3240                 {
3241                     uint8x8x3_t v_src = vld3_u8(src);
3242                     v_t0 = vmovl_u8(v_src.val[0]);
3243                     v_t1 = vmovl_u8(v_src.val[1]);
3244                     v_t2 = vmovl_u8(v_src.val[2]);
3245                 }
3246                 else
3247                 {
3248                     uint8x8x4_t v_src = vld4_u8(src);
3249                     v_t0 = vmovl_u8(v_src.val[0]);
3250                     v_t1 = vmovl_u8(v_src.val[1]);
3251                     v_t2 = vmovl_u8(v_src.val[2]);
3252                 }
3253
3254                 float32x4x3_t v_dst;
3255                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
3256                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
3257                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
3258                 vst3q_f32(buf + j, v_dst);
3259
3260                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
3261                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
3262                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
3263                 vst3q_f32(buf + j + 12, v_dst);
3264             }
3265             #endif
3266             for( ; j < dn*3; j += 3, src += scn )
3267             {
3268                 buf[j] = src[0]*(1.f/255.f);
3269                 buf[j+1] = src[1]*(1.f/255.f);
3270                 buf[j+2] = src[2]*(1.f/255.f);
3271             }
3272             cvt(buf, buf, dn);
3273
3274             j = 0;
3275             #if CV_NEON
3276             for ( ; j <= (dn - 8) * 3; j += 24)
3277             {
3278                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
3279
3280                 uint8x8x3_t v_dst;
3281                 v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])),
3282                                                        vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0]))));
3283                 v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
3284                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
3285                 v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
3286                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
3287                 vst3_u8(dst + j, v_dst);
3288             }
3289             #endif
3290             for( ; j < dn*3; j += 3 )
3291             {
3292                 dst[j] = saturate_cast<uchar>(buf[j]);
3293                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
3294                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
3295             }
3296         }
3297     }
3298
3299     int srccn;
3300     RGB2HLS_f cvt;
3301     #if CV_NEON
3302     float32x4_t v_scale, v_scale_inv;
3303     uint8x8_t v_alpha;
3304     #endif
3305 };
3306
3307
3308 struct HLS2RGB_f
3309 {
3310     typedef float channel_type;
3311
3312     HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
3313     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
3314
3315     void operator()(const float* src, float* dst, int n) const
3316     {
3317         int i, bidx = blueIdx, dcn = dstcn;
3318         float _hscale = hscale;
3319         float alpha = ColorChannel<float>::max();
3320         n *= 3;
3321
3322         for( i = 0; i < n; i += 3, dst += dcn )
3323         {
3324             float h = src[i], l = src[i+1], s = src[i+2];
3325             float b, g, r;
3326
3327             if( s == 0 )
3328                 b = g = r = l;
3329             else
3330             {
3331                 static const int sector_data[][3]=
3332                 {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
3333                 float tab[4];
3334                 int sector;
3335
3336                 float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
3337                 float p1 = 2*l - p2;
3338
3339                 h *= _hscale;
3340                 if( h < 0 )
3341                     do h += 6; while( h < 0 );
3342                 else if( h >= 6 )
3343                     do h -= 6; while( h >= 6 );
3344
3345                 assert( 0 <= h && h < 6 );
3346                 sector = cvFloor(h);
3347                 h -= sector;
3348
3349                 tab[0] = p2;
3350                 tab[1] = p1;
3351                 tab[2] = p1 + (p2 - p1)*(1-h);
3352                 tab[3] = p1 + (p2 - p1)*h;
3353
3354                 b = tab[sector_data[sector][0]];
3355                 g = tab[sector_data[sector][1]];
3356                 r = tab[sector_data[sector][2]];
3357             }
3358
3359             dst[bidx] = b;
3360             dst[1] = g;
3361             dst[bidx^2] = r;
3362             if( dcn == 4 )
3363                 dst[3] = alpha;
3364         }
3365     }
3366
3367     int dstcn, blueIdx;
3368     float hscale;
3369 };
3370
3371
3372 struct HLS2RGB_b
3373 {
3374     typedef uchar channel_type;
3375
3376     HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
3377     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
3378     {
3379         #if CV_NEON
3380         v_scale_inv = vdupq_n_f32(1.f/255.f);
3381         v_scale = vdupq_n_f32(255.f);
3382         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
3383         #endif
3384     }
3385
3386     void operator()(const uchar* src, uchar* dst, int n) const
3387     {
3388         int i, j, dcn = dstcn;
3389         uchar alpha = ColorChannel<uchar>::max();
3390         float buf[3*BLOCK_SIZE];
3391
3392         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
3393         {
3394             int dn = std::min(n - i, (int)BLOCK_SIZE);
3395             j = 0;
3396
3397             #if CV_NEON
3398             for ( ; j <= (dn - 8) * 3; j += 24)
3399             {
3400                 uint8x8x3_t v_src = vld3_u8(src + j);
3401                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
3402                            v_t1 = vmovl_u8(v_src.val[1]),
3403                            v_t2 = vmovl_u8(v_src.val[2]);
3404
3405                 float32x4x3_t v_dst;
3406                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
3407                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
3408                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
3409                 vst3q_f32(buf + j, v_dst);
3410
3411                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
3412                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
3413                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
3414                 vst3q_f32(buf + j + 12, v_dst);
3415             }
3416             #endif
3417             for( ; j < dn*3; j += 3 )
3418             {
3419                 buf[j] = src[j];
3420                 buf[j+1] = src[j+1]*(1.f/255.f);
3421                 buf[j+2] = src[j+2]*(1.f/255.f);
3422             }
3423             cvt(buf, buf, dn);
3424
3425             j = 0;
3426             #if CV_NEON
3427             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
3428             {
3429                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
3430                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
3431                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
3432                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
3433                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
3434                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
3435                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
3436
3437                 if (dcn == 4)
3438                 {
3439                     uint8x8x4_t v_dst;
3440                     v_dst.val[0] = v_dst0;
3441                     v_dst.val[1] = v_dst1;
3442                     v_dst.val[2] = v_dst2;
3443                     v_dst.val[3] = v_alpha;
3444                     vst4_u8(dst, v_dst);
3445                 }
3446                 else
3447                 {
3448                     uint8x8x3_t v_dst;
3449                     v_dst.val[0] = v_dst0;
3450                     v_dst.val[1] = v_dst1;
3451                     v_dst.val[2] = v_dst2;
3452                     vst3_u8(dst, v_dst);
3453                 }
3454             }
3455             #endif
3456             for( ; j < dn*3; j += 3, dst += dcn )
3457             {
3458                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
3459                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
3460                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
3461                 if( dcn == 4 )
3462                     dst[3] = alpha;
3463             }
3464         }
3465     }
3466
3467     int dstcn;
3468     HLS2RGB_f cvt;
3469     #if CV_NEON
3470     float32x4_t v_scale, v_scale_inv;
3471     uint8x8_t v_alpha;
3472     #endif
3473 };
3474
3475
3476 ///////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////
3477
3478 static const float D65[] = { 0.950456f, 1.f, 1.088754f };
3479
3480 enum { LAB_CBRT_TAB_SIZE = 1024, GAMMA_TAB_SIZE = 1024 };
3481 static float LabCbrtTab[LAB_CBRT_TAB_SIZE*4];
3482 static const float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
3483
3484 static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];
3485 static const float GammaTabScale = (float)GAMMA_TAB_SIZE;
3486
3487 static ushort sRGBGammaTab_b[256], linearGammaTab_b[256];
3488 #undef lab_shift
3489 #define lab_shift xyz_shift
3490 #define gamma_shift 3
3491 #define lab_shift2 (lab_shift + gamma_shift)
3492 #define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
3493 static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
3494
3495 static void initLabTabs()
3496 {
3497     static bool initialized = false;
3498     if(!initialized)
3499     {
3500         float f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1], scale = 1.f/LabCbrtTabScale;
3501         int i;
3502         for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)
3503         {
3504             float x = i*scale;
3505             f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x);
3506         }
3507         splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab);
3508
3509         scale = 1.f/GammaTabScale;
3510         for(i = 0; i <= GAMMA_TAB_SIZE; i++)
3511         {
3512             float x = i*scale;
3513             g[i] = x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4);
3514             ig[i] = x <= 0.0031308 ? x*12.92f : (float)(1.055*std::pow((double)x, 1./2.4) - 0.055);
3515         }
3516         splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab);
3517         splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab);
3518
3519         for(i = 0; i < 256; i++)
3520         {
3521             float x = i*(1.f/255.f);
3522             sRGBGammaTab_b[i] = saturate_cast<ushort>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4)));
3523             linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
3524         }
3525
3526         for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
3527         {
3528             float x = i*(1.f/(255.f*(1 << gamma_shift)));
3529             LabCbrtTab_b[i] = saturate_cast<ushort>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
3530         }
3531         initialized = true;
3532     }
3533 }
3534
3535 struct RGB2Lab_b
3536 {
3537     typedef uchar channel_type;
3538
3539     RGB2Lab_b(int _srccn, int blueIdx, const float* _coeffs,
3540               const float* _whitept, bool _srgb)
3541     : srccn(_srccn), srgb(_srgb)
3542     {
3543         static volatile int _3 = 3;
3544         initLabTabs();
3545
3546         if (!_coeffs)
3547             _coeffs = sRGB2XYZ_D65;
3548         if (!_whitept)
3549             _whitept = D65;
3550
3551         float scale[] =
3552         {
3553             (1 << lab_shift)/_whitept[0],
3554             (float)(1 << lab_shift),
3555             (1 << lab_shift)/_whitept[2]
3556         };
3557
3558         for( int i = 0; i < _3; i++ )
3559         {
3560             coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]);
3561             coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
3562             coeffs[i*3+blueIdx] = cvRound(_coeffs[i*3+2]*scale[i]);
3563
3564             CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
3565                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
3566         }
3567     }
3568
3569     void operator()(const uchar* src, uchar* dst, int n) const
3570     {
3571         const int Lscale = (116*255+50)/100;
3572         const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
3573         const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
3574         int i, scn = srccn;
3575         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3576             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3577             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3578         n *= 3;
3579
3580         for( i = 0; i < n; i += 3, src += scn )
3581         {
3582             int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
3583             int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
3584             int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
3585             int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
3586
3587             int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
3588             int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
3589             int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
3590
3591             dst[i] = saturate_cast<uchar>(L);
3592             dst[i+1] = saturate_cast<uchar>(a);
3593             dst[i+2] = saturate_cast<uchar>(b);
3594         }
3595     }
3596
3597     int srccn;
3598     int coeffs[9];
3599     bool srgb;
3600 };
3601
3602
// Clamps a float expression to [0, 1]. The argument and the whole expansion
// are parenthesized and there is no trailing ';', so the macro behaves like
// an ordinary expression. The argument is still evaluated more than once.
#define clip(value) \
    ((value) < 0.0f ? 0.0f : (value) > 1.0f ? 1.0f : (value))
3605
3606 struct RGB2Lab_f
3607 {
3608     typedef float channel_type;
3609
3610     RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,
3611               const float* _whitept, bool _srgb)
3612     : srccn(_srccn), srgb(_srgb)
3613     {
3614         volatile int _3 = 3;
3615         initLabTabs();
3616
3617         if (!_coeffs)
3618             _coeffs = sRGB2XYZ_D65;
3619         if (!_whitept)
3620             _whitept = D65;
3621
3622         float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
3623
3624         for( int i = 0; i < _3; i++ )
3625         {
3626             int j = i * 3;
3627             coeffs[j + (blueIdx ^ 2)] = _coeffs[j] * scale[i];
3628             coeffs[j + 1] = _coeffs[j + 1] * scale[i];
3629             coeffs[j + blueIdx] = _coeffs[j + 2] * scale[i];
3630
3631             CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
3632                        coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*LabCbrtTabScale );
3633         }
3634     }
3635
3636     void operator()(const float* src, float* dst, int n) const
3637     {
3638         int i, scn = srccn;
3639         float gscale = GammaTabScale;
3640         const float* gammaTab = srgb ? sRGBGammaTab : 0;
3641         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3642               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3643               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3644         n *= 3;
3645
3646         static const float _1_3 = 1.0f / 3.0f;
3647         static const float _a = 16.0f / 116.0f;
3648         for (i = 0; i < n; i += 3, src += scn )
3649         {
3650             float R = clip(src[0]);
3651             float G = clip(src[1]);
3652             float B = clip(src[2]);
3653
3654             if (gammaTab)
3655             {
3656                 R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
3657                 G = splineInterpolate(G * gscale, gammaTab, GAMMA_TAB_SIZE);
3658                 B = splineInterpolate(B * gscale, gammaTab, GAMMA_TAB_SIZE);
3659             }
3660             float X = R*C0 + G*C1 + B*C2;
3661             float Y = R*C3 + G*C4 + B*C5;
3662             float Z = R*C6 + G*C7 + B*C8;
3663
3664             float FX = X > 0.008856f ? std::pow(X, _1_3) : (7.787f * X + _a);
3665             float FY = Y > 0.008856f ? std::pow(Y, _1_3) : (7.787f * Y + _a);
3666             float FZ = Z > 0.008856f ? std::pow(Z, _1_3) : (7.787f * Z + _a);
3667
3668             float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
3669             float a = 500.f * (FX - FY);
3670             float b = 200.f * (FY - FZ);
3671
3672             dst[i] = L;
3673             dst[i + 1] = a;
3674             dst[i + 2] = b;
3675         }
3676     }
3677
3678     int srccn;
3679     float coeffs[9];
3680     bool srgb;
3681 };
3682
3683 struct Lab2RGB_f
3684 {
3685     typedef float channel_type;
3686
3687     Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
3688               const float* _whitept, bool _srgb )
3689     : dstcn(_dstcn), srgb(_srgb)
3690     {
3691         initLabTabs();
3692
3693         if(!_coeffs)
3694             _coeffs = XYZ2sRGB_D65;
3695         if(!_whitept)
3696             _whitept = D65;
3697
3698         for( int i = 0; i < 3; i++ )
3699         {
3700             coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i];
3701             coeffs[i+3] = _coeffs[i+3]*_whitept[i];
3702             coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];
3703         }
3704     }
3705
3706     void operator()(const float* src, float* dst, int n) const
3707     {
3708         int i, dcn = dstcn;
3709         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
3710         float gscale = GammaTabScale;
3711         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3712         C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3713         C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3714         float alpha = ColorChannel<float>::max();
3715         n *= 3;
3716
3717         static const float lThresh = 0.008856f * 903.3f;
3718         static const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
3719         for (i = 0; i < n; i += 3, dst += dcn)
3720         {
3721             float li = src[i];
3722             float ai = src[i + 1];
3723             float bi = src[i + 2];
3724
3725             float y, fy;
3726             if (li <= lThresh)
3727             {
3728                 y = li / 903.3f;
3729                 fy = 7.787f * y + 16.0f / 116.0f;
3730             }
3731             else
3732             {
3733                 fy = (li + 16.0f) / 116.0f;
3734                 y = fy * fy * fy;
3735             }
3736
3737             float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
3738
3739             for (int j = 0; j < 2; j++)
3740                 if (fxz[j] <= fThresh)
3741                     fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
3742                 else
3743                     fxz[j] = fxz[j] * fxz[j] * fxz[j];
3744
3745
3746             float x = fxz[0], z = fxz[1];
3747             float ro = C0 * x + C1 * y + C2 * z;
3748             float go = C3 * x + C4 * y + C5 * z;
3749             float bo = C6 * x + C7 * y + C8 * z;
3750             ro = clip(ro);
3751             go = clip(go);
3752             bo = clip(bo);
3753
3754             if (gammaTab)
3755             {
3756                 ro = splineInterpolate(ro * gscale, gammaTab, GAMMA_TAB_SIZE);
3757                 go = splineInterpolate(go * gscale, gammaTab, GAMMA_TAB_SIZE);
3758                 bo = splineInterpolate(bo * gscale, gammaTab, GAMMA_TAB_SIZE);
3759             }
3760
3761             dst[0] = ro, dst[1] = go, dst[2] = bo;
3762             if( dcn == 4 )
3763                 dst[3] = alpha;
3764         }
3765     }
3766
3767     int dstcn;
3768     float coeffs[9];
3769     bool srgb;
3770 };
3771
3772 #undef clip
3773
3774 struct Lab2RGB_b
3775 {
3776     typedef uchar channel_type;
3777
3778     Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
3779                const float* _whitept, bool _srgb )
3780     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
3781     {
3782         #if CV_NEON
3783         v_scale_inv = vdupq_n_f32(100.f/255.f);
3784         v_scale = vdupq_n_f32(255.f);
3785         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
3786         v_128 = vdupq_n_f32(128.0f);
3787         #endif
3788     }
3789
3790     void operator()(const uchar* src, uchar* dst, int n) const
3791     {
3792         int i, j, dcn = dstcn;
3793         uchar alpha = ColorChannel<uchar>::max();
3794         float buf[3*BLOCK_SIZE];
3795
3796         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
3797         {
3798             int dn = std::min(n - i, (int)BLOCK_SIZE);
3799             j = 0;
3800
3801             #if CV_NEON
3802             for ( ; j <= (dn - 8) * 3; j += 24)
3803             {
3804                 uint8x8x3_t v_src = vld3_u8(src + j);
3805                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
3806                            v_t1 = vmovl_u8(v_src.val[1]),
3807                            v_t2 = vmovl_u8(v_src.val[2]);
3808
3809                 float32x4x3_t v_dst;
3810                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
3811                 v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_128);
3812                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_128);
3813                 vst3q_f32(buf + j, v_dst);
3814
3815                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
3816                 v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_128);
3817                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128);
3818                 vst3q_f32(buf + j + 12, v_dst);
3819             }
3820             #endif
3821
3822             for( ; j < dn*3; j += 3 )
3823             {
3824                 buf[j] = src[j]*(100.f/255.f);
3825                 buf[j+1] = (float)(src[j+1] - 128);
3826                 buf[j+2] = (float)(src[j+2] - 128);
3827             }
3828             cvt(buf, buf, dn);
3829             j = 0;
3830
3831             #if CV_NEON
3832             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
3833             {
3834                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
3835                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
3836                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
3837                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
3838                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
3839                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
3840                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
3841
3842                 if (dcn == 4)
3843                 {
3844                     uint8x8x4_t v_dst;
3845                     v_dst.val[0] = v_dst0;
3846                     v_dst.val[1] = v_dst1;
3847                     v_dst.val[2] = v_dst2;
3848                     v_dst.val[3] = v_alpha;
3849                     vst4_u8(dst, v_dst);
3850                 }
3851                 else
3852                 {
3853                     uint8x8x3_t v_dst;
3854                     v_dst.val[0] = v_dst0;
3855                     v_dst.val[1] = v_dst1;
3856                     v_dst.val[2] = v_dst2;
3857                     vst3_u8(dst, v_dst);
3858                 }
3859             }
3860             #endif
3861
3862             for( ; j < dn*3; j += 3, dst += dcn )
3863             {
3864                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
3865                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
3866                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
3867                 if( dcn == 4 )
3868                     dst[3] = alpha;
3869             }
3870         }
3871     }
3872
3873     int dstcn;
3874     Lab2RGB_f cvt;
3875
3876     #if CV_NEON
3877     float32x4_t v_scale, v_scale_inv, v_128;
3878     uint8x8_t v_alpha;
3879     #endif
3880 };
3881
3882
3883 ///////////////////////////////////// RGB <-> L*u*v* /////////////////////////////////////
3884
3885 struct RGB2Luv_f
3886 {
3887     typedef float channel_type;
3888
3889     RGB2Luv_f( int _srccn, int blueIdx, const float* _coeffs,
3890                const float* whitept, bool _srgb )
3891     : srccn(_srccn), srgb(_srgb)
3892     {
3893         volatile int i;
3894         initLabTabs();
3895
3896         if(!_coeffs) _coeffs = sRGB2XYZ_D65;
3897         if(!whitept) whitept = D65;
3898
3899         for( i = 0; i < 3; i++ )
3900         {
3901             coeffs[i*3] = _coeffs[i*3];
3902             coeffs[i*3+1] = _coeffs[i*3+1];
3903             coeffs[i*3+2] = _coeffs[i*3+2];
3904             if( blueIdx == 0 )
3905                 std::swap(coeffs[i*3], coeffs[i*3+2]);
3906             CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
3907                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f );
3908         }
3909
3910         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
3911         un = 4*whitept[0]*d;
3912         vn = 9*whitept[1]*d;
3913
3914         CV_Assert(whitept[1] == 1.f);
3915     }
3916
3917     void operator()(const float* src, float* dst, int n) const
3918     {
3919         int i, scn = srccn;
3920         float gscale = GammaTabScale;
3921         const float* gammaTab = srgb ? sRGBGammaTab : 0;
3922         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3923               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3924               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3925         float _un = 13*un, _vn = 13*vn;
3926         n *= 3;
3927
3928         for( i = 0; i < n; i += 3, src += scn )
3929         {
3930             float R = src[0], G = src[1], B = src[2];
3931             if( gammaTab )
3932             {
3933                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
3934                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
3935                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
3936             }
3937
3938             float X = R*C0 + G*C1 + B*C2;
3939             float Y = R*C3 + G*C4 + B*C5;
3940             float Z = R*C6 + G*C7 + B*C8;
3941
3942             float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
3943             L = 116.f*L - 16.f;
3944
3945             float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
3946             float u = L*(X*d - _un);
3947             float v = L*((9*0.25f)*Y*d - _vn);
3948
3949             dst[i] = L; dst[i+1] = u; dst[i+2] = v;
3950         }
3951     }
3952
3953     int srccn;
3954     float coeffs[9], un, vn;
3955     bool srgb;
3956 };
3957
3958
3959 struct Luv2RGB_f
3960 {
3961     typedef float channel_type;
3962
3963     Luv2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
3964               const float* whitept, bool _srgb )
3965     : dstcn(_dstcn), srgb(_srgb)
3966     {
3967         initLabTabs();
3968
3969         if(!_coeffs) _coeffs = XYZ2sRGB_D65;
3970         if(!whitept) whitept = D65;
3971
3972         for( int i = 0; i < 3; i++ )
3973         {
3974             coeffs[i+(blueIdx^2)*3] = _coeffs[i];
3975             coeffs[i+3] = _coeffs[i+3];
3976             coeffs[i+blueIdx*3] = _coeffs[i+6];
3977         }
3978
3979         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
3980         un = 4*whitept[0]*d;
3981         vn = 9*whitept[1]*d;
3982
3983         CV_Assert(whitept[1] == 1.f);
3984     }
3985
3986     void operator()(const float* src, float* dst, int n) const
3987     {
3988         int i, dcn = dstcn;
3989         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
3990         float gscale = GammaTabScale;
3991         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3992               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3993               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3994         float alpha = ColorChannel<float>::max();
3995         float _un = un, _vn = vn;
3996         n *= 3;
3997
3998         for( i = 0; i < n; i += 3, dst += dcn )
3999         {
4000             float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z;
4001             Y = (L + 16.f) * (1.f/116.f);
4002             Y = Y*Y*Y;
4003             d = (1.f/13.f)/L;
4004             u = u*d + _un;
4005             v = v*d + _vn;
4006             float iv = 1.f/v;
4007             X = 2.25f * u * Y * iv ;
4008             Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
4009
4010             float R = X*C0 + Y*C1 + Z*C2;
4011             float G = X*C3 + Y*C4 + Z*C5;
4012             float B = X*C6 + Y*C7 + Z*C8;
4013
4014             R = std::min(std::max(R, 0.f), 1.f);
4015             G = std::min(std::max(G, 0.f), 1.f);
4016             B = std::min(std::max(B, 0.f), 1.f);
4017
4018             if( gammaTab )
4019             {
4020                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
4021                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
4022                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
4023             }
4024
4025             dst[0] = R; dst[1] = G; dst[2] = B;
4026             if( dcn == 4 )
4027                 dst[3] = alpha;
4028         }
4029     }
4030
4031     int dstcn;
4032     float coeffs[9], un, vn;
4033     bool srgb;
4034 };
4035
4036
4037 struct RGB2Luv_b
4038 {
4039     typedef uchar channel_type;
4040
4041     RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,
4042                const float* _whitept, bool _srgb )
4043     : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb)
4044     {
4045         #if CV_NEON
4046         v_scale_inv = vdupq_n_f32(1.f/255.f);
4047         v_scale = vdupq_n_f32(2.55f);
4048         v_coeff1 = vdupq_n_f32(0.72033898305084743f);
4049         v_coeff2 = vdupq_n_f32(96.525423728813564f);
4050         v_coeff3 = vdupq_n_f32(0.9732824427480916f);
4051         v_coeff4 = vdupq_n_f32(136.259541984732824f);
4052         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
4053         #endif
4054     }
4055
4056     void operator()(const uchar* src, uchar* dst, int n) const
4057     {
4058         int i, j, scn = srccn;
4059         float buf[3*BLOCK_SIZE];
4060
4061         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
4062         {
4063             int dn = std::min(n - i, (int)BLOCK_SIZE);
4064             j = 0;
4065
4066             #if CV_NEON
4067             for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
4068             {
4069                 uint16x8_t v_t0, v_t1, v_t2;
4070
4071                 if (scn == 3)
4072                 {
4073                     uint8x8x3_t v_src = vld3_u8(src);
4074                     v_t0 = vmovl_u8(v_src.val[0]);
4075                     v_t1 = vmovl_u8(v_src.val[1]);
4076                     v_t2 = vmovl_u8(v_src.val[2]);
4077                 }
4078                 else
4079                 {
4080                     uint8x8x4_t v_src = vld4_u8(src);
4081                     v_t0 = vmovl_u8(v_src.val[0]);
4082                     v_t1 = vmovl_u8(v_src.val[1]);
4083                     v_t2 = vmovl_u8(v_src.val[2]);
4084                 }
4085
4086                 float32x4x3_t v_dst;
4087                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
4088                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
4089                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
4090                 vst3q_f32(buf + j, v_dst);
4091
4092                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
4093                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
4094                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
4095                 vst3q_f32(buf + j + 12, v_dst);
4096             }
4097             #endif
4098             for( ; j < dn*3; j += 3, src += scn )
4099             {
4100                 buf[j] = src[0]*(1.f/255.f);
4101                 buf[j+1] = (float)(src[1]*(1.f/255.f));
4102                 buf[j+2] = (float)(src[2]*(1.f/255.f));
4103             }
4104             cvt(buf, buf, dn);
4105
4106             j = 0;
4107             #if CV_NEON
4108             for ( ; j <= (dn - 8) * 3; j += 24)
4109             {
4110                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
4111
4112                 uint8x8x3_t v_dst;
4113                 v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
4114                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
4115                 v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[1], v_coeff1), v_coeff2))),
4116                                                        vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[1], v_coeff1), v_coeff2)))));
4117                 v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[2], v_coeff3), v_coeff4))),
4118                                                        vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[2], v_coeff3), v_coeff4)))));
4119
4120                 vst3_u8(dst + j, v_dst);
4121             }
4122             #endif
4123
4124             for( ; j < dn*3; j += 3 )
4125             {
4126                 dst[j] = saturate_cast<uchar>(buf[j]*2.55f);
4127                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*0.72033898305084743f + 96.525423728813564f);
4128                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*0.9732824427480916f + 136.259541984732824f);
4129             }
4130         }
4131     }
4132
4133     int srccn;
4134     RGB2Luv_f cvt;
4135
4136     #if CV_NEON
4137     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
4138     uint8x8_t v_alpha;
4139     #endif
4140 };
4141
4142
4143 struct Luv2RGB_b
4144 {
4145     typedef uchar channel_type;
4146
4147     Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
4148                const float* _whitept, bool _srgb )
4149     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
4150     {
4151         #if CV_NEON
4152         v_scale_inv = vdupq_n_f32(100.f/255.f);
4153         v_coeff1 = vdupq_n_f32(1.388235294117647f);
4154         v_coeff2 = vdupq_n_f32(1.027450980392157f);
4155         v_134 = vdupq_n_f32(134.f);
4156         v_140 = vdupq_n_f32(140.f);
4157         v_scale = vdupq_n_f32(255.f);
4158         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
4159         #endif
4160     }
4161
4162     void operator()(const uchar* src, uchar* dst, int n) const
4163     {
4164         int i, j, dcn = dstcn;
4165         uchar alpha = ColorChannel<uchar>::max();
4166         float buf[3*BLOCK_SIZE];
4167
4168         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
4169         {
4170             int dn = std::min(n - i, (int)BLOCK_SIZE);
4171             j = 0;
4172
4173             #if CV_NEON
4174             for ( ; j <= (dn - 8) * 3; j += 24)
4175             {
4176                 uint8x8x3_t v_src = vld3_u8(src + j);
4177                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
4178                            v_t1 = vmovl_u8(v_src.val[1]),
4179                            v_t2 = vmovl_u8(v_src.val[2]);
4180
4181                 float32x4x3_t v_dst;
4182                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
4183                 v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_coeff1), v_134);
4184                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_coeff2), v_140);
4185                 vst3q_f32(buf + j, v_dst);
4186
4187                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
4188                 v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_coeff1), v_134);
4189                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140);
4190                 vst3q_f32(buf + j + 12, v_dst);
4191             }
4192             #endif
4193             for( ; j < dn*3; j += 3 )
4194             {
4195                 buf[j] = src[j]*(100.f/255.f);
4196                 buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f);
4197                 buf[j+2] = (float)(src[j+2]*1.027450980392157f - 140.f);
4198             }
4199             cvt(buf, buf, dn);
4200
4201             j = 0;
4202             #if CV_NEON
4203             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
4204             {
4205                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
4206                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
4207                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
4208                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
4209                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
4210                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
4211                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
4212
4213                 if (dcn == 4)
4214                 {
4215                     uint8x8x4_t v_dst;
4216                     v_dst.val[0] = v_dst0;
4217                     v_dst.val[1] = v_dst1;
4218                     v_dst.val[2] = v_dst2;
4219                     v_dst.val[3] = v_alpha;
4220                     vst4_u8(dst, v_dst);
4221                 }
4222                 else
4223                 {
4224                     uint8x8x3_t v_dst;
4225                     v_dst.val[0] = v_dst0;
4226                     v_dst.val[1] = v_dst1;
4227                     v_dst.val[2] = v_dst2;
4228                     vst3_u8(dst, v_dst);
4229                 }
4230             }
4231             #endif
4232
4233             for( ; j < dn*3; j += 3, dst += dcn )
4234             {
4235                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
4236                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
4237                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
4238                 if( dcn == 4 )
4239                     dst[3] = alpha;
4240             }
4241         }
4242     }
4243
4244     int dstcn;
4245     Luv2RGB_f cvt;
4246
4247     #if CV_NEON
4248     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
4249     uint8x8_t v_alpha;
4250     #endif
4251 };
4252
4253
4254 ///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
4255
// All ITUR_BT_601_* coefficients are fixed-point values scaled by
// 2^ITUR_BT_601_SHIFT; results are brought back with ">> ITUR_BT_601_SHIFT".
const int ITUR_BT_601_SHIFT = 20;

// YUV -> RGB coefficients.
const int ITUR_BT_601_CY  = 1220542;  // gain applied to (Y - 16),  ~1.164 * 2^20
const int ITUR_BT_601_CVR = 1673527;  // (V - 128) term of R,       ~1.596 * 2^20
const int ITUR_BT_601_CVG = -852492;  // (V - 128) term of G,      ~-0.813 * 2^20
const int ITUR_BT_601_CUG = -409993;  // (U - 128) term of G,      ~-0.391 * 2^20
const int ITUR_BT_601_CUB = 2116026;  // (U - 128) term of B,       ~2.018 * 2^20

// RGB -> YUV420p coefficients (C<input channel><output plane>).
const int ITUR_BT_601_CRY =  269484;
const int ITUR_BT_601_CGY =  528482;
const int ITUR_BT_601_CBY =  102760;
const int ITUR_BT_601_CRU = -155188;
const int ITUR_BT_601_CGU = -305135;
const int ITUR_BT_601_CBU =  460324;  // also used as the R weight of V by RGB888toYUV420pInvoker
const int ITUR_BT_601_CGV = -385875;
const int ITUR_BT_601_CBV = -74448;
4272
4273 template<int bIdx, int uIdx>
4274 struct YUV420sp2RGB888Invoker : ParallelLoopBody
4275 {
4276     Mat* dst;
4277     const uchar* my1, *muv;
4278     int width, stride;
4279
4280     YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
4281         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
4282
4283     void operator()(const Range& range) const
4284     {
4285         int rangeBegin = range.start * 2;
4286         int rangeEnd = range.end * 2;
4287
4288         //R = 1.164(Y - 16) + 1.596(V - 128)
4289         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
4290         //B = 1.164(Y - 16)                  + 2.018(U - 128)
4291
4292         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
4293         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
4294         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
4295
4296         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
4297
4298 #ifdef HAVE_TEGRA_OPTIMIZATION
4299         if(tegra::cvtYUV4202RGB(bIdx, uIdx, 3, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
4300             return;
4301 #endif
4302
4303         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
4304         {
4305             uchar* row1 = dst->ptr<uchar>(j);
4306             uchar* row2 = dst->ptr<uchar>(j + 1);
4307             const uchar* y2 = y1 + stride;
4308
4309             for (int i = 0; i < width; i += 2, row1 += 6, row2 += 6)
4310             {
4311                 int u = int(uv[i + 0 + uIdx]) - 128;
4312                 int v = int(uv[i + 1 - uIdx]) - 128;
4313
4314                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4315                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4316                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4317
4318                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
4319                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4320                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4321                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4322
4323                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
4324                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4325                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4326                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4327
4328                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
4329                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
4330                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
4331                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
4332
4333                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
4334                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
4335                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
4336                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
4337             }
4338         }
4339     }
4340 };
4341
4342 template<int bIdx, int uIdx>
4343 struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
4344 {
4345     Mat* dst;
4346     const uchar* my1, *muv;
4347     int width, stride;
4348
4349     YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
4350         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
4351
4352     void operator()(const Range& range) const
4353     {
4354         int rangeBegin = range.start * 2;
4355         int rangeEnd = range.end * 2;
4356
4357         //R = 1.164(Y - 16) + 1.596(V - 128)
4358         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
4359         //B = 1.164(Y - 16)                  + 2.018(U - 128)
4360
4361         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
4362         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
4363         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
4364
4365         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
4366
4367 #ifdef HAVE_TEGRA_OPTIMIZATION
4368         if(tegra::cvtYUV4202RGB(bIdx, uIdx, 4, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
4369             return;
4370 #endif
4371
4372         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
4373         {
4374             uchar* row1 = dst->ptr<uchar>(j);
4375             uchar* row2 = dst->ptr<uchar>(j + 1);
4376             const uchar* y2 = y1 + stride;
4377
4378             for (int i = 0; i < width; i += 2, row1 += 8, row2 += 8)
4379             {
4380                 int u = int(uv[i + 0 + uIdx]) - 128;
4381                 int v = int(uv[i + 1 - uIdx]) - 128;
4382
4383                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4384                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4385                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4386
4387                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
4388                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4389                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4390                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4391                 row1[3]      = uchar(0xff);
4392
4393                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
4394                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4395                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4396                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4397                 row1[7]      = uchar(0xff);
4398
4399                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
4400                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
4401                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
4402                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
4403                 row2[3]      = uchar(0xff);
4404
4405                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
4406                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
4407                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
4408                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
4409                 row2[7]      = uchar(0xff);
4410             }
4411         }
4412     }
4413 };
4414
4415 template<int bIdx>
4416 struct YUV420p2RGB888Invoker : ParallelLoopBody
4417 {
4418     Mat* dst;
4419     const uchar* my1, *mu, *mv;
4420     int width, stride;
4421     int ustepIdx, vstepIdx;
4422
4423     YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
4424         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
4425
4426     void operator()(const Range& range) const
4427     {
4428         const int rangeBegin = range.start * 2;
4429         const int rangeEnd = range.end * 2;
4430
4431         int uvsteps[2] = {width/2, stride - width/2};
4432         int usIdx = ustepIdx, vsIdx = vstepIdx;
4433
4434         const uchar* y1 = my1 + rangeBegin * stride;
4435         const uchar* u1 = mu + (range.start / 2) * stride;
4436         const uchar* v1 = mv + (range.start / 2) * stride;
4437
4438         if(range.start % 2 == 1)
4439         {
4440             u1 += uvsteps[(usIdx++) & 1];
4441             v1 += uvsteps[(vsIdx++) & 1];
4442         }
4443
4444         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
4445         {
4446             uchar* row1 = dst->ptr<uchar>(j);
4447             uchar* row2 = dst->ptr<uchar>(j + 1);
4448             const uchar* y2 = y1 + stride;
4449
4450             for (int i = 0; i < width / 2; i += 1, row1 += 6, row2 += 6)
4451             {
4452                 int u = int(u1[i]) - 128;
4453                 int v = int(v1[i]) - 128;
4454
4455                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4456                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4457                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4458
4459                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
4460                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4461                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4462                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4463
4464                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
4465                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4466                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4467                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4468
4469                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
4470                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
4471                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
4472                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
4473
4474                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
4475                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
4476                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
4477                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
4478             }
4479         }
4480     }
4481 };
4482
4483 template<int bIdx>
4484 struct YUV420p2RGBA8888Invoker : ParallelLoopBody
4485 {
4486     Mat* dst;
4487     const uchar* my1, *mu, *mv;
4488     int width, stride;
4489     int ustepIdx, vstepIdx;
4490
4491     YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
4492         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
4493
4494     void operator()(const Range& range) const
4495     {
4496         int rangeBegin = range.start * 2;
4497         int rangeEnd = range.end * 2;
4498
4499         int uvsteps[2] = {width/2, stride - width/2};
4500         int usIdx = ustepIdx, vsIdx = vstepIdx;
4501
4502         const uchar* y1 = my1 + rangeBegin * stride;
4503         const uchar* u1 = mu + (range.start / 2) * stride;
4504         const uchar* v1 = mv + (range.start / 2) * stride;
4505
4506         if(range.start % 2 == 1)
4507         {
4508             u1 += uvsteps[(usIdx++) & 1];
4509             v1 += uvsteps[(vsIdx++) & 1];
4510         }
4511
4512         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
4513         {
4514             uchar* row1 = dst->ptr<uchar>(j);
4515             uchar* row2 = dst->ptr<uchar>(j + 1);
4516             const uchar* y2 = y1 + stride;
4517
4518             for (int i = 0; i < width / 2; i += 1, row1 += 8, row2 += 8)
4519             {
4520                 int u = int(u1[i]) - 128;
4521                 int v = int(v1[i]) - 128;
4522
4523                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4524                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4525                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4526
4527                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
4528                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4529                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4530                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4531                 row1[3]      = uchar(0xff);
4532
4533                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
4534                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4535                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4536                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4537                 row1[7]      = uchar(0xff);
4538
4539                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
4540                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
4541                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
4542                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
4543                 row2[3]      = uchar(0xff);
4544
4545                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
4546                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
4547                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
4548                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
4549                 row2[7]      = uchar(0xff);
4550             }
4551         }
4552     }
4553 };
4554
4555 #define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240)
4556
4557 template<int bIdx, int uIdx>
4558 inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
4559 {
4560     YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
4561     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
4562         parallel_for_(Range(0, _dst.rows/2), converter);
4563     else
4564         converter(Range(0, _dst.rows/2));
4565 }
4566
4567 template<int bIdx, int uIdx>
4568 inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
4569 {
4570     YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
4571     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
4572         parallel_for_(Range(0, _dst.rows/2), converter);
4573     else
4574         converter(Range(0, _dst.rows/2));
4575 }
4576
4577 template<int bIdx>
4578 inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
4579 {
4580     YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
4581     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
4582         parallel_for_(Range(0, _dst.rows/2), converter);
4583     else
4584         converter(Range(0, _dst.rows/2));
4585 }
4586
4587 template<int bIdx>
4588 inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
4589 {
4590     YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
4591     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
4592         parallel_for_(Range(0, _dst.rows/2), converter);
4593     else
4594         converter(Range(0, _dst.rows/2));
4595 }
4596
4597 ///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
4598
4599 template<int bIdx>
4600 struct RGB888toYUV420pInvoker: public ParallelLoopBody
4601 {
4602     RGB888toYUV420pInvoker( const Mat& src, Mat* dst, const int uIdx )
4603         : src_(src),
4604           dst_(dst),
4605           uIdx_(uIdx) { }
4606
4607     void operator()(const Range& rowRange) const
4608     {
4609         const int w = src_.cols;
4610         const int h = src_.rows;
4611
4612         const int cn = src_.channels();
4613         for( int i = rowRange.start; i < rowRange.end; i++ )
4614         {
4615             const uchar* row0 = src_.ptr<uchar>(2 * i);
4616             const uchar* row1 = src_.ptr<uchar>(2 * i + 1);
4617
4618             uchar* y = dst_->ptr<uchar>(2*i);
4619             uchar* u = dst_->ptr<uchar>(h + i/2) + (i % 2) * (w/2);
4620             uchar* v = dst_->ptr<uchar>(h + (i + h/2)/2) + ((i + h/2) % 2) * (w/2);
4621             if( uIdx_ == 2 ) std::swap(u, v);
4622
4623             for( int j = 0, k = 0; j < w * cn; j += 2 * cn, k++ )
4624             {
4625                 int r00 = row0[2-bIdx + j];      int g00 = row0[1 + j];      int b00 = row0[bIdx + j];
4626                 int r01 = row0[2-bIdx + cn + j]; int g01 = row0[1 + cn + j]; int b01 = row0[bIdx + cn + j];
4627                 int r10 = row1[2-bIdx + j];      int g10 = row1[1 + j];      int b10 = row1[bIdx + j];
4628                 int r11 = row1[2-bIdx + cn + j]; int g11 = row1[1 + cn + j]; int b11 = row1[bIdx + cn + j];
4629
4630                 const int shifted16 = (16 << ITUR_BT_601_SHIFT);
4631                 const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
4632                 int y00 = ITUR_BT_601_CRY * r00 + ITUR_BT_601_CGY * g00 + ITUR_BT_601_CBY * b00 + halfShift + shifted16;
4633                 int y01 = ITUR_BT_601_CRY * r01 + ITUR_BT_601_CGY * g01 + ITUR_BT_601_CBY * b01 + halfShift + shifted16;
4634                 int y10 = ITUR_BT_601_CRY * r10 + ITUR_BT_601_CGY * g10 + ITUR_BT_601_CBY * b10 + halfShift + shifted16;
4635                 int y11 = ITUR_BT_601_CRY * r11 + ITUR_BT_601_CGY * g11 + ITUR_BT_601_CBY * b11 + halfShift + shifted16;
4636
4637                 y[2*k + 0]            = saturate_cast<uchar>(y00 >> ITUR_BT_601_SHIFT);
4638                 y[2*k + 1]            = saturate_cast<uchar>(y01 >> ITUR_BT_601_SHIFT);
4639                 y[2*k + dst_->step + 0] = saturate_cast<uchar>(y10 >> ITUR_BT_601_SHIFT);
4640                 y[2*k + dst_->step + 1] = saturate_cast<uchar>(y11 >> ITUR_BT_601_SHIFT);
4641
4642                 const int shifted128 = (128 << ITUR_BT_601_SHIFT);
4643                 int u00 = ITUR_BT_601_CRU * r00 + ITUR_BT_601_CGU * g00 + ITUR_BT_601_CBU * b00 + halfShift + shifted128;
4644                 int v00 = ITUR_BT_601_CBU * r00 + ITUR_BT_601_CGV * g00 + ITUR_BT_601_CBV * b00 + halfShift + shifted128;
4645
4646                 u[k] = saturate_cast<uchar>(u00 >> ITUR_BT_601_SHIFT);
4647                 v[k] = saturate_cast<uchar>(v00 >> ITUR_BT_601_SHIFT);
4648             }
4649         }
4650     }
4651
4652     static bool isFit( const Mat& src )
4653     {
4654         return (src.total() >= 320*240);
4655     }
4656
4657 private:
4658     RGB888toYUV420pInvoker& operator=(const RGB888toYUV420pInvoker&);
4659
4660     const Mat& src_;
4661     Mat* const dst_;
4662     const int uIdx_;
4663 };
4664
4665 template<int bIdx, int uIdx>
4666 static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
4667 {
4668     RGB888toYUV420pInvoker<bIdx> colorConverter(src, &dst, uIdx);
4669     if( RGB888toYUV420pInvoker<bIdx>::isFit(src) )
4670         parallel_for_(Range(0, src.rows/2), colorConverter);
4671     else
4672         colorConverter(Range(0, src.rows/2));
4673 }
4674
4675 ///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
4676
4677 template<int bIdx, int uIdx, int yIdx>
4678 struct YUV422toRGB888Invoker : ParallelLoopBody
4679 {
4680     Mat* dst;
4681     const uchar* src;
4682     int width, stride;
4683
4684     YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
4685         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
4686
4687     void operator()(const Range& range) const
4688     {
4689         int rangeBegin = range.start;
4690         int rangeEnd = range.end;
4691
4692         const int uidx = 1 - yIdx + uIdx * 2;
4693         const int vidx = (2 + uidx) % 4;
4694         const uchar* yuv_src = src + rangeBegin * stride;
4695
4696         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
4697         {
4698             uchar* row = dst->ptr<uchar>(j);
4699
4700             for (int i = 0; i < 2 * width; i += 4, row += 6)
4701             {
4702                 int u = int(yuv_src[i + uidx]) - 128;
4703                 int v = int(yuv_src[i + vidx]) - 128;
4704
4705                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4706                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4707                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4708
4709                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
4710                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4711                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4712                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4713
4714                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
4715                 row[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4716                 row[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4717                 row[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4718             }
4719         }
4720     }
4721 };
4722
4723 template<int bIdx, int uIdx, int yIdx>
4724 struct YUV422toRGBA8888Invoker : ParallelLoopBody
4725 {
4726     Mat* dst;
4727     const uchar* src;
4728     int width, stride;
4729
4730     YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
4731         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
4732
4733     void operator()(const Range& range) const
4734     {
4735         int rangeBegin = range.start;
4736         int rangeEnd = range.end;
4737
4738         const int uidx = 1 - yIdx + uIdx * 2;
4739         const int vidx = (2 + uidx) % 4;
4740         const uchar* yuv_src = src + rangeBegin * stride;
4741
4742         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
4743         {
4744             uchar* row = dst->ptr<uchar>(j);
4745
4746             for (int i = 0; i < 2 * width; i += 4, row += 8)
4747             {
4748                 int u = int(yuv_src[i + uidx]) - 128;
4749                 int v = int(yuv_src[i + vidx]) - 128;
4750
4751                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
4752                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
4753                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
4754
4755                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
4756                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
4757                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
4758                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
4759                 row[3]      = uchar(0xff);
4760
4761                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
4762                 row[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
4763                 row[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
4764                 row[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
4765                 row[7]      = uchar(0xff);
4766             }
4767         }
4768     }
4769 };
4770
// Threshold on the destination element count (Mat::total()): the YUV422
// conversion wrappers below use parallel_for_ at or above this size and run
// the invoker directly on the calling thread for smaller images.
#define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240)
4772
4773 template<int bIdx, int uIdx, int yIdx>
4774 inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
4775 {
4776     YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
4777     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
4778         parallel_for_(Range(0, _dst.rows), converter);
4779     else
4780         converter(Range(0, _dst.rows));
4781 }
4782
4783 template<int bIdx, int uIdx, int yIdx>
4784 inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
4785 {
4786     YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
4787     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
4788         parallel_for_(Range(0, _dst.rows), converter);
4789     else
4790         converter(Range(0, _dst.rows));
4791 }
4792
4793 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
4794
4795 template<typename _Tp>
4796 struct RGBA2mRGBA
4797 {
4798     typedef _Tp channel_type;
4799
4800     void operator()(const _Tp* src, _Tp* dst, int n) const
4801     {
4802         _Tp max_val  = ColorChannel<_Tp>::max();
4803         _Tp half_val = ColorChannel<_Tp>::half();
4804         for( int i = 0; i < n; i++ )
4805         {
4806             _Tp v0 = *src++;
4807             _Tp v1 = *src++;
4808             _Tp v2 = *src++;
4809             _Tp v3 = *src++;
4810
4811             *dst++ = (v0 * v3 + half_val) / max_val;
4812             *dst++ = (v1 * v3 + half_val) / max_val;
4813             *dst++ = (v2 * v3 + half_val) / max_val;
4814             *dst++ = v3;
4815         }
4816     }
4817 };
4818
4819
4820 template<typename _Tp>
4821 struct mRGBA2RGBA
4822 {
4823     typedef _Tp channel_type;
4824
4825     void operator()(const _Tp* src, _Tp* dst, int n) const
4826     {
4827         _Tp max_val = ColorChannel<_Tp>::max();
4828         for( int i = 0; i < n; i++ )
4829         {
4830             _Tp v0 = *src++;
4831             _Tp v1 = *src++;
4832             _Tp v2 = *src++;
4833             _Tp v3 = *src++;
4834             _Tp v3_half = v3 / 2;
4835
4836             *dst++ = (v3==0)? 0 : (v0 * max_val + v3_half) / v3;
4837             *dst++ = (v3==0)? 0 : (v1 * max_val + v3_half) / v3;
4838             *dst++ = (v3==0)? 0 : (v2 * max_val + v3_half) / v3;
4839             *dst++ = v3;
4840         }
4841     }
4842 };
4843
4844 #ifdef HAVE_OPENCL
4845
4846 static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
4847 {
4848     bool ok = false;
4849     UMat src = _src.getUMat(), dst;
4850     Size sz = src.size(), dstSz = sz;
4851     int scn = src.channels(), depth = src.depth(), bidx, uidx, yidx;
4852     int dims = 2, stripeSize = 1;
4853     ocl::Kernel k;
4854
4855     if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
4856         return false;
4857
4858     ocl::Device dev = ocl::Device::getDefault();
4859     int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
4860     int pxPerWIx = 1;
4861
4862     size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy };
4863     cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
4864                              depth, scn, pxPerWIy);
4865
4866     switch (code)
4867     {
4868     case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
4869     case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
4870     {
4871         CV_Assert(scn == 3 || scn == 4);
4872         dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
4873         bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
4874         k.create("RGB", ocl::imgproc::cvtcolor_oclsrc,
4875                  opts + format("-D dcn=%d -D bidx=0 -D %s", dcn,
4876                         reverse ? "REVERSE" : "ORDER"));
4877         break;
4878     }
4879     case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
4880     case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
4881     {
4882         dcn = code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA || code == COLOR_BGR5652RGBA || code == COLOR_BGR5552RGBA ? 4 : 3;
4883         CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);
4884         bidx = code == COLOR_BGR5652BGR || code == COLOR_BGR5552BGR ||
4885             code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA ? 0 : 2;
4886         int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
4887             code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
4888         k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc,
4889                  opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits));
4890         break;
4891     }
4892     case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
4893     case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
4894     {
4895         CV_Assert((scn == 3 || scn == 4) && depth == CV_8U );
4896         bidx = code == COLOR_BGR2BGR565 || code == COLOR_BGR2BGR555 ||
4897             code == COLOR_BGRA2BGR565 || code == COLOR_BGRA2BGR555 ? 0 : 2;
4898         int greenbits = code == COLOR_BGR2BGR565 || code == COLOR_RGB2BGR565 ||
4899             code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
4900         dcn = 2;
4901         k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc,
4902                  opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits));
4903         break;
4904     }
4905     case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
4906     {
4907         CV_Assert(scn == 2 && depth == CV_8U);
4908         dcn = 1;
4909         int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
4910         k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc,
4911                  opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits));
4912         break;
4913     }
4914     case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
4915     {
4916         CV_Assert(scn == 1 && depth == CV_8U);
4917         dcn = 2;
4918         int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
4919         k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc,
4920                  opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits));
4921         break;
4922     }
4923     case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
4924     case COLOR_RGB2GRAY: case COLOR_RGBA2GRAY:
4925     {
4926         CV_Assert(scn == 3 || scn == 4);
4927         bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
4928         dcn = 1;
4929         k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
4930                  opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
4931                                bidx, stripeSize));
4932         globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
4933         break;
4934     }
4935     case COLOR_GRAY2BGR:
4936     case COLOR_GRAY2BGRA:
4937     {
4938         CV_Assert(scn == 1);
4939         dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
4940         k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
4941                  opts + format("-D bidx=0 -D dcn=%d", dcn));
4942         break;
4943     }
4944     case COLOR_BGR2YUV:
4945     case COLOR_RGB2YUV:
4946     {
4947         CV_Assert(scn == 3 || scn == 4);
4948         bidx = code == COLOR_RGB2YUV ? 0 : 2;
4949         dcn = 3;
4950         k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
4951                  opts + format("-D dcn=3 -D bidx=%d", bidx));
4952         break;
4953     }
4954     case COLOR_YUV2BGR:
4955     case COLOR_YUV2RGB:
4956     {
4957         if(dcn < 0) dcn = 3;
4958         CV_Assert(dcn == 3 || dcn == 4);
4959         bidx = code == COLOR_YUV2RGB ? 0 : 2;
4960         k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
4961                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
4962         break;
4963     }
4964     case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV21:
4965     case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV21:
4966     {
4967         CV_Assert( scn == 1 );
4968         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
4969         dcn  = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ||
4970                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2RGBA_NV21 ? 4 : 3;
4971         bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ||
4972                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 0 : 2;
4973         uidx = code == COLOR_YUV2RGBA_NV21 || code == COLOR_YUV2RGB_NV21 ||
4974                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 1 : 0;
4975
4976         dstSz = Size(sz.width, sz.height * 2 / 3);
4977         globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
4978         k.create("YUV2RGB_NVx", ocl::imgproc::cvtcolor_oclsrc,
4979                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx));
4980         break;
4981     }
4982     case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
4983     case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
4984     {
4985         CV_Assert( scn == 1 );
4986         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
4987         dcn  = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2RGBA_YV12 ||
4988                code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2RGBA_IYUV ? 4 : 3;
4989         bidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
4990                code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2BGR_IYUV ? 0 : 2;
4991         uidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
4992                code == COLOR_YUV2RGBA_YV12 || code == COLOR_YUV2RGB_YV12 ? 1 : 0;
4993
4994         dstSz = Size(sz.width, sz.height * 2 / 3);
4995         globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
4996         k.create("YUV2RGB_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
4997                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
4998                  src.isContinuous() ? " -D SRC_CONT" : ""));
4999         break;
5000     }
5001     case COLOR_YUV2GRAY_420:
5002     {
5003         if (dcn <= 0) dcn = 1;
5004
5005         CV_Assert( dcn == 1 );
5006         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
5007
5008         dstSz = Size(sz.width, sz.height * 2 / 3);
5009         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5010         dst = _dst.getUMat();
5011
5012         src.rowRange(0, dstSz.height).copyTo(dst);
5013         return true;
5014     }
5015     case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
5016     case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
5017     {
5018         if (dcn <= 0) dcn = 1;
5019         bidx = code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ||
5020                code == COLOR_BGRA2YUV_IYUV || code == COLOR_BGR2YUV_IYUV ? 0 : 2;
5021         uidx = code == COLOR_RGBA2YUV_YV12 || code == COLOR_RGB2YUV_YV12 ||
5022                code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ? 1 : 0;
5023
5024         CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
5025         CV_Assert( dcn == 1 );
5026         CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
5027
5028         dstSz = Size(sz.width, sz.height / 2 * 3);
5029         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5030         dst = _dst.getUMat();
5031
5032         if (dev.isIntel() && src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
5033             dst.step % 4 == 0 && dst.offset % 4 == 0)
5034         {
5035             pxPerWIx = 2;
5036         }
5037         globalsize[0] = dstSz.width / (2 * pxPerWIx); globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy;
5038
5039         k.create("RGB2YUV_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
5040                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D PIX_PER_WI_X=%d", dcn, bidx, uidx, pxPerWIx));
5041         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
5042         return k.run(2, globalsize, NULL, false);
5043     }
5044     case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
5045     case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
5046     case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
5047     {
5048         if (dcn <= 0)
5049             dcn = (code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2RGBA_YUY2 ||
5050                    code==COLOR_YUV2BGRA_YUY2 || code==COLOR_YUV2RGBA_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 4 : 3;
5051
5052         bidx = (code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2BGRA_YUY2 ||
5053                 code==COLOR_YUV2BGR_YUY2 || code==COLOR_YUV2BGRA_YVYU || code==COLOR_YUV2BGR_YVYU) ? 0 : 2;
5054         yidx = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
5055         uidx = (code==COLOR_YUV2RGB_YVYU || code==COLOR_YUV2RGBA_YVYU ||
5056                 code==COLOR_YUV2BGR_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 2 : 0;
5057         uidx = 1 - yidx + uidx;
5058
5059         CV_Assert( dcn == 3 || dcn == 4 );
5060         CV_Assert( scn == 2 && depth == CV_8U );
5061
5062         k.create("YUV2RGB_422", ocl::imgproc::cvtcolor_oclsrc,
5063                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx,
5064                                 src.offset % 4 == 0 && src.step % 4 == 0 ? " -D USE_OPTIMIZED_LOAD" : ""));
5065         break;
5066     }
5067     case COLOR_BGR2YCrCb:
5068     case COLOR_RGB2YCrCb:
5069     {
5070         CV_Assert(scn == 3 || scn == 4);
5071         bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
5072         dcn = 3;
5073         k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
5074                  opts + format("-D dcn=3 -D bidx=%d", bidx));
5075         break;
5076     }
5077     case COLOR_YCrCb2BGR:
5078     case COLOR_YCrCb2RGB:
5079     {
5080         if( dcn <= 0 )
5081             dcn = 3;
5082         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
5083         bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
5084         k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc,
5085                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
5086         break;
5087     }
5088     case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
5089     {
5090         CV_Assert(scn == 3 || scn == 4);
5091         bidx = code == COLOR_BGR2XYZ ? 0 : 2;
5092
5093         UMat c;
5094         if (depth == CV_32F)
5095         {
5096             float coeffs[] =
5097             {
5098                 0.412453f, 0.357580f, 0.180423f,
5099                 0.212671f, 0.715160f, 0.072169f,
5100                 0.019334f, 0.119193f, 0.950227f
5101             };
5102             if (bidx == 0)
5103             {
5104                 std::swap(coeffs[0], coeffs[2]);
5105                 std::swap(coeffs[3], coeffs[5]);
5106                 std::swap(coeffs[6], coeffs[8]);
5107             }
5108             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
5109         }
5110         else
5111         {
5112             int coeffs[] =
5113             {
5114                 1689,    1465,    739,
5115                 871,     2929,    296,
5116                 79,      488,     3892
5117             };
5118             if (bidx == 0)
5119             {
5120                 std::swap(coeffs[0], coeffs[2]);
5121                 std::swap(coeffs[3], coeffs[5]);
5122                 std::swap(coeffs[6], coeffs[8]);
5123             }
5124             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
5125         }
5126
5127         _dst.create(dstSz, CV_MAKETYPE(depth, 3));
5128         dst = _dst.getUMat();
5129
5130         k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
5131                  opts + format("-D dcn=3 -D bidx=%d", bidx));
5132         if (k.empty())
5133             return false;
5134         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
5135         return k.run(2, globalsize, 0, false);
5136     }
5137     case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
5138     {
5139         if (dcn <= 0)
5140             dcn = 3;
5141         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
5142         bidx = code == COLOR_XYZ2BGR ? 0 : 2;
5143
5144         UMat c;
5145         if (depth == CV_32F)
5146         {
5147             float coeffs[] =
5148             {
5149                 3.240479f, -1.53715f, -0.498535f,
5150                 -0.969256f, 1.875991f, 0.041556f,
5151                 0.055648f, -0.204043f, 1.057311f
5152             };
5153             if (bidx == 0)
5154             {
5155                 std::swap(coeffs[0], coeffs[6]);
5156                 std::swap(coeffs[1], coeffs[7]);
5157                 std::swap(coeffs[2], coeffs[8]);
5158             }
5159             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
5160         }
5161         else
5162         {
5163             int coeffs[] =
5164             {
5165                 13273,  -6296,  -2042,
5166                 -3970,   7684,    170,
5167                   228,   -836,   4331
5168             };
5169             if (bidx == 0)
5170             {
5171                 std::swap(coeffs[0], coeffs[6]);
5172                 std::swap(coeffs[1], coeffs[7]);
5173                 std::swap(coeffs[2], coeffs[8]);
5174             }
5175             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
5176         }
5177
5178         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5179         dst = _dst.getUMat();
5180
5181         k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
5182                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
5183         if (k.empty())
5184             return false;
5185         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
5186         return k.run(2, globalsize, 0, false);
5187     }
5188     case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
5189     case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
5190     {
5191         CV_Assert((scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F));
5192         bidx = code == COLOR_BGR2HSV || code == COLOR_BGR2HLS ||
5193             code == COLOR_BGR2HSV_FULL || code == COLOR_BGR2HLS_FULL ? 0 : 2;
5194         int hrange = depth == CV_32F ? 360 : code == COLOR_BGR2HSV || code == COLOR_RGB2HSV ||
5195             code == COLOR_BGR2HLS || code == COLOR_RGB2HLS ? 180 : 256;
5196         bool is_hsv = code == COLOR_BGR2HSV || code == COLOR_RGB2HSV || code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
5197         String kernelName = String("RGB2") + (is_hsv ? "HSV" : "HLS");
5198         dcn = 3;
5199
5200         if (is_hsv && depth == CV_8U)
5201         {
5202             static UMat sdiv_data;
5203             static UMat hdiv_data180;
5204             static UMat hdiv_data256;
5205             static int sdiv_table[256];
5206             static int hdiv_table180[256];
5207             static int hdiv_table256[256];
5208             static volatile bool initialized180 = false, initialized256 = false;
5209             volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;
5210
5211             if (!initialized)
5212             {
5213                 int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
5214                 UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;
5215
5216                 sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
5217
5218                 int v = 255 << hsv_shift;
5219                 if (!initialized180 && !initialized256)
5220                 {
5221                     for(int i = 1; i < 256; i++ )
5222                         sdiv_table[i] = saturate_cast<int>(v/(1.*i));
5223                     Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data);
5224                 }
5225
5226                 v = hrange << hsv_shift;
5227                 for (int i = 1; i < 256; i++ )
5228                     hdiv_table[i] = saturate_cast<int>(v/(6.*i));
5229
5230                 Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data);
5231                 initialized = true;
5232             }
5233
5234             _dst.create(dstSz, CV_8UC3);
5235             dst = _dst.getUMat();
5236
5237             k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc,
5238                      opts + format("-D hrange=%d -D bidx=%d -D dcn=3",
5239                                    hrange, bidx));
5240             if (k.empty())
5241                 return false;
5242
5243             k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
5244                    ocl::KernelArg::PtrReadOnly(sdiv_data), hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
5245                                                                        ocl::KernelArg::PtrReadOnly(hdiv_data180));
5246
5247             return k.run(2, globalsize, NULL, false);
5248         }
5249         else
5250             k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
5251                      opts + format("-D hscale=%ff -D bidx=%d -D dcn=3",
5252                                    hrange*(1.f/360.f), bidx));
5253         break;
5254     }
5255     case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
5256     case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
5257     {
5258         if (dcn <= 0)
5259             dcn = 3;
5260         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
5261         bidx = code == COLOR_HSV2BGR || code == COLOR_HLS2BGR ||
5262             code == COLOR_HSV2BGR_FULL || code == COLOR_HLS2BGR_FULL ? 0 : 2;
5263         int hrange = depth == CV_32F ? 360 : code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
5264             code == COLOR_HLS2BGR || code == COLOR_HLS2RGB ? 180 : 255;
5265         bool is_hsv = code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
5266                 code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL;
5267
5268         String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
5269         k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
5270                  opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
5271                                dcn, bidx, hrange, 6.f/hrange));
5272         break;
5273     }
5274     case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
5275     {
5276         CV_Assert(scn == 4 && depth == CV_8U);
5277         dcn = 4;
5278
5279         k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc,
5280                  opts + "-D dcn=4 -D bidx=3");
5281         break;
5282     }
5283     case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
5284     case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
5285     {
5286         CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
5287
5288         bidx = code == CV_BGR2Lab || code == CV_LBGR2Lab || code == CV_BGR2Luv || code == CV_LBGR2Luv ? 0 : 2;
5289         bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_RGB2Luv || code == CV_BGR2Luv;
5290         bool lab = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_LBGR2Lab || code == CV_LRGB2Lab;
5291         float un, vn;
5292         dcn = 3;
5293
5294         k.create(format("BGR2%s", lab ? "Lab" : "Luv").c_str(),
5295                  ocl::imgproc::cvtcolor_oclsrc,
5296                  opts + format("-D dcn=%d -D bidx=%d%s",
5297                                dcn, bidx, srgb ? " -D SRGB" : ""));
5298         if (k.empty())
5299             return false;
5300
5301         initLabTabs();
5302
5303         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5304         dst = _dst.getUMat();
5305
5306         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
5307                 dstarg = ocl::KernelArg::WriteOnly(dst);
5308
5309         if (depth == CV_8U && lab)
5310         {
5311             static UMat usRGBGammaTab, ulinearGammaTab, uLabCbrtTab, ucoeffs;
5312
5313             if (srgb && usRGBGammaTab.empty())
5314                 Mat(1, 256, CV_16UC1, sRGBGammaTab_b).copyTo(usRGBGammaTab);
5315             else if (ulinearGammaTab.empty())
5316                 Mat(1, 256, CV_16UC1, linearGammaTab_b).copyTo(ulinearGammaTab);
5317             if (uLabCbrtTab.empty())
5318                 Mat(1, LAB_CBRT_TAB_SIZE_B, CV_16UC1, LabCbrtTab_b).copyTo(uLabCbrtTab);
5319
5320             {
5321                 int coeffs[9];
5322                 const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
5323                 const float scale[] =
5324                 {
5325                     (1 << lab_shift)/_whitept[0],
5326                     (float)(1 << lab_shift),
5327                     (1 << lab_shift)/_whitept[2]
5328                 };
5329
5330                 for (int i = 0; i < 3; i++ )
5331                 {
5332                     coeffs[i*3+(bidx^2)] = cvRound(_coeffs[i*3]*scale[i]);
5333                     coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
5334                     coeffs[i*3+bidx] = cvRound(_coeffs[i*3+2]*scale[i]);
5335
5336                     CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
5337                               coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
5338                 }
5339                 Mat(1, 9, CV_32SC1, coeffs).copyTo(ucoeffs);
5340             }
5341
5342             const int Lscale = (116*255+50)/100;
5343             const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
5344
5345             k.args(srcarg, dstarg,
5346                    ocl::KernelArg::PtrReadOnly(srgb ? usRGBGammaTab : ulinearGammaTab),
5347                    ocl::KernelArg::PtrReadOnly(uLabCbrtTab), ocl::KernelArg::PtrReadOnly(ucoeffs),
5348                    Lscale, Lshift);
5349         }
5350         else
5351         {
5352             static UMat usRGBGammaTab, ucoeffs, uLabCbrtTab;
5353
5354             if (srgb && usRGBGammaTab.empty())
5355                 Mat(1, GAMMA_TAB_SIZE * 4, CV_32FC1, sRGBGammaTab).copyTo(usRGBGammaTab);
5356             if (!lab && uLabCbrtTab.empty())
5357                 Mat(1, LAB_CBRT_TAB_SIZE * 4, CV_32FC1, LabCbrtTab).copyTo(uLabCbrtTab);
5358
5359             {
5360                 float coeffs[9];
5361                 const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
5362                 float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
5363
5364                 for (int i = 0; i < 3; i++)
5365                 {
5366                     int j = i * 3;
5367                     coeffs[j + (bidx ^ 2)] = _coeffs[j] * (lab ? scale[i] : 1);
5368                     coeffs[j + 1] = _coeffs[j + 1] * (lab ? scale[i] : 1);
5369                     coeffs[j + bidx] = _coeffs[j + 2] * (lab ? scale[i] : 1);
5370
5371                     CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
5372                                coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*(lab ? LabCbrtTabScale : 1) );
5373                 }
5374
5375                 float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
5376                 un = 13*4*_whitept[0]*d;
5377                 vn = 13*9*_whitept[1]*d;
5378
5379                 Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
5380             }
5381
5382             float _1_3 = 1.0f / 3.0f, _a = 16.0f / 116.0f;
5383             ocl::KernelArg ucoeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
5384
5385             if (lab)
5386             {
5387                 if (srgb)
5388                     k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
5389                            ucoeffsarg, _1_3, _a);
5390                 else
5391                     k.args(srcarg, dstarg, ucoeffsarg, _1_3, _a);
5392             }
5393             else
5394             {
5395                 ocl::KernelArg LabCbrtTabarg = ocl::KernelArg::PtrReadOnly(uLabCbrtTab);
5396                 if (srgb)
5397                     k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
5398                            LabCbrtTabarg, ucoeffsarg, un, vn);
5399                 else
5400                     k.args(srcarg, dstarg, LabCbrtTabarg, ucoeffsarg, un, vn);
5401             }
5402         }
5403
5404         return k.run(dims, globalsize, NULL, false);
5405     }
5406     case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
5407     case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
5408     {
5409         if( dcn <= 0 )
5410             dcn = 3;
5411         CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
5412
5413         bidx = code == CV_Lab2BGR || code == CV_Lab2LBGR || code == CV_Luv2BGR || code == CV_Luv2LBGR ? 0 : 2;
5414         bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Luv2BGR || code == CV_Luv2RGB;
5415         bool lab = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Lab2LBGR || code == CV_Lab2LRGB;
5416         float un, vn;
5417
5418         k.create(format("%s2BGR", lab ? "Lab" : "Luv").c_str(),
5419                  ocl::imgproc::cvtcolor_oclsrc,
5420                  opts + format("-D dcn=%d -D bidx=%d%s",
5421                                dcn, bidx, srgb ? " -D SRGB" : ""));
5422         if (k.empty())
5423             return false;
5424
5425         initLabTabs();
5426         static UMat ucoeffs, usRGBInvGammaTab;
5427
5428         if (srgb && usRGBInvGammaTab.empty())
5429             Mat(1, GAMMA_TAB_SIZE*4, CV_32FC1, sRGBInvGammaTab).copyTo(usRGBInvGammaTab);
5430
5431         {
5432             float coeffs[9];
5433             const float * const _coeffs = XYZ2sRGB_D65, * const _whitept = D65;
5434
5435             for( int i = 0; i < 3; i++ )
5436             {
5437                 coeffs[i+(bidx^2)*3] = _coeffs[i] * (lab ? _whitept[i] : 1);
5438                 coeffs[i+3] = _coeffs[i+3] * (lab ? _whitept[i] : 1);
5439                 coeffs[i+bidx*3] = _coeffs[i+6] * (lab ? _whitept[i] : 1);
5440             }
5441
5442             float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
5443             un = 4*_whitept[0]*d;
5444             vn = 9*_whitept[1]*d;
5445
5446             Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
5447         }
5448
5449         _dst.create(sz, CV_MAKETYPE(depth, dcn));
5450         dst = _dst.getUMat();
5451
5452         float lThresh = 0.008856f * 903.3f;
5453         float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
5454
5455         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
5456                 dstarg = ocl::KernelArg::WriteOnly(dst),
5457                 coeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
5458
5459         if (lab)
5460         {
5461             if (srgb)
5462                 k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
5463                        coeffsarg, lThresh, fThresh);
5464             else
5465                 k.args(srcarg, dstarg, coeffsarg, lThresh, fThresh);
5466         }
5467         else
5468         {
5469             if (srgb)
5470                 k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
5471                        coeffsarg, un, vn);
5472             else
5473                 k.args(srcarg, dstarg, coeffsarg, un, vn);
5474         }
5475
5476         return k.run(dims, globalsize, NULL, false);
5477     }
5478     default:
5479         break;
5480     }
5481
5482     if( !k.empty() )
5483     {
5484         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
5485         dst = _dst.getUMat();
5486         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
5487         ok = k.run(dims, globalsize, NULL, false);
5488     }
5489     return ok;
5490 }
5491
5492 #endif
5493
5494 }//namespace cv
5495
5496 //////////////////////////////////////////////////////////////////////////////////////////
5497 //                                   The main function                                  //
5498 //////////////////////////////////////////////////////////////////////////////////////////
5499
5500 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
5501 {
5502     int stype = _src.type();
5503     int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
5504
5505     CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat() && !(depth == CV_8U && (code == CV_Luv2BGR || code == CV_Luv2RGB)),
5506                 ocl_cvtColor(_src, _dst, code, dcn) )
5507
5508     Mat src = _src.getMat(), dst;
5509     Size sz = src.size();
5510
5511     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );
5512
5513     switch( code )
5514     {
5515         case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
5516         case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
5517             CV_Assert( scn == 3 || scn == 4 );
5518             dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;
5519             bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;
5520
5521             _dst.create( sz, CV_MAKETYPE(depth, dcn));
5522             dst = _dst.getMat();
5523
5524 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
5525             CV_IPP_CHECK()
5526             {
5527                 if( code == CV_BGR2BGRA)
5528                 {
5529                     if ( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
5530                     {
5531                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5532                         return;
5533                     }
5534                     setIppErrorStatus();
5535                 }
5536                 else if( code == CV_BGRA2BGR )
5537                 {
5538                     if ( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
5539                     {
5540                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5541                         return;
5542                     }
5543                     setIppErrorStatus();
5544                 }
5545                 else if( code == CV_BGR2RGBA )
5546                 {
5547                     if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
5548                     {
5549                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5550                         return;
5551                     }
5552                     setIppErrorStatus();
5553                 }
5554                 else if( code == CV_RGBA2BGR )
5555                 {
5556                     if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
5557                     {
5558                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5559                         return;
5560                     }
5561                     setIppErrorStatus();
5562                 }
5563                 else if( code == CV_RGB2BGR )
5564                 {
5565                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
5566                     {
5567                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5568                         return;
5569                     }
5570                     setIppErrorStatus();
5571                 }
5572 #if IPP_VERSION_X100 >= 801
5573                 else if( code == CV_RGBA2BGRA )
5574                 {
5575                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
5576                     {
5577                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5578                         return;
5579                     }
5580                     setIppErrorStatus();
5581                 }
5582 #endif
5583             }
5584 #endif
5585
5586             if( depth == CV_8U )
5587             {
5588 #ifdef HAVE_TEGRA_OPTIMIZATION
5589                 if(!tegra::cvtBGR2RGB(src, dst, bidx))
5590 #endif
5591                     CvtColorLoop(src, dst, RGB2RGB<uchar>(scn, dcn, bidx));
5592             }
5593             else if( depth == CV_16U )
5594                 CvtColorLoop(src, dst, RGB2RGB<ushort>(scn, dcn, bidx));
5595             else
5596                 CvtColorLoop(src, dst, RGB2RGB<float>(scn, dcn, bidx));
5597             break;
5598
5599         case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
5600         case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
5601             CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
5602             _dst.create(sz, CV_8UC2);
5603             dst = _dst.getMat();
5604
5605 #if defined(HAVE_IPP) && 0 // breaks OCL accuracy tests
5606             CV_IPP_CHECK()
5607             {
5608                 CV_SUPPRESS_DEPRECATED_START
5609
5610                 if (code == CV_BGR2BGR565 && scn == 3)
5611                 {
5612                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R)))
5613                     {
5614                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5615                         return;
5616                     }
5617                     setIppErrorStatus();
5618                 }
5619                 else if (code == CV_BGRA2BGR565 && scn == 4)
5620                 {
5621                     if (CvtColorIPPLoopCopy(src, dst,
5622                                             IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
5623                                             (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 0, 1, 2, depth)))
5624                     {
5625                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5626                         return;
5627                     }
5628                     setIppErrorStatus();
5629                 }
5630                 else if (code == CV_RGB2BGR565 && scn == 3)
5631                 {
5632                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
5633                                                                                (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
5634                     {
5635                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5636                         return;
5637                     }
5638                     setIppErrorStatus();
5639                 }
5640                 else if (code == CV_RGBA2BGR565 && scn == 4)
5641                 {
5642                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
5643                                                                                (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
5644                     {
5645                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5646                         return;
5647                     }
5648                     setIppErrorStatus();
5649                 }
5650                 CV_SUPPRESS_DEPRECATED_END
5651             }
5652 #endif
5653
5654 #ifdef HAVE_TEGRA_OPTIMIZATION
5655             if(code == CV_BGR2BGR565 || code == CV_BGRA2BGR565 || code == CV_RGB2BGR565  || code == CV_RGBA2BGR565)
5656                 if(tegra::cvtRGB2RGB565(src, dst, code == CV_RGB2BGR565 || code == CV_RGBA2BGR565 ? 0 : 2))
5657                     break;
5658 #endif
5659
5660             CvtColorLoop(src, dst, RGB2RGB5x5(scn,
5661                       code == CV_BGR2BGR565 || code == CV_BGR2BGR555 ||
5662                       code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2,
5663                       code == CV_BGR2BGR565 || code == CV_RGB2BGR565 ||
5664                       code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5 // green bits
5665                                               ));
5666             break;
5667
5668         case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
5669         case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
5670             if(dcn <= 0) dcn = (code==CV_BGR5652BGRA || code==CV_BGR5552BGRA || code==CV_BGR5652RGBA || code==CV_BGR5552RGBA) ? 4 : 3;
5671             CV_Assert( (dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U );
5672             _dst.create(sz, CV_MAKETYPE(depth, dcn));
5673             dst = _dst.getMat();
5674
5675 #ifdef HAVE_IPP
5676             CV_IPP_CHECK()
5677             {
5678                 CV_SUPPRESS_DEPRECATED_START
5679                 if (code == CV_BGR5652BGR && dcn == 3)
5680                 {
5681                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R)))
5682                     {
5683                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5684                         return;
5685                     }
5686                     setIppErrorStatus();
5687                 }
5688                 else if (code == CV_BGR5652RGB && dcn == 3)
5689                 {
5690                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
5691                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
5692                     {
5693                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5694                         return;
5695                     }
5696                     setIppErrorStatus();
5697                 }
5698                 else if (code == CV_BGR5652BGRA && dcn == 4)
5699                 {
5700                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
5701                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
5702                     {
5703                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5704                         return;
5705                     }
5706                     setIppErrorStatus();
5707                 }
5708                 else if (code == CV_BGR5652RGBA && dcn == 4)
5709                 {
5710                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
5711                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
5712                     {
5713                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5714                         return;
5715                     }
5716                     setIppErrorStatus();
5717                 }
5718                 CV_SUPPRESS_DEPRECATED_END
5719             }
5720 #endif
5721
5722             CvtColorLoop(src, dst, RGB5x52RGB(dcn,
5723                       code == CV_BGR5652BGR || code == CV_BGR5552BGR ||
5724                       code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2, // blue idx
5725                       code == CV_BGR5652BGR || code == CV_BGR5652RGB ||
5726                       code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5 // green bits
5727                       ));
5728             break;
5729
5730         case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
5731             CV_Assert( scn == 3 || scn == 4 );
5732             _dst.create(sz, CV_MAKETYPE(depth, 1));
5733             dst = _dst.getMat();
5734
5735 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
5736             CV_IPP_CHECK()
5737             {
5738                 if( code == CV_BGR2GRAY && depth == CV_32F )
5739                 {
5740                     if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) )
5741                     {
5742                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5743                         return;
5744                     }
5745                     setIppErrorStatus();
5746                 }
5747                 else if( code == CV_RGB2GRAY && depth == CV_32F )
5748                 {
5749                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) )
5750                     {
5751                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5752                         return;
5753                     }
5754                     setIppErrorStatus();
5755                 }
5756                 else if( code == CV_BGRA2GRAY && depth == CV_32F )
5757                 {
5758                     if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) )
5759                     {
5760                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5761                         return;
5762                     }
5763                     setIppErrorStatus();
5764                 }
5765                 else if( code == CV_RGBA2GRAY && depth == CV_32F )
5766                 {
5767                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) )
5768                     {
5769                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5770                         return;
5771                     }
5772                     setIppErrorStatus();
5773                 }
5774             }
5775 #endif
5776
5777             bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
5778
5779             if( depth == CV_8U )
5780             {
5781 #ifdef HAVE_TEGRA_OPTIMIZATION
5782                 if(!tegra::cvtRGB2Gray(src, dst, bidx))
5783 #endif
5784                 CvtColorLoop(src, dst, RGB2Gray<uchar>(scn, bidx, 0));
5785             }
5786             else if( depth == CV_16U )
5787                 CvtColorLoop(src, dst, RGB2Gray<ushort>(scn, bidx, 0));
5788             else
5789                 CvtColorLoop(src, dst, RGB2Gray<float>(scn, bidx, 0));
5790             break;
5791
5792         case CV_BGR5652GRAY: case CV_BGR5552GRAY:
5793             CV_Assert( scn == 2 && depth == CV_8U );
5794             _dst.create(sz, CV_8UC1);
5795             dst = _dst.getMat();
5796
5797             CvtColorLoop(src, dst, RGB5x52Gray(code == CV_BGR5652GRAY ? 6 : 5));
5798             break;
5799
5800         case CV_GRAY2BGR: case CV_GRAY2BGRA:
5801             if( dcn <= 0 ) dcn = (code==CV_GRAY2BGRA) ? 4 : 3;
5802             CV_Assert( scn == 1 && (dcn == 3 || dcn == 4));
5803             _dst.create(sz, CV_MAKETYPE(depth, dcn));
5804             dst = _dst.getMat();
5805
5806 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
5807             CV_IPP_CHECK()
5808             {
5809                 if( code == CV_GRAY2BGR )
5810                 {
5811                     if( CvtColorIPPLoop(src, dst, IPPGray2BGRFunctor(ippiCopyP3C3RTab[depth])) )
5812                     {
5813                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5814                         return;
5815                     }
5816                     setIppErrorStatus();
5817                 }
5818                 else if( code == CV_GRAY2BGRA )
5819                 {
5820                     if( CvtColorIPPLoop(src, dst, IPPGray2BGRAFunctor(ippiCopyP3C3RTab[depth], ippiSwapChannelsC3C4RTab[depth], depth)) )
5821                     {
5822                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5823                         return;
5824                     }
5825                     setIppErrorStatus();
5826                 }
5827             }
5828 #endif
5829
5830
5831             if( depth == CV_8U )
5832             {
5833 #ifdef HAVE_TEGRA_OPTIMIZATION
5834                 if(!tegra::cvtGray2RGB(src, dst))
5835 #endif
5836                 CvtColorLoop(src, dst, Gray2RGB<uchar>(dcn));
5837             }
5838             else if( depth == CV_16U )
5839                 CvtColorLoop(src, dst, Gray2RGB<ushort>(dcn));
5840             else
5841                 CvtColorLoop(src, dst, Gray2RGB<float>(dcn));
5842             break;
5843
5844         case CV_GRAY2BGR565: case CV_GRAY2BGR555:
5845             CV_Assert( scn == 1 && depth == CV_8U );
5846             _dst.create(sz, CV_8UC2);
5847             dst = _dst.getMat();
5848
5849             CvtColorLoop(src, dst, Gray2RGB5x5(code == CV_GRAY2BGR565 ? 6 : 5));
5850             break;
5851
5852         case CV_BGR2YCrCb: case CV_RGB2YCrCb:
5853         case CV_BGR2YUV: case CV_RGB2YUV:
5854             {
5855             CV_Assert( scn == 3 || scn == 4 );
5856             bidx = code == CV_BGR2YCrCb || code == CV_BGR2YUV ? 0 : 2;
5857             static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
5858             static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
5859             const float* coeffs_f = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_f;
5860             const int* coeffs_i = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_i;
5861
5862             _dst.create(sz, CV_MAKETYPE(depth, 3));
5863             dst = _dst.getMat();
5864
5865 #if defined HAVE_IPP && 0
5866             CV_IPP_CHECK()
5867             {
5868                 if (code == CV_RGB2YUV && scn == 3 && depth == CV_8U)
5869                 {
5870                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R)))
5871                     {
5872                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5873                         return;
5874                     }
5875                     setIppErrorStatus();
5876                 }
5877                 else if (code == CV_BGR2YUV && scn == 3 && depth == CV_8U)
5878                 {
5879                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
5880                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
5881                     {
5882                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5883                         return;
5884                     }
5885                     setIppErrorStatus();
5886                 }
5887                 else if (code == CV_RGB2YUV && scn == 4 && depth == CV_8U)
5888                 {
5889                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
5890                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth)))
5891                     {
5892                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5893                         return;
5894                     }
5895                     setIppErrorStatus();
5896                 }
5897                 else if (code == CV_BGR2YUV && scn == 4 && depth == CV_8U)
5898                 {
5899                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
5900                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
5901                     {
5902                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5903                         return;
5904                     }
5905                     setIppErrorStatus();
5906                 }
5907             }
5908 #endif
5909
5910             if( depth == CV_8U )
5911             {
5912 #ifdef HAVE_TEGRA_OPTIMIZATION
5913                 if((code == CV_RGB2YCrCb || code == CV_BGR2YCrCb) && tegra::cvtRGB2YCrCb(src, dst, bidx))
5914                     break;
5915 #endif
5916                 CvtColorLoop(src, dst, RGB2YCrCb_i<uchar>(scn, bidx, coeffs_i));
5917             }
5918             else if( depth == CV_16U )
5919                 CvtColorLoop(src, dst, RGB2YCrCb_i<ushort>(scn, bidx, coeffs_i));
5920             else
5921                 CvtColorLoop(src, dst, RGB2YCrCb_f<float>(scn, bidx, coeffs_f));
5922             }
5923             break;
5924
5925         case CV_YCrCb2BGR: case CV_YCrCb2RGB:
5926         case CV_YUV2BGR: case CV_YUV2RGB:
5927             {
5928             if( dcn <= 0 ) dcn = 3;
5929             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
5930             bidx = code == CV_YCrCb2BGR || code == CV_YUV2BGR ? 0 : 2;
5931             static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
5932             static const int yuv_i[] = { 33292, -6472, -9519, 18678 };
5933             const float* coeffs_f = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_f;
5934             const int* coeffs_i = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_i;
5935
5936             _dst.create(sz, CV_MAKETYPE(depth, dcn));
5937             dst = _dst.getMat();
5938
5939 #if defined HAVE_IPP && 0
5940             CV_IPP_CHECK()
5941             {
5942                 if (code == CV_YUV2RGB && dcn == 3 && depth == CV_8U)
5943                 {
5944                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R)))
5945                     {
5946                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5947                         return;
5948                     }
5949                     setIppErrorStatus();
5950                 }
5951                 else if (code == CV_YUV2BGR && dcn == 3 && depth == CV_8U)
5952                 {
5953                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
5954                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
5955                     {
5956                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5957                         return;
5958                     }
5959                     setIppErrorStatus();
5960                 }
5961                 else if (code == CV_YUV2RGB && dcn == 4 && depth == CV_8U)
5962                 {
5963                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
5964                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
5965                     {
5966                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5967                         return;
5968                     }
5969                     setIppErrorStatus();
5970                 }
5971                 else if (code == CV_YUV2BGR && dcn == 4 && depth == CV_8U)
5972                 {
5973                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
5974                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
5975                     {
5976                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5977                         return;
5978                     }
5979                     setIppErrorStatus();
5980                 }
5981             }
5982 #endif
5983
5984             if( depth == CV_8U )
5985                 CvtColorLoop(src, dst, YCrCb2RGB_i<uchar>(dcn, bidx, coeffs_i));
5986             else if( depth == CV_16U )
5987                 CvtColorLoop(src, dst, YCrCb2RGB_i<ushort>(dcn, bidx, coeffs_i));
5988             else
5989                 CvtColorLoop(src, dst, YCrCb2RGB_f<float>(dcn, bidx, coeffs_f));
5990             }
5991             break;
5992
5993         case CV_BGR2XYZ: case CV_RGB2XYZ:
5994             CV_Assert( scn == 3 || scn == 4 );
5995             bidx = code == CV_BGR2XYZ ? 0 : 2;
5996
5997             _dst.create(sz, CV_MAKETYPE(depth, 3));
5998             dst = _dst.getMat();
5999
6000 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
6001             CV_IPP_CHECK()
6002             {
6003                 if( code == CV_BGR2XYZ && scn == 3 && depth != CV_32F )
6004                 {
6005                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
6006                     {
6007                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6008                         return;
6009                     }
6010                     setIppErrorStatus();
6011                 }
6012                 else if( code == CV_BGR2XYZ && scn == 4 && depth != CV_32F )
6013                 {
6014                     if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
6015                     {
6016                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6017                         return;
6018                     }
6019                     setIppErrorStatus();
6020                 }
6021                 else if( code == CV_RGB2XYZ && scn == 3 && depth != CV_32F )
6022                 {
6023                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2XYZTab[depth])) )
6024                     {
6025                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6026                         return;
6027                     }
6028                     setIppErrorStatus();
6029                 }
6030                 else if( code == CV_RGB2XYZ && scn == 4 && depth != CV_32F )
6031                 {
6032                     if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 0, 1, 2, depth)) )
6033                     {
6034                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6035                         return;
6036                     }
6037                     setIppErrorStatus();
6038                 }
6039             }
6040 #endif
6041
6042             if( depth == CV_8U )
6043                 CvtColorLoop(src, dst, RGB2XYZ_i<uchar>(scn, bidx, 0));
6044             else if( depth == CV_16U )
6045                 CvtColorLoop(src, dst, RGB2XYZ_i<ushort>(scn, bidx, 0));
6046             else
6047                 CvtColorLoop(src, dst, RGB2XYZ_f<float>(scn, bidx, 0));
6048             break;
6049
6050         case CV_XYZ2BGR: case CV_XYZ2RGB:
6051             if( dcn <= 0 ) dcn = 3;
6052             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
6053             bidx = code == CV_XYZ2BGR ? 0 : 2;
6054
6055             _dst.create(sz, CV_MAKETYPE(depth, dcn));
6056             dst = _dst.getMat();
6057
6058 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
6059             CV_IPP_CHECK()
6060             {
6061                 if( code == CV_XYZ2BGR && dcn == 3 && depth != CV_32F )
6062                 {
6063                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6064                     {
6065                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6066                         return;
6067                     }
6068                     setIppErrorStatus();
6069                 }
6070                 else if( code == CV_XYZ2BGR && dcn == 4 && depth != CV_32F )
6071                 {
6072                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6073                     {
6074                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6075                         return;
6076                     }
6077                     setIppErrorStatus();
6078                 }
6079                 if( code == CV_XYZ2RGB && dcn == 3 && depth != CV_32F )
6080                 {
6081                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiXYZ2RGBTab[depth])) )
6082                     {
6083                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6084                         return;
6085                     }
6086                     setIppErrorStatus();
6087                 }
6088                 else if( code == CV_XYZ2RGB && dcn == 4 && depth != CV_32F )
6089                 {
6090                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6091                     {
6092                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6093                         return;
6094                     }
6095                     setIppErrorStatus();
6096                 }
6097             }
6098 #endif
6099
6100             if( depth == CV_8U )
6101                 CvtColorLoop(src, dst, XYZ2RGB_i<uchar>(dcn, bidx, 0));
6102             else if( depth == CV_16U )
6103                 CvtColorLoop(src, dst, XYZ2RGB_i<ushort>(dcn, bidx, 0));
6104             else
6105                 CvtColorLoop(src, dst, XYZ2RGB_f<float>(dcn, bidx, 0));
6106             break;
6107
6108         case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
6109         case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
6110             {
6111             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
6112             bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||
6113                 code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;
6114             int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||
6115                 code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 256;
6116
6117             _dst.create(sz, CV_MAKETYPE(depth, 3));
6118             dst = _dst.getMat();
6119
6120 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
6121             CV_IPP_CHECK()
6122             {
6123                 if( depth == CV_8U || depth == CV_16U )
6124                 {
6125 #if 0 // breaks OCL accuracy tests
6126                     if( code == CV_BGR2HSV_FULL && scn == 3 )
6127                     {
6128                         if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
6129                         {
6130                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6131                             return;
6132                         }
6133                         setIppErrorStatus();
6134                     }
6135                     else if( code == CV_BGR2HSV_FULL && scn == 4 )
6136                     {
6137                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
6138                         {
6139                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6140                             return;
6141                         }
6142                         setIppErrorStatus();
6143                     }
6144                     else if( code == CV_RGB2HSV_FULL && scn == 4 )
6145                     {
6146                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) )
6147                         {
6148                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6149                             return;
6150                         }
6151                         setIppErrorStatus();
6152                     } else
6153 #endif
6154                     if( code == CV_RGB2HSV_FULL && scn == 3 && depth == CV_16U )
6155                     {
6156                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HSVTab[depth])) )
6157                         {
6158                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6159                             return;
6160                         }
6161                         setIppErrorStatus();
6162                     }
6163                     else if( code == CV_BGR2HLS_FULL && scn == 3 )
6164                     {
6165                         if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
6166                         {
6167                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6168                             return;
6169                         }
6170                         setIppErrorStatus();
6171                     }
6172                     else if( code == CV_BGR2HLS_FULL && scn == 4 )
6173                     {
6174                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
6175                         {
6176                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6177                             return;
6178                         }
6179                         setIppErrorStatus();
6180                     }
6181                     else if( code == CV_RGB2HLS_FULL && scn == 3 )
6182                     {
6183                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HLSTab[depth])) )
6184                         {
6185                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6186                             return;
6187                         }
6188                         setIppErrorStatus();
6189                     }
6190                     else if( code == CV_RGB2HLS_FULL && scn == 4 )
6191                     {
6192                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) )
6193                         {
6194                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6195                             return;
6196                         }
6197                         setIppErrorStatus();
6198                     }
6199                 }
6200             }
6201 #endif
6202
6203             if( code == CV_BGR2HSV || code == CV_RGB2HSV ||
6204                 code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL )
6205             {
6206 #ifdef HAVE_TEGRA_OPTIMIZATION
6207                 if(tegra::cvtRGB2HSV(src, dst, bidx, hrange))
6208                     break;
6209 #endif
6210                 if( depth == CV_8U )
6211                     CvtColorLoop(src, dst, RGB2HSV_b(scn, bidx, hrange));
6212                 else
6213                     CvtColorLoop(src, dst, RGB2HSV_f(scn, bidx, (float)hrange));
6214             }
6215             else
6216             {
6217                 if( depth == CV_8U )
6218                     CvtColorLoop(src, dst, RGB2HLS_b(scn, bidx, hrange));
6219                 else
6220                     CvtColorLoop(src, dst, RGB2HLS_f(scn, bidx, (float)hrange));
6221             }
6222             }
6223             break;
6224
6225         case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
6226         case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
6227             {
6228             if( dcn <= 0 ) dcn = 3;
6229             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
6230             bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||
6231                 code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;
6232             int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||
6233                 code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;
6234
6235             _dst.create(sz, CV_MAKETYPE(depth, dcn));
6236             dst = _dst.getMat();
6237
6238 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
6239             CV_IPP_CHECK()
6240             {
6241                 if( depth == CV_8U || depth == CV_16U )
6242                 {
6243                     if( code == CV_HSV2BGR_FULL && dcn == 3 )
6244                     {
6245                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6246                         {
6247                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6248                             return;
6249                         }
6250                         setIppErrorStatus();
6251                     }
6252                     else if( code == CV_HSV2BGR_FULL && dcn == 4 )
6253                     {
6254                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6255                         {
6256                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6257                             return;
6258                         }
6259                         setIppErrorStatus();
6260                     }
6261                     else if( code == CV_HSV2RGB_FULL && dcn == 3 )
6262                     {
6263                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHSV2RGBTab[depth])) )
6264                         {
6265                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6266                             return;
6267                         }
6268                         setIppErrorStatus();
6269                     }
6270                     else if( code == CV_HSV2RGB_FULL && dcn == 4 )
6271                     {
6272                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6273                         {
6274                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6275                             return;
6276                         }
6277                         setIppErrorStatus();
6278                     }
6279                     else if( code == CV_HLS2BGR_FULL && dcn == 3 )
6280                     {
6281                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6282                         {
6283                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6284                             return;
6285                         }
6286                         setIppErrorStatus();
6287                     }
6288                     else if( code == CV_HLS2BGR_FULL && dcn == 4 )
6289                     {
6290                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6291                         {
6292                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6293                             return;
6294                         }
6295                         setIppErrorStatus();
6296                     }
6297                     else if( code == CV_HLS2RGB_FULL && dcn == 3 )
6298                     {
6299                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHLS2RGBTab[depth])) )
6300                         {
6301                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6302                             return;
6303                         }
6304                         setIppErrorStatus();
6305                     }
6306                     else if( code == CV_HLS2RGB_FULL && dcn == 4 )
6307                     {
6308                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6309                         {
6310                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6311                             return;
6312                         }
6313                         setIppErrorStatus();
6314                     }
6315                 }
6316             }
6317 #endif
6318
6319             if( code == CV_HSV2BGR || code == CV_HSV2RGB ||
6320                 code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL )
6321             {
6322                 if( depth == CV_8U )
6323                     CvtColorLoop(src, dst, HSV2RGB_b(dcn, bidx, hrange));
6324                 else
6325                     CvtColorLoop(src, dst, HSV2RGB_f(dcn, bidx, (float)hrange));
6326             }
6327             else
6328             {
6329                 if( depth == CV_8U )
6330                     CvtColorLoop(src, dst, HLS2RGB_b(dcn, bidx, hrange));
6331                 else
6332                     CvtColorLoop(src, dst, HLS2RGB_f(dcn, bidx, (float)hrange));
6333             }
6334             }
6335             break;
6336
6337         case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
6338         case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
6339             {
6340             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
6341             bidx = code == CV_BGR2Lab || code == CV_BGR2Luv ||
6342                    code == CV_LBGR2Lab || code == CV_LBGR2Luv ? 0 : 2;
6343             bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab ||
6344                         code == CV_BGR2Luv || code == CV_RGB2Luv;
6345
6346             _dst.create(sz, CV_MAKETYPE(depth, 3));
6347             dst = _dst.getMat();
6348
6349 #if defined HAVE_IPP && 0
6350             CV_IPP_CHECK()
6351             {
6352                 if (code == CV_LBGR2Lab && scn == 3 && depth == CV_8U)
6353                 {
6354                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToLab_8u_C3R)))
6355                     {
6356                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6357                         return;
6358                     }
6359                     setIppErrorStatus();
6360                 }
6361                 else if (code == CV_LBGR2Lab && scn == 4 && depth == CV_8U)
6362                 {
6363                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
6364                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 0, 1, 2, depth)))
6365                     {
6366                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6367                         return;
6368                     }
6369                     setIppErrorStatus();
6370                 }
6371                 else
6372                 if (code == CV_LRGB2Lab && scn == 3 && depth == CV_8U) // slower than OpenCV
6373                 {
6374                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
6375                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
6376                     {
6377                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6378                         return;
6379                     }
6380                     setIppErrorStatus();
6381                 }
6382                 else if (code == CV_LRGB2Lab && scn == 4 && depth == CV_8U) // slower than OpenCV
6383                 {
6384                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
6385                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
6386                     {
6387                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6388                         return;
6389                     }
6390                     setIppErrorStatus();
6391                 }
6392                 else if (code == CV_LRGB2Luv && scn == 3)
6393                 {
6394                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGBToLUVTab[depth])))
6395                     {
6396                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6397                         return;
6398                     }
6399                     setIppErrorStatus();
6400                 }
6401                 else if (code == CV_LRGB2Luv && scn == 4)
6402                 {
6403                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
6404                                                                            ippiRGBToLUVTab[depth], 0, 1, 2, depth)))
6405                     {
6406                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6407                         return;
6408                     }
6409                     setIppErrorStatus();
6410                 }
6411                 else if (code == CV_LBGR2Luv && scn == 3)
6412                 {
6413                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
6414                                                                            ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
6415                     {
6416                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6417                         return;
6418                     }
6419                     setIppErrorStatus();
6420                 }
6421                 else if (code == CV_LBGR2Luv && scn == 4)
6422                 {
6423                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
6424                                                                            ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
6425                     {
6426                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6427                         return;
6428                     }
6429                     setIppErrorStatus();
6430                 }
6431             }
6432 #endif
6433
6434             if( code == CV_BGR2Lab || code == CV_RGB2Lab ||
6435                 code == CV_LBGR2Lab || code == CV_LRGB2Lab )
6436             {
6437                 if( depth == CV_8U )
6438                     CvtColorLoop(src, dst, RGB2Lab_b(scn, bidx, 0, 0, srgb));
6439                 else
6440                     CvtColorLoop(src, dst, RGB2Lab_f(scn, bidx, 0, 0, srgb));
6441             }
6442             else
6443             {
6444                 if( depth == CV_8U )
6445                     CvtColorLoop(src, dst, RGB2Luv_b(scn, bidx, 0, 0, srgb));
6446                 else
6447                     CvtColorLoop(src, dst, RGB2Luv_f(scn, bidx, 0, 0, srgb));
6448             }
6449             }
6450             break;
6451
6452         case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
6453         case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
6454             {
6455             if( dcn <= 0 ) dcn = 3;
6456             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
6457             bidx = code == CV_Lab2BGR || code == CV_Luv2BGR ||
6458                    code == CV_Lab2LBGR || code == CV_Luv2LBGR ? 0 : 2;
6459             bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB ||
6460                     code == CV_Luv2BGR || code == CV_Luv2RGB;
6461
6462             _dst.create(sz, CV_MAKETYPE(depth, dcn));
6463             dst = _dst.getMat();
6464
6465 #if defined HAVE_IPP && 0
6466             CV_IPP_CHECK()
6467             {
6468                 if( code == CV_Lab2LBGR && dcn == 3 && depth == CV_8U)
6469                 {
6470                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R)) )
6471                     {
6472                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6473                         return;
6474                     }
6475                     setIppErrorStatus();
6476                 }
6477                 else if( code == CV_Lab2LBGR && dcn == 4 && depth == CV_8U )
6478                 {
6479                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
6480                                         ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6481                     {
6482                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6483                         return;
6484                     }
6485                     setIppErrorStatus();
6486                 }
6487                 if( code == CV_Lab2LRGB && dcn == 3 && depth == CV_8U )
6488                 {
6489                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
6490                                                                                ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6491                     {
6492                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6493                         return;
6494                     }
6495                     setIppErrorStatus();
6496                 }
6497                 else if( code == CV_Lab2LRGB && dcn == 4 && depth == CV_8U )
6498                 {
6499                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
6500                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6501                     {
6502                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6503                         return;
6504                     }
6505                     setIppErrorStatus();
6506                 }
6507                 if( code == CV_Luv2LRGB && dcn == 3 )
6508                 {
6509                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiLUVToRGBTab[depth])) )
6510                         return;
6511                 }
6512                 else if( code == CV_Luv2LRGB && dcn == 4 )
6513                 {
6514                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
6515                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
6516                     {
6517                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6518                         return;
6519                     }
6520                 }
6521                 if( code == CV_Luv2LBGR && dcn == 3 )
6522                 {
6523                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
6524                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
6525                     {
6526                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6527                         return;
6528                     }
6529                 }
6530                 else if( code == CV_Luv2LBGR && dcn == 4 )
6531                 {
6532                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
6533                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
6534                     {
6535                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6536                         return;
6537                     }
6538                 }
6539             }
6540 #endif
6541
6542             if( code == CV_Lab2BGR || code == CV_Lab2RGB ||
6543                 code == CV_Lab2LBGR || code == CV_Lab2LRGB )
6544             {
6545                 if( depth == CV_8U )
6546                     CvtColorLoop(src, dst, Lab2RGB_b(dcn, bidx, 0, 0, srgb));
6547                 else
6548                     CvtColorLoop(src, dst, Lab2RGB_f(dcn, bidx, 0, 0, srgb));
6549             }
6550             else
6551             {
6552                 if( depth == CV_8U )
6553                     CvtColorLoop(src, dst, Luv2RGB_b(dcn, bidx, 0, 0, srgb));
6554                 else
6555                     CvtColorLoop(src, dst, Luv2RGB_f(dcn, bidx, 0, 0, srgb));
6556             }
6557             }
6558             break;
6559
6560         case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
6561         case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
6562         case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:
6563         case CV_BayerBG2BGR_EA: case CV_BayerGB2BGR_EA: case CV_BayerRG2BGR_EA: case CV_BayerGR2BGR_EA:
6564             demosaicing(src, _dst, code, dcn);
6565             break;
6566
6567         case CV_YUV2BGR_NV21:  case CV_YUV2RGB_NV21:  case CV_YUV2BGR_NV12:  case CV_YUV2RGB_NV12:
6568         case CV_YUV2BGRA_NV21: case CV_YUV2RGBA_NV21: case CV_YUV2BGRA_NV12: case CV_YUV2RGBA_NV12:
6569             {
6570                 // http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
6571                 // http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
6572
6573                 if (dcn <= 0) dcn = (code==CV_YUV420sp2BGRA || code==CV_YUV420sp2RGBA || code==CV_YUV2BGRA_NV12 || code==CV_YUV2RGBA_NV12) ? 4 : 3;
6574                 const int bIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2BGR_NV12 || code==CV_YUV2BGRA_NV12) ? 0 : 2;
6575                 const int uIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2RGB_NV21 || code==CV_YUV2RGBA_NV21) ? 1 : 0;
6576
6577                 CV_Assert( dcn == 3 || dcn == 4 );
6578                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6579
6580                 Size dstSz(sz.width, sz.height * 2 / 3);
6581                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6582                 dst = _dst.getMat();
6583
6584                 int srcstep = (int)src.step;
6585                 const uchar* y = src.ptr();
6586                 const uchar* uv = y + srcstep * dstSz.height;
6587
6588                 switch(dcn*100 + bIdx * 10 + uIdx)
6589                 {
6590                     case 300: cvtYUV420sp2RGB<0, 0> (dst, srcstep, y, uv); break;
6591                     case 301: cvtYUV420sp2RGB<0, 1> (dst, srcstep, y, uv); break;
6592                     case 320: cvtYUV420sp2RGB<2, 0> (dst, srcstep, y, uv); break;
6593                     case 321: cvtYUV420sp2RGB<2, 1> (dst, srcstep, y, uv); break;
6594                     case 400: cvtYUV420sp2RGBA<0, 0>(dst, srcstep, y, uv); break;
6595                     case 401: cvtYUV420sp2RGBA<0, 1>(dst, srcstep, y, uv); break;
6596                     case 420: cvtYUV420sp2RGBA<2, 0>(dst, srcstep, y, uv); break;
6597                     case 421: cvtYUV420sp2RGBA<2, 1>(dst, srcstep, y, uv); break;
6598                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
6599                 };
6600             }
6601             break;
6602         case CV_YUV2BGR_YV12: case CV_YUV2RGB_YV12: case CV_YUV2BGRA_YV12: case CV_YUV2RGBA_YV12:
6603         case CV_YUV2BGR_IYUV: case CV_YUV2RGB_IYUV: case CV_YUV2BGRA_IYUV: case CV_YUV2RGBA_IYUV:
6604             {
6605                 //http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
6606                 //http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxM Y plane followed by (N/2)x(M/2) U and V planes.
6607
6608                 if (dcn <= 0) dcn = (code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12 || code==CV_YUV2RGBA_IYUV || code==CV_YUV2BGRA_IYUV) ? 4 : 3;
6609                 const int bIdx = (code==CV_YUV2BGR_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2BGR_IYUV || code==CV_YUV2BGRA_IYUV) ? 0 : 2;
6610                 const int uIdx  = (code==CV_YUV2BGR_YV12 || code==CV_YUV2RGB_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12) ? 1 : 0;
6611
6612                 CV_Assert( dcn == 3 || dcn == 4 );
6613                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6614
6615                 Size dstSz(sz.width, sz.height * 2 / 3);
6616                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6617                 dst = _dst.getMat();
6618
6619                 int srcstep = (int)src.step;
6620                 const uchar* y = src.ptr();
6621                 const uchar* u = y + srcstep * dstSz.height;
6622                 const uchar* v = y + srcstep * (dstSz.height + dstSz.height/4) + (dstSz.width/2) * ((dstSz.height % 4)/2);
6623
6624                 int ustepIdx = 0;
6625                 int vstepIdx = dstSz.height % 4 == 2 ? 1 : 0;
6626
6627                 if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }
6628
6629                 switch(dcn*10 + bIdx)
6630                 {
6631                     case 30: cvtYUV420p2RGB<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
6632                     case 32: cvtYUV420p2RGB<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
6633                     case 40: cvtYUV420p2RGBA<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
6634                     case 42: cvtYUV420p2RGBA<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
6635                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
6636                 };
6637             }
6638             break;
6639         case CV_YUV2GRAY_420:
6640             {
6641                 if (dcn <= 0) dcn = 1;
6642
6643                 CV_Assert( dcn == 1 );
6644                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6645
6646                 Size dstSz(sz.width, sz.height * 2 / 3);
6647                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6648                 dst = _dst.getMat();
6649 #if defined HAVE_IPP
6650                 CV_IPP_CHECK()
6651                 {
6652                     if (ippStsNoErr == ippiCopy_8u_C1R(src.data, (int)src.step, dst.data, (int)dst.step,
6653                             ippiSize(dstSz.width, dstSz.height)))
6654                     {
6655                         CV_IMPL_ADD(CV_IMPL_IPP);
6656                         return;
6657                     }
6658                     setIppErrorStatus();
6659                 }
6660 #endif
6661                 src(Range(0, dstSz.height), Range::all()).copyTo(dst);
6662             }
6663             break;
6664         case CV_RGB2YUV_YV12: case CV_BGR2YUV_YV12: case CV_RGBA2YUV_YV12: case CV_BGRA2YUV_YV12:
6665         case CV_RGB2YUV_IYUV: case CV_BGR2YUV_IYUV: case CV_RGBA2YUV_IYUV: case CV_BGRA2YUV_IYUV:
6666             {
6667                 if (dcn <= 0) dcn = 1;
6668                 const int bIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_BGR2YUV_YV12 || code == CV_BGRA2YUV_YV12) ? 0 : 2;
6669                 const int uIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_RGB2YUV_IYUV || code == CV_RGBA2YUV_IYUV) ? 1 : 2;
6670
6671                 CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
6672                 CV_Assert( dcn == 1 );
6673                 CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
6674
6675                 Size dstSz(sz.width, sz.height / 2 * 3);
6676                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6677                 dst = _dst.getMat();
6678
6679                 switch(bIdx + uIdx*10)
6680                 {
6681                     case 10: cvtRGBtoYUV420p<0, 1>(src, dst); break;
6682                     case 12: cvtRGBtoYUV420p<2, 1>(src, dst); break;
6683                     case 20: cvtRGBtoYUV420p<0, 2>(src, dst); break;
6684                     case 22: cvtRGBtoYUV420p<2, 2>(src, dst); break;
6685                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
6686                 };
6687             }
6688             break;
6689         case CV_YUV2RGB_UYVY: case CV_YUV2BGR_UYVY: case CV_YUV2RGBA_UYVY: case CV_YUV2BGRA_UYVY:
6690         case CV_YUV2RGB_YUY2: case CV_YUV2BGR_YUY2: case CV_YUV2RGB_YVYU: case CV_YUV2BGR_YVYU:
6691         case CV_YUV2RGBA_YUY2: case CV_YUV2BGRA_YUY2: case CV_YUV2RGBA_YVYU: case CV_YUV2BGRA_YVYU:
6692             {
6693                 //http://www.fourcc.org/yuv.php#UYVY
6694                 //http://www.fourcc.org/yuv.php#YUY2
6695                 //http://www.fourcc.org/yuv.php#YVYU
6696
6697                 if (dcn <= 0) dcn = (code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2RGBA_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 4 : 3;
6698                 const int bIdx = (code==CV_YUV2BGR_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2BGR_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2BGR_YVYU || code==CV_YUV2BGRA_YVYU) ? 0 : 2;
6699                 const int ycn  = (code==CV_YUV2RGB_UYVY || code==CV_YUV2BGR_UYVY || code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY) ? 1 : 0;
6700                 const int uIdx = (code==CV_YUV2RGB_YVYU || code==CV_YUV2BGR_YVYU || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 1 : 0;
6701
6702                 CV_Assert( dcn == 3 || dcn == 4 );
6703                 CV_Assert( scn == 2 && depth == CV_8U );
6704
6705                 _dst.create(sz, CV_8UC(dcn));
6706                 dst = _dst.getMat();
6707
6708                 switch(dcn*1000 + bIdx*100 + uIdx*10 + ycn)
6709                 {
6710                     case 3000: cvtYUV422toRGB<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6711                     case 3001: cvtYUV422toRGB<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6712                     case 3010: cvtYUV422toRGB<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6713                     case 3011: cvtYUV422toRGB<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6714                     case 3200: cvtYUV422toRGB<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6715                     case 3201: cvtYUV422toRGB<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6716                     case 3210: cvtYUV422toRGB<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6717                     case 3211: cvtYUV422toRGB<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6718                     case 4000: cvtYUV422toRGBA<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6719                     case 4001: cvtYUV422toRGBA<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6720                     case 4010: cvtYUV422toRGBA<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6721                     case 4011: cvtYUV422toRGBA<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6722                     case 4200: cvtYUV422toRGBA<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6723                     case 4201: cvtYUV422toRGBA<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6724                     case 4210: cvtYUV422toRGBA<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
6725                     case 4211: cvtYUV422toRGBA<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
6726                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
6727                 };
6728             }
6729             break;
6730         case CV_YUV2GRAY_UYVY: case CV_YUV2GRAY_YUY2:
6731             {
6732                 if (dcn <= 0) dcn = 1;
6733
6734                 CV_Assert( dcn == 1 );
6735                 CV_Assert( scn == 2 && depth == CV_8U );
6736
6737                 extractChannel(_src, _dst, code == CV_YUV2GRAY_UYVY ? 1 : 0);
6738             }
6739             break;
6740         case CV_RGBA2mRGBA:
6741             {
6742                 if (dcn <= 0) dcn = 4;
6743                 CV_Assert( scn == 4 && dcn == 4 );
6744
6745                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
6746                 dst = _dst.getMat();
6747
6748                 if( depth == CV_8U )
6749                 {
6750 #if defined(HAVE_IPP)
6751                     CV_IPP_CHECK()
6752                     {
6753                         if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R)))
6754                         {
6755                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
6756                             return;
6757                         }
6758                         setIppErrorStatus();
6759                     }
6760 #endif
6761                     CvtColorLoop(src, dst, RGBA2mRGBA<uchar>());
6762                 }
6763                 else
6764                 {
6765                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
6766                 }
6767             }
6768             break;
6769         case CV_mRGBA2RGBA:
6770             {
6771                 if (dcn <= 0) dcn = 4;
6772                 CV_Assert( scn == 4 && dcn == 4 );
6773
6774                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
6775                 dst = _dst.getMat();
6776
6777                 if( depth == CV_8U )
6778                     CvtColorLoop(src, dst, mRGBA2RGBA<uchar>());
6779                 else
6780                 {
6781                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
6782                 }
6783             }
6784             break;
6785         default:
6786             CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
6787     }
6788 }
6789
6790 CV_IMPL void
6791 cvCvtColor( const CvArr* srcarr, CvArr* dstarr, int code )
6792 {
6793     cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
6794     CV_Assert( src.depth() == dst.depth() );
6795
6796     cv::cvtColor(src, dst, code, dst.channels());
6797     CV_Assert( dst.data == dst0.data );
6798 }
6799
6800
6801 /* End of file. */