Added multithreaded implementation for RGB to YUV420p color conversion
[profile/ivi/opencv.git] / modules / imgproc / src / color.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 /********************************* COPYRIGHT NOTICE *******************************\
44   The function for RGB to Lab conversion is based on the MATLAB script
45   RGB2Lab.m translated by Mark Ruzon from C code by Yossi Rubner, 23 September 1997.
46   See the page [http://vision.stanford.edu/~ruzon/software/rgblab.html]
47 \**********************************************************************************/
48
49 /********************************* COPYRIGHT NOTICE *******************************\
50   Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
51   from MD-Mathematische Dienste GmbH. Below is the copyright notice:
52
53     IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
54     By downloading, copying, installing or using the software you agree
55     to this license. If you do not agree to this license, do not download,
56     install, copy or use the software.
57
58     Contributors License Agreement:
59
60       Copyright (c) 2002,
61       MD-Mathematische Dienste GmbH
62       Im Defdahl 5-10
63       44141 Dortmund
64       Germany
65       www.md-it.de
66
67     Redistribution and use in source and binary forms,
68     with or without modification, are permitted provided
69     that the following conditions are met:
70
71     Redistributions of source code must retain
72     the above copyright notice, this list of conditions and the following disclaimer.
73     Redistributions in binary form must reproduce the above copyright notice,
74     this list of conditions and the following disclaimer in the documentation
75     and/or other materials provided with the distribution.
76     The name of Contributor may not be used to endorse or promote products
77     derived from this software without specific prior written permission.
78
79     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
80     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
81     THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
82     PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
83     FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
84     DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
85     OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
86     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
87     STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
88     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
89     THE POSSIBILITY OF SUCH DAMAGE.
90 \**********************************************************************************/
91
92 #include "precomp.hpp"
93 #include <limits>
94 #include <iostream>
95
96 namespace cv
97 {
98
// Builds the cubic-spline coefficient table for the samples (xi = i, yi = f[i]), i = 0..n.
// For every segment k in [0, n) four values are written to tab[k*4 .. k*4+3]:
// the constant, linear, quadratic and cubic coefficients, in that order.
// f must provide n+1 samples and tab must provide room for 4*n values.
// NOTE: the elimination pass only fills rows 1..n-2, while the back-substitution
// pass starts reading at row n-1; for n > 1 the entries tab[(n-1)*4] and
// tab[(n-1)*4+1] must therefore hold defined values (e.g. zeros) on entry.
template<typename _Tp> static void splineBuild(const _Tp* f, int n, _Tp* tab)
{
    const _Tp oneThird = (_Tp)0.3333333333333333;

    // forward elimination: slot 0 of a row temporarily keeps the pivot reciprocal,
    // slot 1 keeps the reduced right-hand side
    tab[0] = tab[1] = (_Tp)0;
    for( int k = 1; k < n-1; k++ )
    {
        const _Tp* prevRow = tab + (k-1)*4;
        _Tp* row = tab + k*4;
        _Tp rhs = 3*(f[k+1] - 2*f[k] + f[k-1]);
        _Tp inv = 1/(4 - prevRow[0]);
        row[0] = inv;
        row[1] = (rhs - prevRow[1])*inv;
    }

    // back substitution: turn the temporaries into the final polynomial coefficients
    _Tp cNext = 0;
    for( int k = n-1; k >= 0; k-- )
    {
        _Tp* row = tab + k*4;
        _Tp c = row[1] - row[0]*cNext;
        _Tp b = f[k+1] - f[k] - (cNext + c*2)*oneThird;
        _Tp d = (cNext - c)*oneThird;
        row[0] = f[k];
        row[1] = b;
        row[2] = c;
        row[3] = d;
        cNext = c;
    }
}
123
124 // interpolates value of a function at x, 0 <= x <= n using a cubic spline.
// tab holds four coefficients per segment (constant, linear, quadratic, cubic),
// i.e. the layout written by splineBuild; n is the number of segments.
// The segment index is int(x) clamped to [0, n-1]; the cubic is then evaluated
// in the offset (x - segment index) using Horner's scheme.
125 template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab, int n)
126 {
127     // don't touch this function without urgent need - some versions of gcc fail to inline it correctly
128     int ix = std::min(std::max(int(x), 0), n-1);
129     x -= ix;
130     tab += ix*4;
131     return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
132 }
133
134
// Per-channel-type constants shared by the colour converters in this file.
//   max()  - largest channel value; the converters use it as the opaque alpha value
//   half() - max()/2 + 1; the YCrCb converters use it as the chroma offset
// worktype_f is float for both the generic template and the float specialization.
template<typename _Tp> struct ColorChannel
{
    typedef float worktype_f;

    static _Tp max()
    {
        return std::numeric_limits<_Tp>::max();
    }

    static _Tp half()
    {
        const _Tp top = max();
        return (_Tp)(top/2 + 1);
    }
};

// Floating-point channels: maximum is 1 and the midpoint is 0.5.
template<> struct ColorChannel<float>
{
    typedef float worktype_f;

    static float max()
    {
        return 1.f;
    }

    static float half()
    {
        return 0.5f;
    }
};
148
149 /*template<> struct ColorChannel<double>
150 {
151     typedef double worktype_f;
152     static double max() { return 1.; }
153     static double half() { return 0.5; }
154 };*/
155
156
157 ///////////////////////////// Top-level template function ////////////////////////////////
158
159 template <typename Cvt>
160 class CvtColorLoop_Invoker : public ParallelLoopBody
161 {
162     typedef typename Cvt::channel_type _Tp;
163 public:
164
165     CvtColorLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt) :
166         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt)
167     {
168     }
169
170     virtual void operator()(const Range& range) const
171     {
172         const uchar* yS = src.ptr<uchar>(range.start);
173         uchar* yD = dst.ptr<uchar>(range.start);
174
175         for( int i = range.start; i < range.end; ++i, yS += src.step, yD += dst.step )
176             cvt((const _Tp*)yS, (_Tp*)yD, src.cols);
177     }
178
179 private:
180     const Mat& src;
181     Mat& dst;
182     const Cvt& cvt;
183
184     const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
185 };
186
187 template <typename Cvt>
188 void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
189 {
190     parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
191 }
192
193 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
194
195 template<typename _Tp> struct RGB2RGB
196 {
197     typedef _Tp channel_type;
198
199     RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}
200     void operator()(const _Tp* src, _Tp* dst, int n) const
201     {
202         int scn = srccn, dcn = dstcn, bidx = blueIdx;
203         if( dcn == 3 )
204         {
205             n *= 3;
206             for( int i = 0; i < n; i += 3, src += scn )
207             {
208                 _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
209                 dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
210             }
211         }
212         else if( scn == 3 )
213         {
214             n *= 3;
215             _Tp alpha = ColorChannel<_Tp>::max();
216             for( int i = 0; i < n; i += 3, dst += 4 )
217             {
218                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
219                 dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
220             }
221         }
222         else
223         {
224             n *= 4;
225             for( int i = 0; i < n; i += 4 )
226             {
227                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
228                 dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
229             }
230         }
231     }
232
233     int srccn, dstcn, blueIdx;
234 };
235
236 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
237
238 struct RGB5x52RGB
239 {
240     typedef uchar channel_type;
241
242     RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
243         : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) {}
244
245     void operator()(const uchar* src, uchar* dst, int n) const
246     {
247         int dcn = dstcn, bidx = blueIdx;
248         if( greenBits == 6 )
249             for( int i = 0; i < n; i++, dst += dcn )
250             {
251                 unsigned t = ((const ushort*)src)[i];
252                 dst[bidx] = (uchar)(t << 3);
253                 dst[1] = (uchar)((t >> 3) & ~3);
254                 dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);
255                 if( dcn == 4 )
256                     dst[3] = 255;
257             }
258         else
259             for( int i = 0; i < n; i++, dst += dcn )
260             {
261                 unsigned t = ((const ushort*)src)[i];
262                 dst[bidx] = (uchar)(t << 3);
263                 dst[1] = (uchar)((t >> 2) & ~7);
264                 dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);
265                 if( dcn == 4 )
266                     dst[3] = t & 0x8000 ? 255 : 0;
267             }
268     }
269
270     int dstcn, blueIdx, greenBits;
271 };
272
273
274 struct RGB2RGB5x5
275 {
276     typedef uchar channel_type;
277
278     RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
279         : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) {}
280
281     void operator()(const uchar* src, uchar* dst, int n) const
282     {
283         int scn = srccn, bidx = blueIdx;
284         if( greenBits == 6 )
285             for( int i = 0; i < n; i++, src += scn )
286             {
287                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
288             }
289         else if( scn == 3 )
290             for( int i = 0; i < n; i++, src += 3 )
291             {
292                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
293             }
294         else
295             for( int i = 0; i < n; i++, src += 4 )
296             {
297                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
298                     ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
299             }
300     }
301
302     int srccn, blueIdx, greenBits;
303 };
304
305 ///////////////////////////////// Color to/from Grayscale ////////////////////////////////
306
307 template<typename _Tp>
308 struct Gray2RGB
309 {
310     typedef _Tp channel_type;
311
312     Gray2RGB(int _dstcn) : dstcn(_dstcn) {}
313     void operator()(const _Tp* src, _Tp* dst, int n) const
314     {
315         if( dstcn == 3 )
316             for( int i = 0; i < n; i++, dst += 3 )
317             {
318                 dst[0] = dst[1] = dst[2] = src[i];
319             }
320         else
321         {
322             _Tp alpha = ColorChannel<_Tp>::max();
323             for( int i = 0; i < n; i++, dst += 4 )
324             {
325                 dst[0] = dst[1] = dst[2] = src[i];
326                 dst[3] = alpha;
327             }
328         }
329     }
330
331     int dstcn;
332 };
333
334
335 struct Gray2RGB5x5
336 {
337     typedef uchar channel_type;
338
339     Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) {}
340     void operator()(const uchar* src, uchar* dst, int n) const
341     {
342         if( greenBits == 6 )
343             for( int i = 0; i < n; i++ )
344             {
345                 int t = src[i];
346                 ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
347             }
348         else
349             for( int i = 0; i < n; i++ )
350             {
351                 int t = src[i] >> 3;
352                 ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
353             }
354     }
355     int greenBits;
356 };
357
358
// Drop any macros with these names so the enumerators below can be declared.
#undef B2Y
#undef G2Y
#undef R2Y

// Fixed-point constants used by the integer colour converters.
enum
{
    // luma weights scaled by 2^yuv_shift (R2Y + G2Y + B2Y == 1 << yuv_shift)
    R2Y = 4899,
    G2Y = 9617,
    B2Y = 1868,

    // number of fractional bits used by the YUV/gray and XYZ integer converters
    yuv_shift = 14,
    xyz_shift = 12,

    BLOCK_SIZE = 256
};
372
373
374 struct RGB5x52Gray
375 {
376     typedef uchar channel_type;
377
378     RGB5x52Gray(int _greenBits) : greenBits(_greenBits) {}
379     void operator()(const uchar* src, uchar* dst, int n) const
380     {
381         if( greenBits == 6 )
382             for( int i = 0; i < n; i++ )
383             {
384                 int t = ((ushort*)src)[i];
385                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
386                                            ((t >> 3) & 0xfc)*G2Y +
387                                            ((t >> 8) & 0xf8)*R2Y, yuv_shift);
388             }
389         else
390             for( int i = 0; i < n; i++ )
391             {
392                 int t = ((ushort*)src)[i];
393                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
394                                            ((t >> 2) & 0xf8)*G2Y +
395                                            ((t >> 7) & 0xf8)*R2Y, yuv_shift);
396             }
397     }
398     int greenBits;
399 };
400
401
402 template<typename _Tp> struct RGB2Gray
403 {
404     typedef _Tp channel_type;
405
406     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
407     {
408         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
409         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
410         if(blueIdx == 0)
411             std::swap(coeffs[0], coeffs[2]);
412     }
413
414     void operator()(const _Tp* src, _Tp* dst, int n) const
415     {
416         int scn = srccn;
417         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
418         for(int i = 0; i < n; i++, src += scn)
419             dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
420     }
421     int srccn;
422     float coeffs[3];
423 };
424
425
426 template<> struct RGB2Gray<uchar>
427 {
428     typedef uchar channel_type;
429
430     RGB2Gray<uchar>(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn)
431     {
432         const int coeffs0[] = { R2Y, G2Y, B2Y };
433         if(!coeffs) coeffs = coeffs0;
434
435         int b = 0, g = 0, r = (1 << (yuv_shift-1));
436         int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx];
437
438         for( int i = 0; i < 256; i++, b += db, g += dg, r += dr )
439         {
440             tab[i] = b;
441             tab[i+256] = g;
442             tab[i+512] = r;
443         }
444     }
445     void operator()(const uchar* src, uchar* dst, int n) const
446     {
447         int scn = srccn;
448         const int* _tab = tab;
449         for(int i = 0; i < n; i++, src += scn)
450             dst[i] = (uchar)((_tab[src[0]] + _tab[src[1]+256] + _tab[src[2]+512]) >> yuv_shift);
451     }
452     int srccn;
453     int tab[256*3];
454 };
455
456
457 template<> struct RGB2Gray<ushort>
458 {
459     typedef ushort channel_type;
460
461     RGB2Gray<ushort>(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
462     {
463         static const int coeffs0[] = { R2Y, G2Y, B2Y };
464         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
465         if( blueIdx == 0 )
466             std::swap(coeffs[0], coeffs[2]);
467     }
468
469     void operator()(const ushort* src, ushort* dst, int n) const
470     {
471         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
472         for(int i = 0; i < n; i++, src += scn)
473             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
474     }
475     int srccn;
476     int coeffs[3];
477 };
478
479
480 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
481
482 template<typename _Tp> struct RGB2YCrCb_f
483 {
484     typedef _Tp channel_type;
485
486     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : srccn(_srccn), blueIdx(_blueIdx)
487     {
488         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
489         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
490         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
491     }
492
493     void operator()(const _Tp* src, _Tp* dst, int n) const
494     {
495         int scn = srccn, bidx = blueIdx;
496         const _Tp delta = ColorChannel<_Tp>::half();
497         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
498         n *= 3;
499         for(int i = 0; i < n; i += 3, src += scn)
500         {
501             _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
502             _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta);
503             _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta);
504             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
505         }
506     }
507     int srccn, blueIdx;
508     float coeffs[5];
509 };
510
511
512 template<typename _Tp> struct RGB2YCrCb_i
513 {
514     typedef _Tp channel_type;
515
516     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
517         : srccn(_srccn), blueIdx(_blueIdx)
518     {
519         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
520         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
521         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
522     }
523     void operator()(const _Tp* src, _Tp* dst, int n) const
524     {
525         int scn = srccn, bidx = blueIdx;
526         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
527         int delta = ColorChannel<_Tp>::half()*(1 << yuv_shift);
528         n *= 3;
529         for(int i = 0; i < n; i += 3, src += scn)
530         {
531             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
532             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
533             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
534             dst[i] = saturate_cast<_Tp>(Y);
535             dst[i+1] = saturate_cast<_Tp>(Cr);
536             dst[i+2] = saturate_cast<_Tp>(Cb);
537         }
538     }
539     int srccn, blueIdx;
540     int coeffs[5];
541 };
542
543
544 template<typename _Tp> struct YCrCb2RGB_f
545 {
546     typedef _Tp channel_type;
547
548     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
549         : dstcn(_dstcn), blueIdx(_blueIdx)
550     {
551         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
552         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
553     }
554     void operator()(const _Tp* src, _Tp* dst, int n) const
555     {
556         int dcn = dstcn, bidx = blueIdx;
557         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
558         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
559         n *= 3;
560         for(int i = 0; i < n; i += 3, dst += dcn)
561         {
562             _Tp Y = src[i];
563             _Tp Cr = src[i+1];
564             _Tp Cb = src[i+2];
565
566             _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
567             _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
568             _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0);
569
570             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
571             if( dcn == 4 )
572                 dst[3] = alpha;
573         }
574     }
575     int dstcn, blueIdx;
576     float coeffs[4];
577 };
578
579
580 template<typename _Tp> struct YCrCb2RGB_i
581 {
582     typedef _Tp channel_type;
583
584     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
585         : dstcn(_dstcn), blueIdx(_blueIdx)
586     {
587         static const int coeffs0[] = {22987, -11698, -5636, 29049};
588         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
589     }
590
591     void operator()(const _Tp* src, _Tp* dst, int n) const
592     {
593         int dcn = dstcn, bidx = blueIdx;
594         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
595         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
596         n *= 3;
597         for(int i = 0; i < n; i += 3, dst += dcn)
598         {
599             _Tp Y = src[i];
600             _Tp Cr = src[i+1];
601             _Tp Cb = src[i+2];
602
603             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
604             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
605             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
606
607             dst[bidx] = saturate_cast<_Tp>(b);
608             dst[1] = saturate_cast<_Tp>(g);
609             dst[bidx^2] = saturate_cast<_Tp>(r);
610             if( dcn == 4 )
611                 dst[3] = alpha;
612         }
613     }
614     int dstcn, blueIdx;
615     int coeffs[4];
616 };
617
618
619 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
620
// Default RGB -> XYZ matrix, stored row by row (rows produce X, Y, Z;
// columns weight R, G, B). Per its name, it is the sRGB / D65 matrix.
static const float sRGB2XYZ_D65[] =
{
    /* X */ 0.412453f, 0.357580f, 0.180423f,
    /* Y */ 0.212671f, 0.715160f, 0.072169f,
    /* Z */ 0.019334f, 0.119193f, 0.950227f
};
627
// Default XYZ -> RGB matrix, stored row by row (rows produce R, G, B;
// columns weight X, Y, Z). Per its name, it is the sRGB / D65 matrix.
static const float XYZ2sRGB_D65[] =
{
    /* R */  3.240479f, -1.53715f,  -0.498535f,
    /* G */ -0.969256f,  1.875991f,  0.041556f,
    /* B */  0.055648f, -0.204043f,  1.057311f
};
634
635 template<typename _Tp> struct RGB2XYZ_f
636 {
637     typedef _Tp channel_type;
638
639     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
640     {
641         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
642         if(blueIdx == 0)
643         {
644             std::swap(coeffs[0], coeffs[2]);
645             std::swap(coeffs[3], coeffs[5]);
646             std::swap(coeffs[6], coeffs[8]);
647         }
648     }
649     void operator()(const _Tp* src, _Tp* dst, int n) const
650     {
651         int scn = srccn;
652         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
653               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
654               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
655
656         n *= 3;
657         for(int i = 0; i < n; i += 3, src += scn)
658         {
659             _Tp X = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
660             _Tp Y = saturate_cast<_Tp>(src[0]*C3 + src[1]*C4 + src[2]*C5);
661             _Tp Z = saturate_cast<_Tp>(src[0]*C6 + src[1]*C7 + src[2]*C8);
662             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
663         }
664     }
665     int srccn;
666     float coeffs[9];
667 };
668
669
670 template<typename _Tp> struct RGB2XYZ_i
671 {
672     typedef _Tp channel_type;
673
674     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
675     {
676         static const int coeffs0[] =
677         {
678             1689,    1465,    739,
679             871,     2929,    296,
680             79,      488,     3892
681         };
682         for( int i = 0; i < 9; i++ )
683             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
684         if(blueIdx == 0)
685         {
686             std::swap(coeffs[0], coeffs[2]);
687             std::swap(coeffs[3], coeffs[5]);
688             std::swap(coeffs[6], coeffs[8]);
689         }
690     }
691     void operator()(const _Tp* src, _Tp* dst, int n) const
692     {
693         int scn = srccn;
694         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
695             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
696             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
697         n *= 3;
698         for(int i = 0; i < n; i += 3, src += scn)
699         {
700             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
701             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
702             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
703             dst[i] = saturate_cast<_Tp>(X); dst[i+1] = saturate_cast<_Tp>(Y);
704             dst[i+2] = saturate_cast<_Tp>(Z);
705         }
706     }
707     int srccn;
708     int coeffs[9];
709 };
710
711
712 template<typename _Tp> struct XYZ2RGB_f
713 {
714     typedef _Tp channel_type;
715
716     XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
717     : dstcn(_dstcn), blueIdx(_blueIdx)
718     {
719         memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
720         if(blueIdx == 0)
721         {
722             std::swap(coeffs[0], coeffs[6]);
723             std::swap(coeffs[1], coeffs[7]);
724             std::swap(coeffs[2], coeffs[8]);
725         }
726     }
727
728     void operator()(const _Tp* src, _Tp* dst, int n) const
729     {
730         int dcn = dstcn;
731         _Tp alpha = ColorChannel<_Tp>::max();
732         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
733               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
734               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
735         n *= 3;
736         for(int i = 0; i < n; i += 3, dst += dcn)
737         {
738             _Tp B = saturate_cast<_Tp>(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2);
739             _Tp G = saturate_cast<_Tp>(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5);
740             _Tp R = saturate_cast<_Tp>(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8);
741             dst[0] = B; dst[1] = G; dst[2] = R;
742             if( dcn == 4 )
743                 dst[3] = alpha;
744         }
745     }
746     int dstcn, blueIdx;
747     float coeffs[9];
748 };
749
750
751 template<typename _Tp> struct XYZ2RGB_i
752 {
753     typedef _Tp channel_type;
754
755     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
756     : dstcn(_dstcn), blueIdx(_blueIdx)
757     {
758         static const int coeffs0[] =
759         {
760             13273,  -6296,  -2042,
761             -3970,   7684,    170,
762               228,   -836,   4331
763         };
764         for(int i = 0; i < 9; i++)
765             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
766
767         if(blueIdx == 0)
768         {
769             std::swap(coeffs[0], coeffs[6]);
770             std::swap(coeffs[1], coeffs[7]);
771             std::swap(coeffs[2], coeffs[8]);
772         }
773     }
774     void operator()(const _Tp* src, _Tp* dst, int n) const
775     {
776         int dcn = dstcn;
777         _Tp alpha = ColorChannel<_Tp>::max();
778         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
779             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
780             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
781         n *= 3;
782         for(int i = 0; i < n; i += 3, dst += dcn)
783         {
784             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
785             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
786             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
787             dst[0] = saturate_cast<_Tp>(B); dst[1] = saturate_cast<_Tp>(G);
788             dst[2] = saturate_cast<_Tp>(R);
789             if( dcn == 4 )
790                 dst[3] = alpha;
791         }
792     }
793     int dstcn, blueIdx;
794     int coeffs[9];
795 };
796
797
798 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
799
800
801 struct RGB2HSV_b
802 {
803     typedef uchar channel_type;
804
805     RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
806     : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
807     {
808         CV_Assert( hrange == 180 || hrange == 256 );
809     }
810
811     void operator()(const uchar* src, uchar* dst, int n) const
812     {
813         int i, bidx = blueIdx, scn = srccn;
814         const int hsv_shift = 12;
815
816         static int sdiv_table[256];
817         static int hdiv_table180[256];
818         static int hdiv_table256[256];
819         static volatile bool initialized = false;
820
821         int hr = hrange;
822         const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
823         n *= 3;
824
825         if( !initialized )
826         {
827             sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
828             for( i = 1; i < 256; i++ )
829             {
830                 sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
831                 hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
832                 hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
833             }
834             initialized = true;
835         }
836
837         for( i = 0; i < n; i += 3, src += scn )
838         {
839             int b = src[bidx], g = src[1], r = src[bidx^2];
840             int h, s, v = b;
841             int vmin = b, diff;
842             int vr, vg;
843
844             CV_CALC_MAX_8U( v, g );
845             CV_CALC_MAX_8U( v, r );
846             CV_CALC_MIN_8U( vmin, g );
847             CV_CALC_MIN_8U( vmin, r );
848
849             diff = v - vmin;
850             vr = v == r ? -1 : 0;
851             vg = v == g ? -1 : 0;
852
853             s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
854             h = (vr & (g - b)) +
855                 (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
856             h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
857             h += h < 0 ? hr : 0;
858
859             dst[i] = saturate_cast<uchar>(h);
860             dst[i+1] = (uchar)s;
861             dst[i+2] = (uchar)v;
862         }
863     }
864
865     int srccn, blueIdx, hrange;
866 };
867
868
// Floating-point RGB/BGR (3 or 4 channels) -> HSV conversion.
//   _srccn   - channels per source pixel
//   _blueIdx - index (0 or 2) of the blue channel in the source pixel
//   _hrange  - value the hue is scaled to for a full turn (hue in degrees
//              is multiplied by _hrange/360)
// Output per pixel is H, S, V with V = max(r, g, b) and S = (max - min)/|max|
// (FLT_EPSILON is added to the divisors to avoid division by zero).
struct RGB2HSV_f
{
    typedef float channel_type;

    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
        : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
    {
    }

    // Converts n pixels from src to 3-channel dst.
    void operator()(const float* src, float* dst, int n) const
    {
        const int bidx = blueIdx, scn = srccn;
        const float hscale = hrange*(1.f/360.f);

        for( int x = 0; x < n; x++, src += scn, dst += 3 )
        {
            const float b = src[bidx], g = src[1], r = src[bidx^2];

            float vmax = r, vmin = r;
            if( vmax < g ) vmax = g;
            if( vmax < b ) vmax = b;
            if( vmin > g ) vmin = g;
            if( vmin > b ) vmin = b;

            const float spread = vmax - vmin;
            const float s = spread/(float)(fabs(vmax) + FLT_EPSILON);
            const float k = (float)(60./(spread + FLT_EPSILON));

            // hue in degrees, chosen by which channel holds the maximum
            float h;
            if( vmax == r )
                h = (g - b)*k;
            else if( vmax == g )
                h = (b - r)*k + 120.f;
            else
                h = (r - g)*k + 240.f;

            if( h < 0 )
                h += 360.f;

            dst[0] = h*hscale;
            dst[1] = s;
            dst[2] = vmax;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
916
917
918 struct HSV2RGB_f
919 {
920     typedef float channel_type;
921
922     HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
923     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
924
925     void operator()(const float* src, float* dst, int n) const
926     {
927         int i, bidx = blueIdx, dcn = dstcn;
928         float _hscale = hscale;
929         float alpha = ColorChannel<float>::max();
930         n *= 3;
931
932         for( i = 0; i < n; i += 3, dst += dcn )
933         {
934             float h = src[i], s = src[i+1], v = src[i+2];
935             float b, g, r;
936
937             if( s == 0 )
938                 b = g = r = v;
939             else
940             {
941                 static const int sector_data[][3]=
942                     {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
943                 float tab[4];
944                 int sector;
945                 h *= _hscale;
946                 if( h < 0 )
947                     do h += 6; while( h < 0 );
948                 else if( h >= 6 )
949                     do h -= 6; while( h >= 6 );
950                 sector = cvFloor(h);
951                 h -= sector;
952                 if( (unsigned)sector >= 6u )
953                 {
954                     sector = 0;
955                     h = 0.f;
956                 }
957
958                 tab[0] = v;
959                 tab[1] = v*(1.f - s);
960                 tab[2] = v*(1.f - s*h);
961                 tab[3] = v*(1.f - s*(1.f - h));
962
963                 b = tab[sector_data[sector][0]];
964                 g = tab[sector_data[sector][1]];
965                 r = tab[sector_data[sector][2]];
966             }
967
968             dst[bidx] = b;
969             dst[1] = g;
970             dst[bidx^2] = r;
971             if( dcn == 4 )
972                 dst[3] = alpha;
973         }
974     }
975
976     int dstcn, blueIdx;
977     float hscale;
978 };
979
980
981 struct HSV2RGB_b
982 {
983     typedef uchar channel_type;
984
985     HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
986     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
987     {}
988
989     void operator()(const uchar* src, uchar* dst, int n) const
990     {
991         int i, j, dcn = dstcn;
992         uchar alpha = ColorChannel<uchar>::max();
993         float buf[3*BLOCK_SIZE];
994
995         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
996         {
997             int dn = std::min(n - i, (int)BLOCK_SIZE);
998
999             for( j = 0; j < dn*3; j += 3 )
1000             {
1001                 buf[j] = src[j];
1002                 buf[j+1] = src[j+1]*(1.f/255.f);
1003                 buf[j+2] = src[j+2]*(1.f/255.f);
1004             }
1005             cvt(buf, buf, dn);
1006
1007             for( j = 0; j < dn*3; j += 3, dst += dcn )
1008             {
1009                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
1010                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
1011                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
1012                 if( dcn == 4 )
1013                     dst[3] = alpha;
1014             }
1015         }
1016     }
1017
1018     int dstcn;
1019     HSV2RGB_f cvt;
1020 };
1021
1022
1023 ///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
1024
// Floating-point RGB/BGR -> HLS converter.
//   srccn   - number of interleaved channels per source pixel (pointer step)
//   blueIdx - index of the blue channel; red is read from blueIdx^2
//   hrange  - hue is computed in degrees and then scaled by hrange/360
struct RGB2HLS_f
{
    typedef float channel_type;

    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
        : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
    {
    }

    // Converts n source pixels into n packed (H, L, S) triplets in dst.
    void operator()(const float* src, float* dst, int n) const
    {
        const int blue = blueIdx;
        const int red = blueIdx^2;
        const int step = srccn;
        const float hueScale = hrange*(1.f/360.f);

        for( int px = 0; px < n; ++px, src += step, dst += 3 )
        {
            // Read the whole pixel first: src and dst may be the same buffer.
            const float b = src[blue];
            const float g = src[1];
            const float r = src[red];

            float hi = r;
            float lo = r;
            if( hi < g ) hi = g;
            if( hi < b ) hi = b;
            if( lo > g ) lo = g;
            if( lo > b ) lo = b;

            const float chroma = hi - lo;
            const float light = (hi + lo)*0.5f;

            // Hue and saturation stay 0 when the pixel is (almost) gray.
            float hue = 0.f;
            float sat = 0.f;

            if( chroma > FLT_EPSILON )
            {
                if( light < 0.5f )
                    sat = chroma/(hi + lo);
                else
                    sat = chroma/(2 - hi - lo);

                const float k = 60.f/chroma;

                // Hue in degrees; the branch depends on which channel is the maximum.
                if( hi == r )
                    hue = (g - b)*k;
                else if( hi == g )
                    hue = (b - r)*k + 120.f;
                else
                    hue = (r - g)*k + 240.f;

                if( hue < 0.f )
                    hue += 360.f;
            }

            dst[0] = hue*hueScale;
            dst[1] = light;
            dst[2] = sat;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
1077
1078
1079 struct RGB2HLS_b
1080 {
1081     typedef uchar channel_type;
1082
1083     RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
1084     : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) {}
1085
1086     void operator()(const uchar* src, uchar* dst, int n) const
1087     {
1088         int i, j, scn = srccn;
1089         float buf[3*BLOCK_SIZE];
1090
1091         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
1092         {
1093             int dn = std::min(n - i, (int)BLOCK_SIZE);
1094
1095             for( j = 0; j < dn*3; j += 3, src += scn )
1096             {
1097                 buf[j] = src[0]*(1.f/255.f);
1098                 buf[j+1] = src[1]*(1.f/255.f);
1099                 buf[j+2] = src[2]*(1.f/255.f);
1100             }
1101             cvt(buf, buf, dn);
1102
1103             for( j = 0; j < dn*3; j += 3 )
1104             {
1105                 dst[j] = saturate_cast<uchar>(buf[j]);
1106                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
1107                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
1108             }
1109         }
1110     }
1111
1112     int srccn;
1113     RGB2HLS_f cvt;
1114 };
1115
1116
1117 struct HLS2RGB_f
1118 {
1119     typedef float channel_type;
1120
1121     HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
1122     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
1123
1124     void operator()(const float* src, float* dst, int n) const
1125     {
1126         int i, bidx = blueIdx, dcn = dstcn;
1127         float _hscale = hscale;
1128         float alpha = ColorChannel<float>::max();
1129         n *= 3;
1130
1131         for( i = 0; i < n; i += 3, dst += dcn )
1132         {
1133             float h = src[i], l = src[i+1], s = src[i+2];
1134             float b, g, r;
1135
1136             if( s == 0 )
1137                 b = g = r = l;
1138             else
1139             {
1140                 static const int sector_data[][3]=
1141                 {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
1142                 float tab[4];
1143                 int sector;
1144
1145                 float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
1146                 float p1 = 2*l - p2;
1147
1148                 h *= _hscale;
1149                 if( h < 0 )
1150                     do h += 6; while( h < 0 );
1151                 else if( h >= 6 )
1152                     do h -= 6; while( h >= 6 );
1153
1154                 assert( 0 <= h && h < 6 );
1155                 sector = cvFloor(h);
1156                 h -= sector;
1157
1158                 tab[0] = p2;
1159                 tab[1] = p1;
1160                 tab[2] = p1 + (p2 - p1)*(1-h);
1161                 tab[3] = p1 + (p2 - p1)*h;
1162
1163                 b = tab[sector_data[sector][0]];
1164                 g = tab[sector_data[sector][1]];
1165                 r = tab[sector_data[sector][2]];
1166             }
1167
1168             dst[bidx] = b;
1169             dst[1] = g;
1170             dst[bidx^2] = r;
1171             if( dcn == 4 )
1172                 dst[3] = alpha;
1173         }
1174     }
1175
1176     int dstcn, blueIdx;
1177     float hscale;
1178 };
1179
1180
1181 struct HLS2RGB_b
1182 {
1183     typedef uchar channel_type;
1184
1185     HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
1186     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
1187     {}
1188
1189     void operator()(const uchar* src, uchar* dst, int n) const
1190     {
1191         int i, j, dcn = dstcn;
1192         uchar alpha = ColorChannel<uchar>::max();
1193         float buf[3*BLOCK_SIZE];
1194
1195         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
1196         {
1197             int dn = std::min(n - i, (int)BLOCK_SIZE);
1198
1199             for( j = 0; j < dn*3; j += 3 )
1200             {
1201                 buf[j] = src[j];
1202                 buf[j+1] = src[j+1]*(1.f/255.f);
1203                 buf[j+2] = src[j+2]*(1.f/255.f);
1204             }
1205             cvt(buf, buf, dn);
1206
1207             for( j = 0; j < dn*3; j += 3, dst += dcn )
1208             {
1209                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
1210                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
1211                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
1212                 if( dcn == 4 )
1213                     dst[3] = alpha;
1214             }
1215         }
1216     }
1217
1218     int dstcn;
1219     HLS2RGB_f cvt;
1220 };
1221
1222
1223 ///////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////
1224
1225 static const float D65[] = { 0.950456f, 1.f, 1.088754f };
1226
1227 enum { LAB_CBRT_TAB_SIZE = 1024, GAMMA_TAB_SIZE = 1024 };
1228 static float LabCbrtTab[LAB_CBRT_TAB_SIZE*4];
1229 static const float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
1230
1231 static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];
1232 static const float GammaTabScale = (float)GAMMA_TAB_SIZE;
1233
1234 static ushort sRGBGammaTab_b[256], linearGammaTab_b[256];
1235 #undef lab_shift
1236 #define lab_shift xyz_shift
1237 #define gamma_shift 3
1238 #define lab_shift2 (lab_shift + gamma_shift)
1239 #define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
1240 static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
1241
1242 static void initLabTabs()
1243 {
1244     static bool initialized = false;
1245     if(!initialized)
1246     {
1247         float f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1], scale = 1.f/LabCbrtTabScale;
1248         int i;
1249         for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)
1250         {
1251             float x = i*scale;
1252             f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x);
1253         }
1254         splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab);
1255
1256         scale = 1.f/GammaTabScale;
1257         for(i = 0; i <= GAMMA_TAB_SIZE; i++)
1258         {
1259             float x = i*scale;
1260             g[i] = x <= 0.04045f ? x*(1.f/12.92f) : (float)pow((double)(x + 0.055)*(1./1.055), 2.4);
1261             ig[i] = x <= 0.0031308 ? x*12.92f : (float)(1.055*pow((double)x, 1./2.4) - 0.055);
1262         }
1263         splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab);
1264         splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab);
1265
1266         for(i = 0; i < 256; i++)
1267         {
1268             float x = i*(1.f/255.f);
1269             sRGBGammaTab_b[i] = saturate_cast<ushort>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)pow((double)(x + 0.055)*(1./1.055), 2.4)));
1270             linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
1271         }
1272
1273         for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
1274         {
1275             float x = i*(1.f/(255.f*(1 << gamma_shift)));
1276             LabCbrtTab_b[i] = saturate_cast<ushort>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
1277         }
1278         initialized = true;
1279     }
1280 }
1281
1282 struct RGB2Lab_b
1283 {
1284     typedef uchar channel_type;
1285
1286     RGB2Lab_b(int _srccn, int blueIdx, const float* _coeffs,
1287               const float* _whitept, bool _srgb)
1288     : srccn(_srccn), srgb(_srgb)
1289     {
1290         static volatile int _3 = 3;
1291         initLabTabs();
1292
1293         if(!_coeffs) _coeffs = sRGB2XYZ_D65;
1294         if(!_whitept) _whitept = D65;
1295         float scale[] =
1296         {
1297             (1 << lab_shift)/_whitept[0],
1298             (float)(1 << lab_shift),
1299             (1 << lab_shift)/_whitept[2]
1300         };
1301
1302         for( int i = 0; i < _3; i++ )
1303         {
1304             coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]);
1305             coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
1306             coeffs[i*3+blueIdx] = cvRound(_coeffs[i*3+2]*scale[i]);
1307
1308             CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
1309                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
1310         }
1311     }
1312
1313     void operator()(const uchar* src, uchar* dst, int n) const
1314     {
1315         const int Lscale = (116*255+50)/100;
1316         const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
1317         const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
1318         int i, scn = srccn;
1319         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1320             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1321             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
1322         n *= 3;
1323
1324         for( i = 0; i < n; i += 3, src += scn )
1325         {
1326             int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
1327             int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
1328             int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
1329             int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
1330
1331             int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
1332             int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
1333             int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
1334
1335             dst[i] = saturate_cast<uchar>(L);
1336             dst[i+1] = saturate_cast<uchar>(a);
1337             dst[i+2] = saturate_cast<uchar>(b);
1338         }
1339     }
1340
1341     int srccn;
1342     int coeffs[9];
1343     bool srgb;
1344 };
1345
1346
// Clamps a float expression to [0, 1].
// The argument and the whole expansion are parenthesised and there is no
// trailing semicolon, so the macro can be used inside larger expressions.
// The argument is evaluated more than once: pass only side-effect-free
// expressions.
#define clip(value) \
    ((value) < 0.0f ? 0.0f : (value) > 1.0f ? 1.0f : (value))
1349
1350 struct RGB2Lab_f
1351 {
1352     typedef float channel_type;
1353
1354     RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,
1355               const float* _whitept, bool _srgb)
1356     : srccn(_srccn), srgb(_srgb)
1357     {
1358         volatile int _3 = 3;
1359         initLabTabs();
1360
1361         if (!_coeffs)
1362             _coeffs = sRGB2XYZ_D65;
1363         if (!_whitept)
1364             _whitept = D65;
1365
1366         float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
1367
1368         for( int i = 0; i < _3; i++ )
1369         {
1370             int j = i * 3;
1371             coeffs[j + (blueIdx ^ 2)] = _coeffs[j] * scale[i];
1372             coeffs[j + 1] = _coeffs[j + 1] * scale[i];
1373             coeffs[j + blueIdx] = _coeffs[j + 2] * scale[i];
1374
1375             CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
1376                        coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*LabCbrtTabScale );
1377         }
1378     }
1379
1380     void operator()(const float* src, float* dst, int n) const
1381     {
1382         int i, scn = srccn;
1383         float gscale = GammaTabScale;
1384         const float* gammaTab = srgb ? sRGBGammaTab : 0;
1385         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1386               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1387               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
1388         n *= 3;
1389
1390         static const float _1_3 = 1.0f / 3.0f;
1391         static const float _a = 16.0f / 116.0f;
1392         for (i = 0; i < n; i += 3, src += scn )
1393         {
1394             float R = clip(src[0]);
1395             float G = clip(src[1]);
1396             float B = clip(src[2]);
1397
1398 //            CV_Assert(R >= 0.0f && R <= 1.0f);
1399 //            CV_Assert(G >= 0.0f && G <= 1.0f);
1400 //            CV_Assert(B >= 0.0f && B <= 1.0f);
1401
1402             if (gammaTab)
1403             {
1404                 R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
1405                 G = splineInterpolate(G * gscale, gammaTab, GAMMA_TAB_SIZE);
1406                 B = splineInterpolate(B * gscale, gammaTab, GAMMA_TAB_SIZE);
1407             }
1408             float X = R*C0 + G*C1 + B*C2;
1409             float Y = R*C3 + G*C4 + B*C5;
1410             float Z = R*C6 + G*C7 + B*C8;
1411
1412             float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a);
1413             float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a);
1414             float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a);
1415
1416             float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
1417             float a = 500.f * (FX - FY);
1418             float b = 200.f * (FY - FZ);
1419
1420             dst[i] = L;
1421             dst[i + 1] = a;
1422             dst[i + 2] = b;
1423         }
1424     }
1425
1426     int srccn;
1427     float coeffs[9];
1428     bool srgb;
1429 };
1430
1431 struct Lab2RGB_f
1432 {
1433     typedef float channel_type;
1434
1435     Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
1436               const float* _whitept, bool _srgb )
1437     : dstcn(_dstcn), srgb(_srgb), blueInd(blueIdx)
1438     {
1439         initLabTabs();
1440
1441         if(!_coeffs)
1442             _coeffs = XYZ2sRGB_D65;
1443         if(!_whitept)
1444             _whitept = D65;
1445
1446         for( int i = 0; i < 3; i++ )
1447         {
1448             coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i];
1449             coeffs[i+3] = _coeffs[i+3]*_whitept[i];
1450             coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];
1451         }
1452     }
1453
1454     void operator()(const float* src, float* dst, int n) const
1455     {
1456         int i, dcn = dstcn;
1457         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
1458         float gscale = GammaTabScale;
1459         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1460         C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1461         C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
1462         float alpha = ColorChannel<float>::max();
1463         n *= 3;
1464
1465         static const float lThresh = 0.008856f * 903.3f;
1466         static const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
1467         for (i = 0; i < n; i += 3, dst += dcn)
1468         {
1469             float li = src[i];
1470             float ai = src[i + 1];
1471             float bi = src[i + 2];
1472
1473             float y, fy;
1474             if (li <= lThresh)
1475             {
1476                 y = li / 903.3f;
1477                 fy = 7.787f * y + 16.0f / 116.0f;
1478             }
1479             else
1480             {
1481                 fy = (li + 16.0f) / 116.0f;
1482                 y = fy * fy * fy;
1483             }
1484
1485             float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
1486
1487             for (int j = 0; j < 2; j++)
1488                 if (fxz[j] <= fThresh)
1489                     fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
1490                 else
1491                     fxz[j] = fxz[j] * fxz[j] * fxz[j];
1492
1493
1494             float x = fxz[0], z = fxz[1];
1495             float ro = clip(C0 * x + C1 * y + C2 * z);
1496             float go = clip(C3 * x + C4 * y + C5 * z);
1497             float bo = clip(C6 * x + C7 * y + C8 * z);
1498
1499 //            CV_Assert(ro >= 0.0f && ro <= 1.0f);
1500 //            CV_Assert(go >= 0.0f && go <= 1.0f);
1501 //            CV_Assert(bo >= 0.0f && bo <= 1.0f);
1502
1503             if (gammaTab)
1504             {
1505                 ro = splineInterpolate(ro * gscale, gammaTab, GAMMA_TAB_SIZE);
1506                 go = splineInterpolate(go * gscale, gammaTab, GAMMA_TAB_SIZE);
1507                 bo = splineInterpolate(bo * gscale, gammaTab, GAMMA_TAB_SIZE);
1508             }
1509
1510             dst[0] = ro, dst[1] = go, dst[2] = bo;
1511             if( dcn == 4 )
1512                 dst[3] = alpha;
1513         }
1514     }
1515
1516     int dstcn;
1517     float coeffs[9];
1518     bool srgb;
1519     int blueInd;
1520 };
1521
1522 #undef clip
1523
1524 struct Lab2RGB_b
1525 {
1526     typedef uchar channel_type;
1527
1528     Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
1529                const float* _whitept, bool _srgb )
1530     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {}
1531
1532     void operator()(const uchar* src, uchar* dst, int n) const
1533     {
1534         int i, j, dcn = dstcn;
1535         uchar alpha = ColorChannel<uchar>::max();
1536         float buf[3*BLOCK_SIZE];
1537
1538         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
1539         {
1540             int dn = std::min(n - i, (int)BLOCK_SIZE);
1541
1542             for( j = 0; j < dn*3; j += 3 )
1543             {
1544                 buf[j] = src[j]*(100.f/255.f);
1545                 buf[j+1] = (float)(src[j+1] - 128);
1546                 buf[j+2] = (float)(src[j+2] - 128);
1547             }
1548             cvt(buf, buf, dn);
1549
1550             for( j = 0; j < dn*3; j += 3, dst += dcn )
1551             {
1552                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
1553                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
1554                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
1555                 if( dcn == 4 )
1556                     dst[3] = alpha;
1557             }
1558         }
1559     }
1560
1561     int dstcn;
1562     Lab2RGB_f cvt;
1563 };
1564
1565
1566 ///////////////////////////////////// RGB <-> L*u*v* /////////////////////////////////////
1567
1568 struct RGB2Luv_f
1569 {
1570     typedef float channel_type;
1571
1572     RGB2Luv_f( int _srccn, int blueIdx, const float* _coeffs,
1573                const float* whitept, bool _srgb )
1574     : srccn(_srccn), srgb(_srgb)
1575     {
1576         volatile int i;
1577         initLabTabs();
1578
1579         if(!_coeffs) _coeffs = sRGB2XYZ_D65;
1580         if(!whitept) whitept = D65;
1581
1582         for( i = 0; i < 3; i++ )
1583         {
1584             coeffs[i*3] = _coeffs[i*3];
1585             coeffs[i*3+1] = _coeffs[i*3+1];
1586             coeffs[i*3+2] = _coeffs[i*3+2];
1587             if( blueIdx == 0 )
1588                 std::swap(coeffs[i*3], coeffs[i*3+2]);
1589             CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
1590                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f );
1591         }
1592
1593         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
1594         un = 4*whitept[0]*d;
1595         vn = 9*whitept[1]*d;
1596
1597         CV_Assert(whitept[1] == 1.f);
1598     }
1599
1600     void operator()(const float* src, float* dst, int n) const
1601     {
1602         int i, scn = srccn;
1603         float gscale = GammaTabScale;
1604         const float* gammaTab = srgb ? sRGBGammaTab : 0;
1605         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1606               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1607               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
1608         float _un = 13*un, _vn = 13*vn;
1609         n *= 3;
1610
1611         for( i = 0; i < n; i += 3, src += scn )
1612         {
1613             float R = src[0], G = src[1], B = src[2];
1614             if( gammaTab )
1615             {
1616                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
1617                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
1618                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
1619             }
1620
1621             float X = R*C0 + G*C1 + B*C2;
1622             float Y = R*C3 + G*C4 + B*C5;
1623             float Z = R*C6 + G*C7 + B*C8;
1624
1625             float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
1626             L = 116.f*L - 16.f;
1627
1628             float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
1629             float u = L*(X*d - _un);
1630             float v = L*((9*0.25f)*Y*d - _vn);
1631
1632             dst[i] = L; dst[i+1] = u; dst[i+2] = v;
1633         }
1634     }
1635
1636     int srccn;
1637     float coeffs[9], un, vn;
1638     bool srgb;
1639 };
1640
1641
1642 struct Luv2RGB_f
1643 {
1644     typedef float channel_type;
1645
1646     Luv2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
1647               const float* whitept, bool _srgb )
1648     : dstcn(_dstcn), srgb(_srgb)
1649     {
1650         initLabTabs();
1651
1652         if(!_coeffs) _coeffs = XYZ2sRGB_D65;
1653         if(!whitept) whitept = D65;
1654
1655         for( int i = 0; i < 3; i++ )
1656         {
1657             coeffs[i+(blueIdx^2)*3] = _coeffs[i];
1658             coeffs[i+3] = _coeffs[i+3];
1659             coeffs[i+blueIdx*3] = _coeffs[i+6];
1660         }
1661
1662         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
1663         un = 4*whitept[0]*d;
1664         vn = 9*whitept[1]*d;
1665
1666         CV_Assert(whitept[1] == 1.f);
1667     }
1668
1669     void operator()(const float* src, float* dst, int n) const
1670     {
1671         int i, dcn = dstcn;
1672         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
1673         float gscale = GammaTabScale;
1674         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1675               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1676               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
1677         float alpha = ColorChannel<float>::max();
1678         float _un = un, _vn = vn;
1679         n *= 3;
1680
1681         for( i = 0; i < n; i += 3, dst += dcn )
1682         {
1683             float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z;
1684             Y = (L + 16.f) * (1.f/116.f);
1685             Y = Y*Y*Y;
1686             d = (1.f/13.f)/L;
1687             u = u*d + _un;
1688             v = v*d + _vn;
1689             float iv = 1.f/v;
1690             X = 2.25f * u * Y * iv ;
1691             Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
1692
1693             float R = X*C0 + Y*C1 + Z*C2;
1694             float G = X*C3 + Y*C4 + Z*C5;
1695             float B = X*C6 + Y*C7 + Z*C8;
1696
1697             if( gammaTab )
1698             {
1699                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
1700                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
1701                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
1702             }
1703
1704             dst[0] = R; dst[1] = G; dst[2] = B;
1705             if( dcn == 4 )
1706                 dst[3] = alpha;
1707         }
1708     }
1709
1710     int dstcn;
1711     float coeffs[9], un, vn;
1712     bool srgb;
1713 };
1714
1715
1716 struct RGB2Luv_b
1717 {
1718     typedef uchar channel_type;
1719
1720     RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,
1721                const float* _whitept, bool _srgb )
1722     : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb) {}
1723
1724     void operator()(const uchar* src, uchar* dst, int n) const
1725     {
1726         int i, j, scn = srccn;
1727         float buf[3*BLOCK_SIZE];
1728
1729         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
1730         {
1731             int dn = std::min(n - i, (int)BLOCK_SIZE);
1732
1733             for( j = 0; j < dn*3; j += 3, src += scn )
1734             {
1735                 buf[j] = src[0]*(1.f/255.f);
1736                 buf[j+1] = (float)(src[1]*(1.f/255.f));
1737                 buf[j+2] = (float)(src[2]*(1.f/255.f));
1738             }
1739             cvt(buf, buf, dn);
1740
1741             for( j = 0; j < dn*3; j += 3 )
1742             {
1743                 dst[j] = saturate_cast<uchar>(buf[j]*2.55f);
1744                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*0.72033898305084743f + 96.525423728813564f);
1745                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*0.99609375f + 139.453125f);
1746             }
1747         }
1748     }
1749
1750     int srccn;
1751     RGB2Luv_f cvt;
1752 };
1753
1754
1755 struct Luv2RGB_b
1756 {
1757     typedef uchar channel_type;
1758
1759     Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
1760                const float* _whitept, bool _srgb )
1761     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {}
1762
1763     void operator()(const uchar* src, uchar* dst, int n) const
1764     {
1765         int i, j, dcn = dstcn;
1766         uchar alpha = ColorChannel<uchar>::max();
1767         float buf[3*BLOCK_SIZE];
1768
1769         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
1770         {
1771             int dn = std::min(n - i, (int)BLOCK_SIZE);
1772
1773             for( j = 0; j < dn*3; j += 3 )
1774             {
1775                 buf[j] = src[j]*(100.f/255.f);
1776                 buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f);
1777                 buf[j+2] = (float)(src[j+2]*1.003921568627451f - 140.f);
1778             }
1779             cvt(buf, buf, dn);
1780
1781             for( j = 0; j < dn*3; j += 3, dst += dcn )
1782             {
1783                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
1784                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
1785                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
1786                 if( dcn == 4 )
1787                     dst[3] = alpha;
1788             }
1789         }
1790     }
1791
1792     int dstcn;
1793     Luv2RGB_f cvt;
1794 };
1795
1796
1797 //////////////////////////// Bayer Pattern -> RGB conversion /////////////////////////////
1798
// Fallback Bayer interpolator for element types without a SIMD version.
// Both methods ignore their arguments, touch no memory and return 0, which
// is also what the SSE2 bayer2Gray in this file returns when SIMD is not
// available.
template<typename T>
class SIMDBayerStubInterpolator_
{
public:
    int bayer2Gray(const T*, int, T*, int, int, int, int) const
    {
        const int handled = 0;
        return handled;
    }

    int bayer2RGB(const T*, int, T*, int, int) const
    {
        const int handled = 0;
        return handled;
    }
};
1813
1814 #if CV_SSE2
// SSE2 row kernels for demosaicing 8-bit Bayer data.
// Each method reads three consecutive source rows (bayer, bayer + bayer_step,
// bayer + 2*bayer_step), produces 14 output pixels per loop iteration and
// returns how many pixels it produced, so the caller can finish the rest of the
// row with its scalar code. Both methods return 0 when use_simd is false.
1815 class SIMDBayerInterpolator_8u
1816 {
1817 public:
         // Stores the result of checkHardwareSupport(CV_CPU_SSE2) in use_simd.
1818     SIMDBayerInterpolator_8u()
1819     {
1820         use_simd = checkHardwareSupport(CV_CPU_SSE2);
1821     }
1822
         // Grayscale kernel: writes one weighted-sum value per pixel to dst.
         //   bayer/bayer_step  top source row and the distance (in bytes) between rows
         //   dst               first output pixel of the row
         //   width             number of pixels available in the row
         //   bcoeff/gcoeff/rcoeff  fixed-point channel weights supplied by the caller
         // Returns the number of pixels written (a multiple of 14, possibly 0).
1823     int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
1824                    int width, int bcoeff, int gcoeff, int rcoeff) const
1825     {
1826         if( !use_simd )
1827             return 0;
1828
             // The weights are doubled and applied with _mm_mulhi_epi16 (high 16 bits
             // of each 16x16 product). Note that _b2y is built from rcoeff and _r2y
             // from bcoeff: _b2y weights the samples taken from rows 0 and 2 (b0/b1
             // below) and _r2y the samples taken from the high bytes of row 1.
1829         __m128i _b2y = _mm_set1_epi16((short)(rcoeff*2));
1830         __m128i _g2y = _mm_set1_epi16((short)(gcoeff*2));
1831         __m128i _r2y = _mm_set1_epi16((short)(bcoeff*2));
1832         const uchar* bayer_end = bayer + width;
1833
             // Every iteration loads 16 bytes from each of the three rows and stores
             // 16 bytes, but advances by only 14 pixels; the bound keeps bayer + 18
             // within bayer_end.
1834         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
1835         {
1836             __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
1837             __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
1838             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
1839
                 // Low byte of every 16-bit lane of rows 0 and 2, pre-scaled by 2
                 // (shift left by 8, then right by 7).
1840             __m128i b1 = _mm_add_epi16(_mm_srli_epi16(_mm_slli_epi16(r0, 8), 7),
1841                                        _mm_srli_epi16(_mm_slli_epi16(r2, 8), 7));
1842             __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2));
1843             b1 = _mm_slli_epi16(_mm_srli_si128(b1, 2), 1);
1844
                 // High bytes of rows 0 and 2 plus low bytes of row 1 (and of the next lane).
1845             __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 7), _mm_srli_epi16(r2, 7));
1846             __m128i g1 = _mm_srli_epi16(_mm_slli_epi16(r1, 8), 7);
1847             g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
1848             g1 = _mm_slli_epi16(_mm_srli_si128(g1, 2), 2);
1849
                 // High bytes of row 1.
1850             r0 = _mm_srli_epi16(r1, 8);
1851             r1 = _mm_slli_epi16(_mm_add_epi16(r0, _mm_srli_si128(r0, 2)), 2);
1852             r0 = _mm_slli_epi16(r0, 3);
1853
                 // Weighted sums: g0 accumulates the first pixel of every lane pair,
                 // g1 the second one.
1854             g0 = _mm_add_epi16(_mm_mulhi_epi16(b0, _b2y), _mm_mulhi_epi16(g0, _g2y));
1855             g1 = _mm_add_epi16(_mm_mulhi_epi16(b1, _b2y), _mm_mulhi_epi16(g1, _g2y));
1856             g0 = _mm_add_epi16(g0, _mm_mulhi_epi16(r0, _r2y));
1857             g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(r1, _r2y));
1858             g0 = _mm_srli_epi16(g0, 2);
1859             g1 = _mm_srli_epi16(g1, 2);
                 // Saturate to 8 bits and interleave the two result streams.
1860             g0 = _mm_packus_epi16(g0, g0);
1861             g1 = _mm_packus_epi16(g1, g1);
1862             g0 = _mm_unpacklo_epi8(g0, g1);
1863             _mm_storeu_si128((__m128i*)dst, g0);
1864         }
1865
             // bayer_end - width is the value bayer had on entry, so this is the
             // number of pixels consumed by the loop above.
1866         return (int)(bayer - (bayer_end - width));
1867     }
1868
         // Colour kernel: writes interleaved 3-channel pixels to dst.
         //   dst   is addressed from dst-1, i.e. the caller passes a pointer to the
         //         middle channel of the first output pixel
         //   blue  when negative, the two outer channels are exchanged before storing
         // Returns the number of pixels written (a multiple of 14, possibly 0).
1869     int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
1870     {
1871         if( !use_simd )
1872             return 0;
1873         /*
1874          B G B G | B G B G | B G B G | B G B G
1875          G R G R | G R G R | G R G R | G R G R
1876          B G B G | B G B G | B G B G | B G B G
1877          */
             // delta1/delta2 are the rounding terms added before the >>1 and >>2 averages.
1878         __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
             // mask is all ones when blue < 0 and zero otherwise; it drives the XOR swap below.
1879         __m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128();
             // masklo keeps the low byte of every 16-bit lane.
1880         __m128i masklo = _mm_set1_epi16(0x00ff);
1881         const uchar* bayer_end = bayer + width;
1882
             // 14 source pixels -> 42 output bytes per iteration.
1883         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
1884         {
1885             __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
1886             __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
1887             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
1888
                 // Averages of the low-byte samples of rows 0 and 2:
                 // four samples (>>2) for b0, two samples (>>1) for b1.
1889             __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo));
1890             __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2));
1891             b1 = _mm_srli_si128(b1, 2);
1892             b1 = _mm_srli_epi16(_mm_add_epi16(b1, delta1), 1);
1893             b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
1894             b0 = _mm_packus_epi16(b0, b1);
1895
                 // g0: average of four neighbouring samples; g1: row-1 sample copied as is.
1896             __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8));
1897             __m128i g1 = _mm_and_si128(r1, masklo);
1898             g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
1899             g1 = _mm_srli_si128(g1, 2);
1900             g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2);
1901             g0 = _mm_packus_epi16(g0, g1);
1902
                 // r0: high-byte samples of row 1 copied as is; r1: average of two of them.
1903             r0 = _mm_srli_epi16(r1, 8);
1904             r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
1905             r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
1906             r0 = _mm_packus_epi16(r0, r1);
1907
                 // Conditional swap of b0 and r0: with mask == 0 both stay unchanged,
                 // with mask == all ones they are exchanged.
1908             b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
1909             b0 = _mm_xor_si128(b0, b1);
1910             r0 = _mm_xor_si128(r0, b1);
1911
1912             // b1 g1 b1 g1 ...
1913             b1 = _mm_unpackhi_epi8(b0, g0);
1914             // b0 g0 b2 g2 b4 g4 ....
1915             b0 = _mm_unpacklo_epi8(b0, g0);
1916
1917             // r1 0 r3 0 ...
1918             r1 = _mm_unpackhi_epi8(r0, z);
1919             // r0 0 r2 0 r4 0 ...
1920             r0 = _mm_unpacklo_epi8(r0, z);
1921
1922             // 0 b0 g0 r0 0 b2 g2 r2 0 ...
1923             g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
1924             // 0 b8 g8 r8 0 b10 g10 r10 0 ...
1925             g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
1926
1927             // b1 g1 r1 0 b3 g3 r3 ....
1928             r0 = _mm_unpacklo_epi16(b1, r1);
1929             // b9 g9 r9 0 ...
1930             r1 = _mm_unpackhi_epi16(b1, r1);
1931
1932             b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
1933             b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
1934
                 // Seven 8-byte stores placed 6 bytes apart, starting at dst-1: each
                 // store's last two bytes are overwritten by the following store.
1935             _mm_storel_epi64((__m128i*)(dst-1+0), b0);
1936             _mm_storel_epi64((__m128i*)(dst-1+6*1), _mm_srli_si128(b0, 8));
1937             _mm_storel_epi64((__m128i*)(dst-1+6*2), b1);
1938             _mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8));
1939
1940             g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
1941             g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
1942
1943             _mm_storel_epi64((__m128i*)(dst-1+6*4), g0);
1944             _mm_storel_epi64((__m128i*)(dst-1+6*5), _mm_srli_si128(g0, 8));
1945
1946             _mm_storel_epi64((__m128i*)(dst-1+6*6), g1);
1947         }
1948
             // Number of source pixels consumed (bayer_end - width is the entry value of bayer).
1949         return (int)(bayer - (bayer_end - width));
1950     }
1951
         // Result of the SSE2 support check made in the constructor.
1952     bool use_simd;
1953 };
1954 #else
// Built without CV_SSE2: the 8-bit interpolator is the no-op stub, whose methods always return 0.
1955 typedef SIMDBayerStubInterpolator_<uchar> SIMDBayerInterpolator_8u;
1956 #endif
1957
1958 template<typename T, class SIMDInterpolator>
1959 static void Bayer2Gray_( const Mat& srcmat, Mat& dstmat, int code )
1960 {
1961     SIMDInterpolator vecOp;
1962     const int R2Y = 4899;
1963     const int G2Y = 9617;
1964     const int B2Y = 1868;
1965     const int SHIFT = 14;
1966
1967     const T* bayer0 = (const T*)srcmat.data;
1968     int bayer_step = (int)(srcmat.step/sizeof(T));
1969     T* dst0 = (T*)dstmat.data;
1970     int dst_step = (int)(dstmat.step/sizeof(T));
1971     Size size = srcmat.size();
1972     int bcoeff = B2Y, rcoeff = R2Y;
1973     int start_with_green = code == CV_BayerGB2GRAY || code == CV_BayerGR2GRAY;
1974     bool brow = true;
1975
1976     if( code != CV_BayerBG2GRAY && code != CV_BayerGB2GRAY )
1977     {
1978         brow = false;
1979         std::swap(bcoeff, rcoeff);
1980     }
1981
1982     dst0 += dst_step + 1;
1983     size.height -= 2;
1984     size.width -= 2;
1985
1986     for( ; size.height-- > 0; bayer0 += bayer_step, dst0 += dst_step )
1987     {
1988         unsigned t0, t1, t2;
1989         const T* bayer = bayer0;
1990         T* dst = dst0;
1991         const T* bayer_end = bayer + size.width;
1992
1993         if( size.width <= 0 )
1994         {
1995             dst[-1] = dst[size.width] = 0;
1996             continue;
1997         }
1998
1999         if( start_with_green )
2000         {
2001             t0 = (bayer[1] + bayer[bayer_step*2+1])*rcoeff;
2002             t1 = (bayer[bayer_step] + bayer[bayer_step+2])*bcoeff;
2003             t2 = bayer[bayer_step+1]*(2*G2Y);
2004
2005             dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+1);
2006             bayer++;
2007             dst++;
2008         }
2009
2010         int delta = vecOp.bayer2Gray(bayer, bayer_step, dst, size.width, bcoeff, G2Y, rcoeff);
2011         bayer += delta;
2012         dst += delta;
2013
2014         for( ; bayer <= bayer_end - 2; bayer += 2, dst += 2 )
2015         {
2016             t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] + bayer[bayer_step*2+2])*rcoeff;
2017             t1 = (bayer[1] + bayer[bayer_step] + bayer[bayer_step+2] + bayer[bayer_step*2+1])*G2Y;
2018             t2 = bayer[bayer_step+1]*(4*bcoeff);
2019             dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+2);
2020
2021             t0 = (bayer[2] + bayer[bayer_step*2+2])*rcoeff;
2022             t1 = (bayer[bayer_step+1] + bayer[bayer_step+3])*bcoeff;
2023             t2 = bayer[bayer_step+2]*(2*G2Y);
2024             dst[1] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+1);
2025         }
2026
2027         if( bayer < bayer_end )
2028         {
2029             t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] + bayer[bayer_step*2+2])*rcoeff;
2030             t1 = (bayer[1] + bayer[bayer_step] + bayer[bayer_step+2] + bayer[bayer_step*2+1])*G2Y;
2031             t2 = bayer[bayer_step+1]*(4*bcoeff);
2032             dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+2);
2033             bayer++;
2034             dst++;
2035         }
2036
2037         dst0[-1] = dst0[0];
2038         dst0[size.width] = dst0[size.width-1];
2039
2040         brow = !brow;
2041         std::swap(bcoeff, rcoeff);
2042         start_with_green = !start_with_green;
2043     }
2044
2045     size = dstmat.size();
2046     dst0 = (T*)dstmat.data;
2047     if( size.height > 2 )
2048         for( int i = 0; i < size.width; i++ )
2049         {
2050             dst0[i] = dst0[i + dst_step];
2051             dst0[i + (size.height-1)*dst_step] = dst0[i + (size.height-2)*dst_step];
2052         }
2053     else
2054         for( int i = 0; i < size.width; i++ )
2055         {
2056             dst0[i] = dst0[i + (size.height-1)*dst_step] = 0;
2057         }
2058 }
2059
2060 template<typename T, class SIMDInterpolator>
2061 static void Bayer2RGB_( const Mat& srcmat, Mat& dstmat, int code )
2062 {
2063     SIMDInterpolator vecOp;
2064     const T* bayer0 = (const T*)srcmat.data;
2065     int bayer_step = (int)(srcmat.step/sizeof(T));
2066     T* dst0 = (T*)dstmat.data;
2067     int dst_step = (int)(dstmat.step/sizeof(T));
2068     Size size = srcmat.size();
2069     int blue = code == CV_BayerBG2BGR || code == CV_BayerGB2BGR ? -1 : 1;
2070     int start_with_green = code == CV_BayerGB2BGR || code == CV_BayerGR2BGR;
2071
2072     dst0 += dst_step + 3 + 1;
2073     size.height -= 2;
2074     size.width -= 2;
2075
2076     for( ; size.height-- > 0; bayer0 += bayer_step, dst0 += dst_step )
2077     {
2078         int t0, t1;
2079         const T* bayer = bayer0;
2080         T* dst = dst0;
2081         const T* bayer_end = bayer + size.width;
2082
2083         if( size.width <= 0 )
2084         {
2085             dst[-4] = dst[-3] = dst[-2] = dst[size.width*3-1] =
2086             dst[size.width*3] = dst[size.width*3+1] = 0;
2087             continue;
2088         }
2089
2090         if( start_with_green )
2091         {
2092             t0 = (bayer[1] + bayer[bayer_step*2+1] + 1) >> 1;
2093             t1 = (bayer[bayer_step] + bayer[bayer_step+2] + 1) >> 1;
2094             dst[-blue] = (T)t0;
2095             dst[0] = bayer[bayer_step+1];
2096             dst[blue] = (T)t1;
2097             bayer++;
2098             dst += 3;
2099         }
2100
2101         int delta = vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
2102         bayer += delta;
2103         dst += delta*3;
2104
2105         if( blue > 0 )
2106         {
2107             for( ; bayer <= bayer_end - 2; bayer += 2, dst += 6 )
2108             {
2109                 t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
2110                       bayer[bayer_step*2+2] + 2) >> 2;
2111                 t1 = (bayer[1] + bayer[bayer_step] +
2112                       bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
2113                 dst[-1] = (T)t0;
2114                 dst[0] = (T)t1;
2115                 dst[1] = bayer[bayer_step+1];
2116
2117                 t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
2118                 t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
2119                 dst[2] = (T)t0;
2120                 dst[3] = bayer[bayer_step+2];
2121                 dst[4] = (T)t1;
2122             }
2123         }
2124         else
2125         {
2126             for( ; bayer <= bayer_end - 2; bayer += 2, dst += 6 )
2127             {
2128                 t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
2129                       bayer[bayer_step*2+2] + 2) >> 2;
2130                 t1 = (bayer[1] + bayer[bayer_step] +
2131                       bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
2132                 dst[1] = (T)t0;
2133                 dst[0] = (T)t1;
2134                 dst[-1] = bayer[bayer_step+1];
2135
2136                 t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
2137                 t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
2138                 dst[4] = (T)t0;
2139                 dst[3] = bayer[bayer_step+2];
2140                 dst[2] = (T)t1;
2141             }
2142         }
2143
2144         if( bayer < bayer_end )
2145         {
2146             t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
2147                   bayer[bayer_step*2+2] + 2) >> 2;
2148             t1 = (bayer[1] + bayer[bayer_step] +
2149                   bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
2150             dst[-blue] = (T)t0;
2151             dst[0] = (T)t1;
2152             dst[blue] = bayer[bayer_step+1];
2153             bayer++;
2154             dst += 3;
2155         }
2156
2157         dst0[-4] = dst0[-1];
2158         dst0[-3] = dst0[0];
2159         dst0[-2] = dst0[1];
2160         dst0[size.width*3-1] = dst0[size.width*3-4];
2161         dst0[size.width*3] = dst0[size.width*3-3];
2162         dst0[size.width*3+1] = dst0[size.width*3-2];
2163
2164         blue = -blue;
2165         start_with_green = !start_with_green;
2166     }
2167
2168     size = dstmat.size();
2169     dst0 = (T*)dstmat.data;
2170     if( size.height > 2 )
2171         for( int i = 0; i < size.width*3; i++ )
2172         {
2173             dst0[i] = dst0[i + dst_step];
2174             dst0[i + (size.height-1)*dst_step] = dst0[i + (size.height-2)*dst_step];
2175         }
2176     else
2177         for( int i = 0; i < size.width*3; i++ )
2178         {
2179             dst0[i] = dst0[i + (size.height-1)*dst_step] = 0;
2180         }
2181 }
2182
2183
2184 /////////////////// Demosaicing using Variable Number of Gradients ///////////////////////
2185
2186 static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
2187 {
2188     const uchar* bayer = srcmat.data;
2189     int bstep = (int)srcmat.step;
2190     uchar* dst = dstmat.data;
2191     int dststep = (int)dstmat.step;
2192     Size size = srcmat.size();
2193
2194     int blueIdx = code == CV_BayerBG2BGR_VNG || code == CV_BayerGB2BGR_VNG ? 0 : 2;
2195     bool greenCell0 = code != CV_BayerBG2BGR_VNG && code != CV_BayerRG2BGR_VNG;
2196
2197     // for too small images use the simple interpolation algorithm
2198     if( MIN(size.width, size.height) < 8 )
2199     {
2200         Bayer2RGB_<uchar, SIMDBayerInterpolator_8u>( srcmat, dstmat, code );
2201         return;
2202     }
2203
2204     const int brows = 3, bcn = 7;
2205     int N = size.width, N2 = N*2, N3 = N*3, N4 = N*4, N5 = N*5, N6 = N*6, N7 = N*7;
2206     int i, bufstep = N7*bcn;
2207     cv::AutoBuffer<ushort> _buf(bufstep*brows);
2208     ushort* buf = (ushort*)_buf;
2209
2210     bayer += bstep*2;
2211
2212 #if CV_SSE2
2213     bool haveSSE = cv::checkHardwareSupport(CV_CPU_SSE2);
2214     #define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a))
2215 #endif
2216
2217     for( int y = 2; y < size.height - 4; y++ )
2218     {
2219         uchar* dstrow = dst + dststep*y + 6;
2220         const uchar* srow;
2221
2222         for( int dy = (y == 2 ? -1 : 1); dy <= 1; dy++ )
2223         {
2224             ushort* brow = buf + ((y + dy - 1)%brows)*bufstep + 1;
2225             srow = bayer + (y+dy)*bstep + 1;
2226
2227             for( i = 0; i < bcn; i++ )
2228                 brow[N*i-1] = brow[(N-2) + N*i] = 0;
2229
2230             i = 1;
2231
2232 #if CV_SSE2
2233             if( haveSSE )
2234             {
2235                 __m128i z = _mm_setzero_si128();
2236                 for( ; i <= N-9; i += 8, srow += 8, brow += 8 )
2237                 {
2238                     __m128i s1, s2, s3, s4, s6, s7, s8, s9;
2239
2240                     s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1-bstep)),z);
2241                     s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)),z);
2242                     s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1-bstep)),z);
2243
2244                     s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)),z);
2245                     s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)),z);
2246
2247                     s7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1+bstep)),z);
2248                     s8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)),z);
2249                     s9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1+bstep)),z);
2250
2251                     __m128i b0, b1, b2, b3, b4, b5, b6;
2252
2253                     b0 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s2,s8),1),
2254                                         _mm_adds_epu16(_mm_absdiff_epu16(s1, s7),
2255                                                        _mm_absdiff_epu16(s3, s9)));
2256                     b1 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s4,s6),1),
2257                                         _mm_adds_epu16(_mm_absdiff_epu16(s1, s3),
2258                                                        _mm_absdiff_epu16(s7, s9)));
2259                     b2 = _mm_slli_epi16(_mm_absdiff_epu16(s3,s7),1);
2260                     b3 = _mm_slli_epi16(_mm_absdiff_epu16(s1,s9),1);
2261
2262                     _mm_storeu_si128((__m128i*)brow, b0);
2263                     _mm_storeu_si128((__m128i*)(brow + N), b1);
2264                     _mm_storeu_si128((__m128i*)(brow + N2), b2);
2265                     _mm_storeu_si128((__m128i*)(brow + N3), b3);
2266
2267                     b4 = _mm_adds_epu16(b2,_mm_adds_epu16(_mm_absdiff_epu16(s2, s4),
2268                                                           _mm_absdiff_epu16(s6, s8)));
2269                     b5 = _mm_adds_epu16(b3,_mm_adds_epu16(_mm_absdiff_epu16(s2, s6),
2270                                                           _mm_absdiff_epu16(s4, s8)));
2271                     b6 = _mm_adds_epu16(_mm_adds_epu16(s2, s4), _mm_adds_epu16(s6, s8));
2272                     b6 = _mm_srli_epi16(b6, 1);
2273
2274                     _mm_storeu_si128((__m128i*)(brow + N4), b4);
2275                     _mm_storeu_si128((__m128i*)(brow + N5), b5);
2276                     _mm_storeu_si128((__m128i*)(brow + N6), b6);
2277                 }
2278             }
2279 #endif
2280
2281             for( ; i < N-1; i++, srow++, brow++ )
2282             {
2283                 brow[0] = (ushort)(std::abs(srow[-1-bstep] - srow[-1+bstep]) +
2284                                    std::abs(srow[-bstep] - srow[+bstep])*2 +
2285                                    std::abs(srow[1-bstep] - srow[1+bstep]));
2286                 brow[N] = (ushort)(std::abs(srow[-1-bstep] - srow[1-bstep]) +
2287                                    std::abs(srow[-1] - srow[1])*2 +
2288                                    std::abs(srow[-1+bstep] - srow[1+bstep]));
2289                 brow[N2] = (ushort)(std::abs(srow[+1-bstep] - srow[-1+bstep])*2);
2290                 brow[N3] = (ushort)(std::abs(srow[-1-bstep] - srow[1+bstep])*2);
2291                 brow[N4] = (ushort)(brow[N2] + std::abs(srow[-bstep] - srow[-1]) +
2292                                     std::abs(srow[+bstep] - srow[1]));
2293                 brow[N5] = (ushort)(brow[N3] + std::abs(srow[-bstep] - srow[1]) +
2294                                     std::abs(srow[+bstep] - srow[-1]));
2295                 brow[N6] = (ushort)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1);
2296             }
2297         }
2298
2299         const ushort* brow0 = buf + ((y - 2) % brows)*bufstep + 2;
2300         const ushort* brow1 = buf + ((y - 1) % brows)*bufstep + 2;
2301         const ushort* brow2 = buf + (y % brows)*bufstep + 2;
2302         static const float scale[] = { 0.f, 0.5f, 0.25f, 0.1666666666667f, 0.125f, 0.1f, 0.08333333333f, 0.0714286f, 0.0625f };
2303         srow = bayer + y*bstep + 2;
2304         bool greenCell = greenCell0;
2305
2306         i = 2;
2307 #if CV_SSE2
2308         int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2;
2309 #else
2310         int limit = N - 2;
2311 #endif
2312
2313         do
2314         {
2315             for( ; i < limit; i++, srow++, brow0++, brow1++, brow2++, dstrow += 3 )
2316             {
2317                 int gradN = brow0[0] + brow1[0];
2318                 int gradS = brow1[0] + brow2[0];
2319                 int gradW = brow1[N-1] + brow1[N];
2320                 int gradE = brow1[N] + brow1[N+1];
2321                 int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
2322                 int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
2323                 int R, G, B;
2324
2325                 if( !greenCell )
2326                 {
2327                     int gradNE = brow0[N4+1] + brow1[N4];
2328                     int gradSW = brow1[N4] + brow2[N4-1];
2329                     int gradNW = brow0[N5-1] + brow1[N5];
2330                     int gradSE = brow1[N5] + brow2[N5+1];
2331
2332                     minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);
2333                     maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);
2334                     int T = minGrad + MAX(maxGrad/2, 1);
2335
2336                     int Rs = 0, Gs = 0, Bs = 0, ng = 0;
2337                     if( gradN < T )
2338                     {
2339                         Rs += srow[-bstep*2] + srow[0];
2340                         Gs += srow[-bstep]*2;
2341                         Bs += srow[-bstep-1] + srow[-bstep+1];
2342                         ng++;
2343                     }
2344                     if( gradS < T )
2345                     {
2346                         Rs += srow[bstep*2] + srow[0];
2347                         Gs += srow[bstep]*2;
2348                         Bs += srow[bstep-1] + srow[bstep+1];
2349                         ng++;
2350                     }
2351                     if( gradW < T )
2352                     {
2353                         Rs += srow[-2] + srow[0];
2354                         Gs += srow[-1]*2;
2355                         Bs += srow[-bstep-1] + srow[bstep-1];
2356                         ng++;
2357                     }
2358                     if( gradE < T )
2359                     {
2360                         Rs += srow[2] + srow[0];
2361                         Gs += srow[1]*2;
2362                         Bs += srow[-bstep+1] + srow[bstep+1];
2363                         ng++;
2364                     }
2365                     if( gradNE < T )
2366                     {
2367                         Rs += srow[-bstep*2+2] + srow[0];
2368                         Gs += brow0[N6+1];
2369                         Bs += srow[-bstep+1]*2;
2370                         ng++;
2371                     }
2372                     if( gradSW < T )
2373                     {
2374                         Rs += srow[bstep*2-2] + srow[0];
2375                         Gs += brow2[N6-1];
2376                         Bs += srow[bstep-1]*2;
2377                         ng++;
2378                     }
2379                     if( gradNW < T )
2380                     {
2381                         Rs += srow[-bstep*2-2] + srow[0];
2382                         Gs += brow0[N6-1];
2383                         Bs += srow[-bstep+1]*2;
2384                         ng++;
2385                     }
2386                     if( gradSE < T )
2387                     {
2388                         Rs += srow[bstep*2+2] + srow[0];
2389                         Gs += brow2[N6+1];
2390                         Bs += srow[-bstep+1]*2;
2391                         ng++;
2392                     }
2393                     R = srow[0];
2394                     G = R + cvRound((Gs - Rs)*scale[ng]);
2395                     B = R + cvRound((Bs - Rs)*scale[ng]);
2396                 }
2397                 else
2398                 {
2399                     int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
2400                     int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
2401                     int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
2402                     int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
2403
2404                     minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);
2405                     maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);
2406                     int T = minGrad + MAX(maxGrad/2, 1);
2407
2408                     int Rs = 0, Gs = 0, Bs = 0, ng = 0;
2409                     if( gradN < T )
2410                     {
2411                         Rs += srow[-bstep*2-1] + srow[-bstep*2+1];
2412                         Gs += srow[-bstep*2] + srow[0];
2413                         Bs += srow[-bstep]*2;
2414                         ng++;
2415                     }
2416                     if( gradS < T )
2417                     {
2418                         Rs += srow[bstep*2-1] + srow[bstep*2+1];
2419                         Gs += srow[bstep*2] + srow[0];
2420                         Bs += srow[bstep]*2;
2421                         ng++;
2422                     }
2423                     if( gradW < T )
2424                     {
2425                         Rs += srow[-1]*2;
2426                         Gs += srow[-2] + srow[0];
2427                         Bs += srow[-bstep-2]+srow[bstep-2];
2428                         ng++;
2429                     }
2430                     if( gradE < T )
2431                     {
2432                         Rs += srow[1]*2;
2433                         Gs += srow[2] + srow[0];
2434                         Bs += srow[-bstep+2]+srow[bstep+2];
2435                         ng++;
2436                     }
2437                     if( gradNE < T )
2438                     {
2439                         Rs += srow[-bstep*2+1] + srow[1];
2440                         Gs += srow[-bstep+1]*2;
2441                         Bs += srow[-bstep] + srow[-bstep+2];
2442                         ng++;
2443                     }
2444                     if( gradSW < T )
2445                     {
2446                         Rs += srow[bstep*2-1] + srow[-1];
2447                         Gs += srow[bstep-1]*2;
2448                         Bs += srow[bstep] + srow[bstep-2];
2449                         ng++;
2450                     }
2451                     if( gradNW < T )
2452                     {
2453                         Rs += srow[-bstep*2-1] + srow[-1];
2454                         Gs += srow[-bstep-1]*2;
2455                         Bs += srow[-bstep-2]+srow[-bstep];
2456                         ng++;
2457                     }
2458                     if( gradSE < T )
2459                     {
2460                         Rs += srow[bstep*2+1] + srow[1];
2461                         Gs += srow[bstep+1]*2;
2462                         Bs += srow[bstep+2]+srow[bstep];
2463                         ng++;
2464                     }
2465                     G = srow[0];
2466                     R = G + cvRound((Rs - Gs)*scale[ng]);
2467                     B = G + cvRound((Bs - Gs)*scale[ng]);
2468                 }
2469                 dstrow[blueIdx] = CV_CAST_8U(B);
2470                 dstrow[1] = CV_CAST_8U(G);
2471                 dstrow[blueIdx^2] = CV_CAST_8U(R);
2472                 greenCell = !greenCell;
2473             }
2474
2475 #if CV_SSE2
2476             if( !haveSSE )
2477                 break;
2478
2479             __m128i emask    = _mm_set1_epi32(0x0000ffff),
2480                     omask    = _mm_set1_epi32(0xffff0000),
2481                     z        = _mm_setzero_si128(),
2482                     one      = _mm_set1_epi16(1);
2483             __m128 _0_5      = _mm_set1_ps(0.5f);
2484
2485             #define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask)) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
2486             #define _mm_cvtloepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))   //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
2487             #define _mm_cvthiepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))   //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
2488             #define _mm_loadl_u8_s16(ptr, offset) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)((ptr) + (offset))), z) //load 8 uchars to 8 shorts
2489
2490             // process 8 pixels at once
2491             for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
2492             {
2493                 //int gradN = brow0[0] + brow1[0];
2494                 __m128i gradN = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow0), _mm_loadu_si128((__m128i*)brow1));
2495
2496                 //int gradS = brow1[0] + brow2[0];
2497                 __m128i gradS = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow1), _mm_loadu_si128((__m128i*)brow2));
2498
2499                 //int gradW = brow1[N-1] + brow1[N];
2500                 __m128i gradW = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N-1)), _mm_loadu_si128((__m128i*)(brow1+N)));
2501
2502                 //int gradE = brow1[N+1] + brow1[N];
2503                 __m128i gradE = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N+1)), _mm_loadu_si128((__m128i*)(brow1+N)));
2504
2505                 //int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
2506                 //int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
2507                 __m128i minGrad = _mm_min_epi16(_mm_min_epi16(gradN, gradS), _mm_min_epi16(gradW, gradE));
2508                 __m128i maxGrad = _mm_max_epi16(_mm_max_epi16(gradN, gradS), _mm_max_epi16(gradW, gradE));
2509
2510                 __m128i grad0, grad1;
2511
2512                 //int gradNE = brow0[N4+1] + brow1[N4];
2513                 //int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
2514                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N4+1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
2515                 grad1 = _mm_adds_epi16( _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N2)), _mm_loadu_si128((__m128i*)(brow0+N2+1))),
2516                                         _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2+1))));
2517                 __m128i gradNE = _mm_merge_epi16(grad0, grad1);
2518
2519                 //int gradSW = brow1[N4] + brow2[N4-1];
2520                 //int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
2521                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N4-1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
2522                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N2)), _mm_loadu_si128((__m128i*)(brow2+N2-1))),
2523                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2-1))));
2524                 __m128i gradSW = _mm_merge_epi16(grad0, grad1);
2525
2526                 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW);
2527                 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW);
2528
2529                 //int gradNW = brow0[N5-1] + brow1[N5];
2530                 //int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
2531                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N5-1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
2532                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N3)), _mm_loadu_si128((__m128i*)(brow0+N3-1))),
2533                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3-1))));
2534                 __m128i gradNW = _mm_merge_epi16(grad0, grad1);
2535
2536                 //int gradSE = brow1[N5] + brow2[N5+1];
2537                 //int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
2538                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N5+1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
2539                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N3)), _mm_loadu_si128((__m128i*)(brow2+N3+1))),
2540                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3+1))));
2541                 __m128i gradSE = _mm_merge_epi16(grad0, grad1);
2542
2543                 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE);
2544                 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE);
2545
2546                 //int T = minGrad + maxGrad/2;
2547                 __m128i T = _mm_adds_epi16(_mm_max_epi16(_mm_srli_epi16(maxGrad, 1), one), minGrad);
2548
2549                 __m128i RGs = z, GRs = z, Bs = z, ng = z;
2550
2551                 __m128i x0  = _mm_loadl_u8_s16(srow, +0          );
2552                 __m128i x1  = _mm_loadl_u8_s16(srow, -1 - bstep  );
2553                 __m128i x2  = _mm_loadl_u8_s16(srow, -1 - bstep*2);
2554                 __m128i x3  = _mm_loadl_u8_s16(srow,    - bstep  );
2555                 __m128i x4  = _mm_loadl_u8_s16(srow, +1 - bstep*2);
2556                 __m128i x5  = _mm_loadl_u8_s16(srow, +1 - bstep  );
2557                 __m128i x6  = _mm_loadl_u8_s16(srow, +2 - bstep  );
2558                 __m128i x7  = _mm_loadl_u8_s16(srow, +1          );
2559                 __m128i x8  = _mm_loadl_u8_s16(srow, +2 + bstep  );
2560                 __m128i x9  = _mm_loadl_u8_s16(srow, +1 + bstep  );
2561                 __m128i x10 = _mm_loadl_u8_s16(srow, +1 + bstep*2);
2562                 __m128i x11 = _mm_loadl_u8_s16(srow,    + bstep  );
2563                 __m128i x12 = _mm_loadl_u8_s16(srow, -1 + bstep*2);
2564                 __m128i x13 = _mm_loadl_u8_s16(srow, -1 + bstep  );
2565                 __m128i x14 = _mm_loadl_u8_s16(srow, -2 + bstep  );
2566                 __m128i x15 = _mm_loadl_u8_s16(srow, -1          );
2567                 __m128i x16 = _mm_loadl_u8_s16(srow, -2 - bstep  );
2568
2569                 __m128i t0, t1, mask;
2570
2571                 // gradN ***********************************************
2572                 mask = _mm_cmpgt_epi16(T, gradN); // mask = T>gradN
2573                 ng = _mm_sub_epi16(ng, mask);     // ng += (T>gradN)
2574
2575                 t0 = _mm_slli_epi16(x3, 1);                                 // srow[-bstep]*2
2576                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2), x0);  // srow[-bstep*2] + srow[0]
2577
2578                 // RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
2579                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
2580                 // GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
2581                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x2,x4)), mask));
2582                 // Bs  += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
2583                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x5), t0), mask));
2584
2585                 // gradNE **********************************************
2586                 mask = _mm_cmpgt_epi16(T, gradNE); // mask = T>gradNE
2587                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradNE)
2588
2589                 t0 = _mm_slli_epi16(x5, 1);                                    // srow[-bstep+1]*2
2590                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2+2), x0);   // srow[-bstep*2+2] + srow[0]
2591
2592                 // RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
2593                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
2594                 // GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
2595                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)), _mm_adds_epi16(x4,x7)), mask));
2596                 // Bs  += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])}  * (T>gradNE)
2597                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x3,x6)), mask));
2598
2599                 // gradE ***********************************************
2600                 mask = _mm_cmpgt_epi16(T, gradE);  // mask = T>gradE
2601                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradE)
2602
2603                 t0 = _mm_slli_epi16(x7, 1);                         // srow[1]*2
2604                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, 2), x0); // srow[2] + srow[0]
2605
2606                 // RGs += (srow[2] + srow[0]) * (T>gradE)
2607                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
2608                 // GRs += (srow[1]*2) * (T>gradE)
2609                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
2610                 // Bs  += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
2611                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x5,x9), _mm_adds_epi16(x6,x8)), mask));
2612
2613                 // gradSE **********************************************
2614                 mask = _mm_cmpgt_epi16(T, gradSE);  // mask = T>gradSE
2615                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradSE)
2616
2617                 t0 = _mm_slli_epi16(x9, 1);                                 // srow[bstep+1]*2
2618                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2+2), x0); // srow[bstep*2+2] + srow[0]
2619
2620                 // RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
2621                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
2622                 // GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
2623                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)), _mm_adds_epi16(x7,x10)), mask));
2624                 // Bs  += {srow[-bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
2625                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1), _mm_adds_epi16(x8,x11)), mask));
2626
2627                 // gradS ***********************************************
2628                 mask = _mm_cmpgt_epi16(T, gradS);  // mask = T>gradS
2629                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradS)
2630
2631                 t0 = _mm_slli_epi16(x11, 1);                             // srow[bstep]*2
2632                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,bstep*2), x0); // srow[bstep*2]+srow[0]
2633
2634                 // RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
2635                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
2636                 // GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
2637                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x10,x12)), mask));
2638                 // Bs  += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
2639                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x9,x13), t0), mask));
2640
2641                 // gradSW **********************************************
2642                 mask = _mm_cmpgt_epi16(T, gradSW);  // mask = T>gradSW
2643                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradSW)
2644
2645                 t0 = _mm_slli_epi16(x13, 1);                                // srow[bstep-1]*2
2646                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2-2), x0); // srow[bstep*2-2]+srow[0]
2647
2648                 // RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
2649                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
2650                 // GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
2651                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)), _mm_adds_epi16(x12,x15)), mask));
2652                 // Bs  += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
2653                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x11,x14)), mask));
2654
2655                 // gradW ***********************************************
2656                 mask = _mm_cmpgt_epi16(T, gradW);  // mask = T>gradW
2657                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradW)
2658
2659                 t0 = _mm_slli_epi16(x15, 1);                         // srow[-1]*2
2660                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -2), x0); // srow[-2]+srow[0]
2661
2662                 // RGs += (srow[-2]+srow[0]) * (T>gradW)
2663                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
2664                 // GRs += (srow[-1]*2) * (T>gradW)
2665                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
2666                 // Bs  += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
2667                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x13), _mm_adds_epi16(x14,x16)), mask));
2668
2669                 // gradNW **********************************************
2670                 mask = _mm_cmpgt_epi16(T, gradNW);  // mask = T>gradNW
2671                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradNW)
2672
2673                 t0 = _mm_slli_epi16(x1, 1);                                 // srow[-bstep-1]*2
2674                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,-bstep*2-2), x0); // srow[-bstep*2-2]+srow[0]
2675
2676                 // RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
2677                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
2678                 // GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
2679                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)), _mm_adds_epi16(x2,x15)), mask));
2680                 // Bs  += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
2681                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1),_mm_adds_epi16(x3,x16)), mask));
2682
2683                 __m128 ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng));
2684                 __m128 ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng));
2685
2686                 // now interpolate r, g & b
2687                 t0 = _mm_subs_epi16(GRs, RGs);
2688                 t1 = _mm_subs_epi16(Bs, RGs);
2689
2690                 t0 = _mm_add_epi16(x0, _mm_packs_epi32(
2691                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t0), ngf0)),
2692                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t0), ngf1))));
2693
2694                 t1 = _mm_add_epi16(x0, _mm_packs_epi32(
2695                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t1), ngf0)),
2696                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t1), ngf1))));
2697
2698                 x1 = _mm_merge_epi16(x0, t0);
2699                 x2 = _mm_merge_epi16(t0, x0);
2700
2701                 uchar R[8], G[8], B[8];
2702
2703                 _mm_storel_epi64(blueIdx ? (__m128i*)B : (__m128i*)R, _mm_packus_epi16(x1, z));
2704                 _mm_storel_epi64((__m128i*)G, _mm_packus_epi16(x2, z));
2705                 _mm_storel_epi64(blueIdx ? (__m128i*)R : (__m128i*)B, _mm_packus_epi16(t1, z));
2706
2707                 for( int j = 0; j < 8; j++, dstrow += 3 )
2708                 {
2709                     dstrow[0] = B[j]; dstrow[1] = G[j]; dstrow[2] = R[j];
2710                 }
2711             }
2712 #endif
2713
2714             limit = N - 2;
2715         }
2716         while( i < N - 2 );
2717
2718         for( i = 0; i < 6; i++ )
2719         {
2720             dst[dststep*y + 5 - i] = dst[dststep*y + 8 - i];
2721             dst[dststep*y + (N - 2)*3 + i] = dst[dststep*y + (N - 3)*3 + i];
2722         }
2723
2724         greenCell0 = !greenCell0;
2725         blueIdx ^= 2;
2726     }
2727
2728     for( i = 0; i < size.width*3; i++ )
2729     {
2730         dst[i] = dst[i + dststep] = dst[i + dststep*2];
2731         dst[i + dststep*(size.height-4)] =
2732         dst[i + dststep*(size.height-3)] =
2733         dst[i + dststep*(size.height-2)] =
2734         dst[i + dststep*(size.height-1)] = dst[i + dststep*(size.height-5)];
2735     }
2736 }
2737
2738 ///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
2739
// Fixed-point ITU-R BT.601 coefficients used by the YUV420 <-> RGB converters below.
// Each coefficient is a real-valued factor scaled by 2^ITUR_BT_601_SHIFT; results are
// brought back to the 8-bit range with ">> ITUR_BT_601_SHIFT" after adding a rounding term.
const int ITUR_BT_601_SHIFT = 20;

// YUV -> RGB direction
const int ITUR_BT_601_CY  =  1220542;   // ~ 1.164 * 2^20, gain applied to (Y - 16)
const int ITUR_BT_601_CUB =  2116026;   // ~ 2.018 * 2^20, (U - 128) contribution to B
const int ITUR_BT_601_CUG =  -409993;   // ~-0.391 * 2^20, (U - 128) contribution to G
const int ITUR_BT_601_CVG =  -852492;   // ~-0.813 * 2^20, (V - 128) contribution to G
const int ITUR_BT_601_CVR =  1673527;   // ~ 1.596 * 2^20, (V - 128) contribution to R

// Coefficients for RGB to YUV420p conversion
const int ITUR_BT_601_CRY =   269484;   // ~ 0.257 * 2^20, R contribution to Y
const int ITUR_BT_601_CGY =   528482;   // ~ 0.504 * 2^20, G contribution to Y
const int ITUR_BT_601_CBY =   102760;   // ~ 0.098 * 2^20, B contribution to Y
const int ITUR_BT_601_CRU =  -155188;   // ~-0.148 * 2^20, R contribution to U
const int ITUR_BT_601_CGU =  -305135;   // ~-0.291 * 2^20, G contribution to U
const int ITUR_BT_601_CBU =   460324;   // ~ 0.439 * 2^20, B contribution to U; the RGB->YUV420p
                                        //   code further down also uses it as the R term of V
const int ITUR_BT_601_CGV =  -385875;   // ~-0.368 * 2^20, G contribution to V
const int ITUR_BT_601_CBV =   -74448;   // ~-0.071 * 2^20, B contribution to V
2756
2757 template<int bIdx, int uIdx>
2758 struct YUV420sp2RGB888Invoker
2759 {
2760     Mat* dst;
2761     const uchar* my1, *muv;
2762     int width, stride;
2763
2764     YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
2765         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
2766
2767     void operator()(const BlockedRange& range) const
2768     {
2769         int rangeBegin = range.begin() * 2;
2770         int rangeEnd = range.end() * 2;
2771
2772         //R = 1.164(Y - 16) + 1.596(V - 128)
2773         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
2774         //B = 1.164(Y - 16)                  + 2.018(U - 128)
2775
2776         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
2777         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
2778         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
2779
2780         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
2781
2782 #ifdef HAVE_TEGRA_OPTIMIZATION
2783         if(tegra::cvtYUV4202RGB(bIdx, uIdx, 3, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
2784             return;
2785 #endif
2786
2787         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
2788         {
2789             uchar* row1 = dst->ptr<uchar>(j);
2790             uchar* row2 = dst->ptr<uchar>(j + 1);
2791             const uchar* y2 = y1 + stride;
2792
2793             for (int i = 0; i < width; i += 2, row1 += 6, row2 += 6)
2794             {
2795                 int u = int(uv[i + 0 + uIdx]) - 128;
2796                 int v = int(uv[i + 1 - uIdx]) - 128;
2797
2798                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
2799                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
2800                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
2801
2802                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
2803                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
2804                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
2805                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
2806
2807                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
2808                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
2809                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
2810                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
2811
2812                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
2813                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
2814                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
2815                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
2816
2817                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
2818                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
2819                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
2820                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
2821             }
2822         }
2823     }
2824 };
2825
2826 template<int bIdx, int uIdx>
2827 struct YUV420sp2RGBA8888Invoker
2828 {
2829     Mat* dst;
2830     const uchar* my1, *muv;
2831     int width, stride;
2832
2833     YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
2834         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
2835
2836     void operator()(const BlockedRange& range) const
2837     {
2838         int rangeBegin = range.begin() * 2;
2839         int rangeEnd = range.end() * 2;
2840
2841         //R = 1.164(Y - 16) + 1.596(V - 128)
2842         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
2843         //B = 1.164(Y - 16)                  + 2.018(U - 128)
2844
2845         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
2846         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
2847         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
2848
2849         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
2850
2851 #ifdef HAVE_TEGRA_OPTIMIZATION
2852         if(tegra::cvtYUV4202RGB(bIdx, uIdx, 4, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
2853             return;
2854 #endif
2855
2856         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
2857         {
2858             uchar* row1 = dst->ptr<uchar>(j);
2859             uchar* row2 = dst->ptr<uchar>(j + 1);
2860             const uchar* y2 = y1 + stride;
2861
2862             for (int i = 0; i < width; i += 2, row1 += 8, row2 += 8)
2863             {
2864                 int u = int(uv[i + 0 + uIdx]) - 128;
2865                 int v = int(uv[i + 1 - uIdx]) - 128;
2866
2867                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
2868                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
2869                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
2870
2871                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
2872                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
2873                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
2874                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
2875                 row1[3]      = uchar(0xff);
2876
2877                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
2878                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
2879                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
2880                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
2881                 row1[7]      = uchar(0xff);
2882
2883                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
2884                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
2885                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
2886                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
2887                 row2[3]      = uchar(0xff);
2888
2889                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
2890                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
2891                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
2892                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
2893                 row2[7]      = uchar(0xff);
2894             }
2895         }
2896     }
2897 };
2898
2899 template<int bIdx>
2900 struct YUV420p2RGB888Invoker
2901 {
2902     Mat* dst;
2903     const uchar* my1, *mu, *mv;
2904     int width, stride;
2905     int ustepIdx, vstepIdx;
2906
2907     YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
2908         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
2909
2910     void operator()(const BlockedRange& range) const
2911     {
2912         const int rangeBegin = range.begin() * 2;
2913         const int rangeEnd = range.end() * 2;
2914
2915         size_t uvsteps[2] = {width/2, stride - width/2};
2916         int usIdx = ustepIdx, vsIdx = vstepIdx;
2917
2918         const uchar* y1 = my1 + rangeBegin * stride;
2919         const uchar* u1 = mu + (range.begin() / 2) * stride;
2920         const uchar* v1 = mv + (range.begin() / 2) * stride;
2921
2922         if(range.begin() % 2 == 1)
2923         {
2924             u1 += uvsteps[(usIdx++) & 1];
2925             v1 += uvsteps[(vsIdx++) & 1];
2926         }
2927
2928         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
2929         {
2930             uchar* row1 = dst->ptr<uchar>(j);
2931             uchar* row2 = dst->ptr<uchar>(j + 1);
2932             const uchar* y2 = y1 + stride;
2933
2934             for (int i = 0; i < width / 2; i += 1, row1 += 6, row2 += 6)
2935             {
2936                 int u = int(u1[i]) - 128;
2937                 int v = int(v1[i]) - 128;
2938
2939                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
2940                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
2941                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
2942
2943                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
2944                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
2945                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
2946                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
2947
2948                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
2949                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
2950                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
2951                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
2952
2953                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
2954                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
2955                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
2956                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
2957
2958                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
2959                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
2960                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
2961                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
2962             }
2963         }
2964     }
2965 };
2966
2967 template<int bIdx>
2968 struct YUV420p2RGBA8888Invoker
2969 {
2970     Mat* dst;
2971     const uchar* my1, *mu, *mv;
2972     int width, stride;
2973     int ustepIdx, vstepIdx;
2974
2975     YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
2976         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
2977
2978     void operator()(const BlockedRange& range) const
2979     {
2980         int rangeBegin = range.begin() * 2;
2981         int rangeEnd = range.end() * 2;
2982
2983         size_t uvsteps[2] = {width/2, stride - width/2};
2984         int usIdx = ustepIdx, vsIdx = vstepIdx;
2985
2986         const uchar* y1 = my1 + rangeBegin * stride;
2987         const uchar* u1 = mu + (range.begin() / 2) * stride;
2988         const uchar* v1 = mv + (range.begin() / 2) * stride;
2989
2990         if(range.begin() % 2 == 1)
2991         {
2992             u1 += uvsteps[(usIdx++) & 1];
2993             v1 += uvsteps[(vsIdx++) & 1];
2994         }
2995
2996         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
2997         {
2998             uchar* row1 = dst->ptr<uchar>(j);
2999             uchar* row2 = dst->ptr<uchar>(j + 1);
3000             const uchar* y2 = y1 + stride;
3001
3002             for (int i = 0; i < width / 2; i += 1, row1 += 8, row2 += 8)
3003             {
3004                 int u = int(u1[i]) - 128;
3005                 int v = int(v1[i]) - 128;
3006
3007                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
3008                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
3009                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
3010
3011                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
3012                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
3013                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
3014                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
3015                 row1[3]      = uchar(0xff);
3016
3017                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
3018                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
3019                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
3020                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
3021                 row1[7]      = uchar(0xff);
3022
3023                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
3024                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
3025                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
3026                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
3027                 row2[3]      = uchar(0xff);
3028
3029                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
3030                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
3031                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
3032                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
3033                 row2[7]      = uchar(0xff);
3034             }
3035         }
3036     }
3037 };
3038
3039 #define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240)
3040
3041 template<int bIdx, int uIdx>
3042 inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
3043 {
3044     YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
3045 #ifdef HAVE_TBB
3046     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
3047         parallel_for(BlockedRange(0, _dst.rows/2), converter);
3048     else
3049 #endif
3050         converter(BlockedRange(0, _dst.rows/2));
3051 }
3052
3053 template<int bIdx, int uIdx>
3054 inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
3055 {
3056     YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
3057 #ifdef HAVE_TBB
3058     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
3059         parallel_for(BlockedRange(0, _dst.rows/2), converter);
3060     else
3061 #endif
3062         converter(BlockedRange(0, _dst.rows/2));
3063 }
3064
3065 template<int bIdx>
3066 inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
3067 {
3068     YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
3069 #ifdef HAVE_TBB
3070     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
3071         parallel_for(BlockedRange(0, _dst.rows/2), converter);
3072     else
3073 #endif
3074         converter(BlockedRange(0, _dst.rows/2));
3075 }
3076
3077 template<int bIdx>
3078 inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
3079 {
3080     YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
3081 #ifdef HAVE_TBB
3082     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
3083         parallel_for(BlockedRange(0, _dst.rows/2), converter);
3084     else
3085 #endif
3086         converter(BlockedRange(0, _dst.rows/2));
3087 }
3088
3089 ///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
3090
3091 template<int bIdx>
3092 struct RGB888toYUV420pInvoker: public ParallelLoopBody
3093 {
3094     RGB888toYUV420pInvoker( const Mat& src, Mat* dst, const int uIdx )
3095         : src_(src),
3096           dst_(dst),
3097           uIdx_(uIdx) { }
3098
3099     void operator()(const Range& rowRange) const
3100     {
3101         const int w = src_.cols;
3102         const int h = src_.rows;
3103
3104         const int cn = src_.channels();
3105         for( int i = rowRange.start; i < rowRange.end; i++ )
3106         {
3107             const uchar* row0 = src_.ptr<uchar>(2 * i);
3108             const uchar* row1 = src_.ptr<uchar>(2 * i + 1);
3109
3110             uchar* y = dst_->ptr<uchar>(2*i);
3111             uchar* u = dst_->ptr<uchar>(h + i/2) + (i % 2) * (w/2);
3112             uchar* v = dst_->ptr<uchar>(h + (i + h/2)/2) + ((i + h/2) % 2) * (w/2);
3113             if( uIdx_ == 2 ) std::swap(u, v);
3114
3115             for( int j = 0, k = 0; j < w * cn; j += 2 * cn, k++ )
3116             {
3117                 int r00 = row0[2-bIdx + j];      int g00 = row0[1 + j];      int b00 = row0[bIdx + j];
3118                 int r01 = row0[2-bIdx + cn + j]; int g01 = row0[1 + cn + j]; int b01 = row0[bIdx + cn + j];
3119                 int r10 = row1[2-bIdx + j];      int g10 = row1[1 + j];      int b10 = row1[bIdx + j];
3120                 int r11 = row1[2-bIdx + cn + j]; int g11 = row1[1 + cn + j]; int b11 = row1[bIdx + cn + j];
3121
3122                 const int shifted16 = (16 << ITUR_BT_601_SHIFT);
3123                 const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
3124                 int y00 = ITUR_BT_601_CRY * r00 + ITUR_BT_601_CGY * g00 + ITUR_BT_601_CBY * b00 + halfShift + shifted16;
3125                 int y01 = ITUR_BT_601_CRY * r01 + ITUR_BT_601_CGY * g01 + ITUR_BT_601_CBY * b01 + halfShift + shifted16;
3126                 int y10 = ITUR_BT_601_CRY * r10 + ITUR_BT_601_CGY * g10 + ITUR_BT_601_CBY * b10 + halfShift + shifted16;
3127                 int y11 = ITUR_BT_601_CRY * r11 + ITUR_BT_601_CGY * g11 + ITUR_BT_601_CBY * b11 + halfShift + shifted16;
3128
3129                 y[2*k + 0]            = saturate_cast<uchar>(y00 >> ITUR_BT_601_SHIFT);
3130                 y[2*k + 1]            = saturate_cast<uchar>(y01 >> ITUR_BT_601_SHIFT);
3131                 y[2*k + dst_->step + 0] = saturate_cast<uchar>(y10 >> ITUR_BT_601_SHIFT);
3132                 y[2*k + dst_->step + 1] = saturate_cast<uchar>(y11 >> ITUR_BT_601_SHIFT);
3133
3134                 const int shifted128 = (128 << ITUR_BT_601_SHIFT);
3135                 int u00 = ITUR_BT_601_CRU * r00 + ITUR_BT_601_CGU * g00 + ITUR_BT_601_CBU * b00 + halfShift + shifted128;
3136                 int v00 = ITUR_BT_601_CBU * r00 + ITUR_BT_601_CGV * g00 + ITUR_BT_601_CBV * b00 + halfShift + shifted128;
3137
3138                 u[k] = saturate_cast<uchar>(u00 >> ITUR_BT_601_SHIFT);
3139                 v[k] = saturate_cast<uchar>(v00 >> ITUR_BT_601_SHIFT);
3140             }
3141         }
3142     }
3143
3144     static bool isFit( const Mat& src )
3145     {
3146         return (src.total() >= 320*240);
3147     }
3148
3149 private:
3150     RGB888toYUV420pInvoker& operator=(const RGB888toYUV420pInvoker&);
3151
3152     const Mat& src_;
3153     Mat* const dst_;
3154     const int uIdx_;
3155 };
3156
3157 template<int bIdx, int uIdx>
3158 static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
3159 {
3160     RGB888toYUV420pInvoker<bIdx> colorConverter(src, &dst, uIdx);
3161     if( RGB888toYUV420pInvoker<bIdx>::isFit(src) )
3162         parallel_for_(Range(0, src.rows/2), colorConverter);
3163     else
3164         colorConverter(Range(0, src.rows/2));
3165 }
3166
3167 ///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
3168
3169 template<int bIdx, int uIdx, int yIdx>
3170 struct YUV422toRGB888Invoker
3171 {
3172     Mat* dst;
3173     const uchar* src;
3174     int width, stride;
3175
3176     YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
3177         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
3178
3179     void operator()(const BlockedRange& range) const
3180     {
3181         int rangeBegin = range.begin();
3182         int rangeEnd = range.end();
3183
3184         const int uidx = 1 - yIdx + uIdx * 2;
3185         const int vidx = (2 + uidx) % 4;
3186         const uchar* yuv_src = src + rangeBegin * stride;
3187
3188         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
3189         {
3190             uchar* row = dst->ptr<uchar>(j);
3191
3192             for (int i = 0; i < 2 * width; i += 4, row += 6)
3193             {
3194                 int u = int(yuv_src[i + uidx]) - 128;
3195                 int v = int(yuv_src[i + vidx]) - 128;
3196
3197                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
3198                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
3199                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
3200
3201                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
3202                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
3203                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
3204                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
3205
3206                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
3207                 row[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
3208                 row[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
3209                 row[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
3210             }
3211         }
3212     }
3213 };
3214
3215 template<int bIdx, int uIdx, int yIdx>
3216 struct YUV422toRGBA8888Invoker
3217 {
3218     Mat* dst;
3219     const uchar* src;
3220     int width, stride;
3221
3222     YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
3223         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
3224
3225     void operator()(const BlockedRange& range) const
3226     {
3227         int rangeBegin = range.begin();
3228         int rangeEnd = range.end();
3229
3230         const int uidx = 1 - yIdx + uIdx * 2;
3231         const int vidx = (2 + uidx) % 4;
3232         const uchar* yuv_src = src + rangeBegin * stride;
3233
3234         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
3235         {
3236             uchar* row = dst->ptr<uchar>(j);
3237
3238             for (int i = 0; i < 2 * width; i += 4, row += 8)
3239             {
3240                 int u = int(yuv_src[i + uidx]) - 128;
3241                 int v = int(yuv_src[i + vidx]) - 128;
3242
3243                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
3244                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
3245                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
3246
3247                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
3248                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
3249                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
3250                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
3251                 row[3]      = uchar(0xff);
3252
3253                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
3254                 row[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
3255                 row[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
3256                 row[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
3257                 row[7]      = uchar(0xff);
3258             }
3259         }
3260     }
3261 };
3262
3263 #define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240)
3264
3265 template<int bIdx, int uIdx, int yIdx>
3266 inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
3267 {
3268     YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
3269 #ifdef HAVE_TBB
3270     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
3271         parallel_for(BlockedRange(0, _dst.rows), converter);
3272     else
3273 #endif
3274         converter(BlockedRange(0, _dst.rows));
3275 }
3276
3277 template<int bIdx, int uIdx, int yIdx>
3278 inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
3279 {
3280     YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
3281 #ifdef HAVE_TBB
3282     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
3283         parallel_for(BlockedRange(0, _dst.rows), converter);
3284     else
3285 #endif
3286         converter(BlockedRange(0, _dst.rows));
3287 }
3288
3289 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
3290
3291 template<typename _Tp>
3292 struct RGBA2mRGBA
3293 {
3294     typedef _Tp channel_type;
3295
3296     void operator()(const _Tp* src, _Tp* dst, int n) const
3297     {
3298         _Tp max_val  = ColorChannel<_Tp>::max();
3299         _Tp half_val = ColorChannel<_Tp>::half();
3300         for( int i = 0; i < n; i++ )
3301         {
3302             _Tp v0 = *src++;
3303             _Tp v1 = *src++;
3304             _Tp v2 = *src++;
3305             _Tp v3 = *src++;
3306
3307             *dst++ = (v0 * v3 + half_val) / max_val;
3308             *dst++ = (v1 * v3 + half_val) / max_val;
3309             *dst++ = (v2 * v3 + half_val) / max_val;
3310             *dst++ = v3;
3311         }
3312     }
3313 };
3314
3315
3316 template<typename _Tp>
3317 struct mRGBA2RGBA
3318 {
3319     typedef _Tp channel_type;
3320
3321     void operator()(const _Tp* src, _Tp* dst, int n) const
3322     {
3323         _Tp max_val = ColorChannel<_Tp>::max();
3324         for( int i = 0; i < n; i++ )
3325         {
3326             _Tp v0 = *src++;
3327             _Tp v1 = *src++;
3328             _Tp v2 = *src++;
3329             _Tp v3 = *src++;
3330             _Tp v3_half = v3 / 2;
3331
3332             *dst++ = (v3==0)? 0 : (v0 * max_val + v3_half) / v3;
3333             *dst++ = (v3==0)? 0 : (v1 * max_val + v3_half) / v3;
3334             *dst++ = (v3==0)? 0 : (v2 * max_val + v3_half) / v3;
3335             *dst++ = v3;
3336         }
3337     }
3338 };
3339
3340 }//namespace cv
3341
3342 //////////////////////////////////////////////////////////////////////////////////////////
3343 //                                   The main function                                  //
3344 //////////////////////////////////////////////////////////////////////////////////////////
3345
3346 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
3347 {
3348     Mat src = _src.getMat(), dst;
3349     Size sz = src.size();
3350     int scn = src.channels(), depth = src.depth(), bidx;
3351
3352     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );
3353
3354     switch( code )
3355     {
3356         case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
3357         case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
3358             CV_Assert( scn == 3 || scn == 4 );
3359             dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;
3360             bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;
3361
3362             _dst.create( sz, CV_MAKETYPE(depth, dcn));
3363             dst = _dst.getMat();
3364
3365             if( depth == CV_8U )
3366             {
3367 #ifdef HAVE_TEGRA_OPTIMIZATION
3368                 if(!tegra::cvtBGR2RGB(src, dst, bidx))
3369 #endif
3370                     CvtColorLoop(src, dst, RGB2RGB<uchar>(scn, dcn, bidx));
3371             }
3372             else if( depth == CV_16U )
3373                 CvtColorLoop(src, dst, RGB2RGB<ushort>(scn, dcn, bidx));
3374             else
3375                 CvtColorLoop(src, dst, RGB2RGB<float>(scn, dcn, bidx));
3376             break;
3377
3378         case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
3379         case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
3380             CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
3381             _dst.create(sz, CV_8UC2);
3382             dst = _dst.getMat();
3383
3384 #ifdef HAVE_TEGRA_OPTIMIZATION
3385             if(code == CV_BGR2BGR565 || code == CV_BGRA2BGR565 || code == CV_RGB2BGR565  || code == CV_RGBA2BGR565)
3386                 if(tegra::cvtRGB2RGB565(src, dst, code == CV_RGB2BGR565 || code == CV_RGBA2BGR565 ? 0 : 2))
3387                     break;
3388 #endif
3389
3390             CvtColorLoop(src, dst, RGB2RGB5x5(scn,
3391                       code == CV_BGR2BGR565 || code == CV_BGR2BGR555 ||
3392                       code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2,
3393                       code == CV_BGR2BGR565 || code == CV_RGB2BGR565 ||
3394                       code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5 // green bits
3395                                               ));
3396             break;
3397
3398         case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
3399         case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
3400             if(dcn <= 0) dcn = (code==CV_BGR5652BGRA || code==CV_BGR5552BGRA || code==CV_BGR5652RGBA || code==CV_BGR5552RGBA) ? 4 : 3;
3401             CV_Assert( (dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U );
3402             _dst.create(sz, CV_MAKETYPE(depth, dcn));
3403             dst = _dst.getMat();
3404
3405             CvtColorLoop(src, dst, RGB5x52RGB(dcn,
3406                       code == CV_BGR5652BGR || code == CV_BGR5552BGR ||
3407                       code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2, // blue idx
3408                       code == CV_BGR5652BGR || code == CV_BGR5652RGB ||
3409                       code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5 // green bits
3410                       ));
3411             break;
3412
3413         case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
3414             CV_Assert( scn == 3 || scn == 4 );
3415             _dst.create(sz, CV_MAKETYPE(depth, 1));
3416             dst = _dst.getMat();
3417
3418             bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
3419
3420             if( depth == CV_8U )
3421             {
3422 #ifdef HAVE_TEGRA_OPTIMIZATION
3423                 if(!tegra::cvtRGB2Gray(src, dst, bidx))
3424 #endif
3425                 CvtColorLoop(src, dst, RGB2Gray<uchar>(scn, bidx, 0));
3426             }
3427             else if( depth == CV_16U )
3428                 CvtColorLoop(src, dst, RGB2Gray<ushort>(scn, bidx, 0));
3429             else
3430                 CvtColorLoop(src, dst, RGB2Gray<float>(scn, bidx, 0));
3431             break;
3432
3433         case CV_BGR5652GRAY: case CV_BGR5552GRAY:
3434             CV_Assert( scn == 2 && depth == CV_8U );
3435             _dst.create(sz, CV_8UC1);
3436             dst = _dst.getMat();
3437
3438             CvtColorLoop(src, dst, RGB5x52Gray(code == CV_BGR5652GRAY ? 6 : 5));
3439             break;
3440
3441         case CV_GRAY2BGR: case CV_GRAY2BGRA:
3442             if( dcn <= 0 ) dcn = (code==CV_GRAY2BGRA) ? 4 : 3;
3443             CV_Assert( scn == 1 && (dcn == 3 || dcn == 4));
3444             _dst.create(sz, CV_MAKETYPE(depth, dcn));
3445             dst = _dst.getMat();
3446
3447             if( depth == CV_8U )
3448             {
3449 #ifdef HAVE_TEGRA_OPTIMIZATION
3450                 if(!tegra::cvtGray2RGB(src, dst))
3451 #endif
3452                 CvtColorLoop(src, dst, Gray2RGB<uchar>(dcn));
3453             }
3454             else if( depth == CV_16U )
3455                 CvtColorLoop(src, dst, Gray2RGB<ushort>(dcn));
3456             else
3457                 CvtColorLoop(src, dst, Gray2RGB<float>(dcn));
3458             break;
3459
3460         case CV_GRAY2BGR565: case CV_GRAY2BGR555:
3461             CV_Assert( scn == 1 && depth == CV_8U );
3462             _dst.create(sz, CV_8UC2);
3463             dst = _dst.getMat();
3464
3465             CvtColorLoop(src, dst, Gray2RGB5x5(code == CV_GRAY2BGR565 ? 6 : 5));
3466             break;
3467
3468         case CV_BGR2YCrCb: case CV_RGB2YCrCb:
3469         case CV_BGR2YUV: case CV_RGB2YUV:
3470             {
3471             CV_Assert( scn == 3 || scn == 4 );
3472             bidx = code == CV_BGR2YCrCb || code == CV_RGB2YUV ? 0 : 2;
3473             static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
3474             static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
3475             const float* coeffs_f = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_f;
3476             const int* coeffs_i = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_i;
3477
3478             _dst.create(sz, CV_MAKETYPE(depth, 3));
3479             dst = _dst.getMat();
3480
3481             if( depth == CV_8U )
3482             {
3483 #ifdef HAVE_TEGRA_OPTIMIZATION
3484                 if((code == CV_RGB2YCrCb || code == CV_BGR2YCrCb) && tegra::cvtRGB2YCrCb(src, dst, bidx))
3485                     break;
3486 #endif
3487                 CvtColorLoop(src, dst, RGB2YCrCb_i<uchar>(scn, bidx, coeffs_i));
3488             }
3489             else if( depth == CV_16U )
3490                 CvtColorLoop(src, dst, RGB2YCrCb_i<ushort>(scn, bidx, coeffs_i));
3491             else
3492                 CvtColorLoop(src, dst, RGB2YCrCb_f<float>(scn, bidx, coeffs_f));
3493             }
3494             break;
3495
3496         case CV_YCrCb2BGR: case CV_YCrCb2RGB:
3497         case CV_YUV2BGR: case CV_YUV2RGB:
3498             {
3499             if( dcn <= 0 ) dcn = 3;
3500             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
3501             bidx = code == CV_YCrCb2BGR || code == CV_YUV2RGB ? 0 : 2;
3502             static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
3503             static const int yuv_i[] = { 33292, -6472, -9519, 18678 };
3504             const float* coeffs_f = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_f;
3505             const int* coeffs_i = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_i;
3506
3507             _dst.create(sz, CV_MAKETYPE(depth, dcn));
3508             dst = _dst.getMat();
3509
3510             if( depth == CV_8U )
3511                 CvtColorLoop(src, dst, YCrCb2RGB_i<uchar>(dcn, bidx, coeffs_i));
3512             else if( depth == CV_16U )
3513                 CvtColorLoop(src, dst, YCrCb2RGB_i<ushort>(dcn, bidx, coeffs_i));
3514             else
3515                 CvtColorLoop(src, dst, YCrCb2RGB_f<float>(dcn, bidx, coeffs_f));
3516             }
3517             break;
3518
3519         case CV_BGR2XYZ: case CV_RGB2XYZ:
3520             CV_Assert( scn == 3 || scn == 4 );
3521             bidx = code == CV_BGR2XYZ ? 0 : 2;
3522
3523             _dst.create(sz, CV_MAKETYPE(depth, 3));
3524             dst = _dst.getMat();
3525
3526             if( depth == CV_8U )
3527                 CvtColorLoop(src, dst, RGB2XYZ_i<uchar>(scn, bidx, 0));
3528             else if( depth == CV_16U )
3529                 CvtColorLoop(src, dst, RGB2XYZ_i<ushort>(scn, bidx, 0));
3530             else
3531                 CvtColorLoop(src, dst, RGB2XYZ_f<float>(scn, bidx, 0));
3532             break;
3533
3534         case CV_XYZ2BGR: case CV_XYZ2RGB:
3535             if( dcn <= 0 ) dcn = 3;
3536             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
3537             bidx = code == CV_XYZ2BGR ? 0 : 2;
3538
3539             _dst.create(sz, CV_MAKETYPE(depth, dcn));
3540             dst = _dst.getMat();
3541
3542             if( depth == CV_8U )
3543                 CvtColorLoop(src, dst, XYZ2RGB_i<uchar>(dcn, bidx, 0));
3544             else if( depth == CV_16U )
3545                 CvtColorLoop(src, dst, XYZ2RGB_i<ushort>(dcn, bidx, 0));
3546             else
3547                 CvtColorLoop(src, dst, XYZ2RGB_f<float>(dcn, bidx, 0));
3548             break;
3549
3550         case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
3551         case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
3552             {
3553             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
3554             bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||
3555                 code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;
3556             int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||
3557                 code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 256;
3558
3559             _dst.create(sz, CV_MAKETYPE(depth, 3));
3560             dst = _dst.getMat();
3561
3562             if( code == CV_BGR2HSV || code == CV_RGB2HSV ||
3563                 code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL )
3564             {
3565 #ifdef HAVE_TEGRA_OPTIMIZATION
3566                 if(tegra::cvtRGB2HSV(src, dst, bidx, hrange))
3567                     break;
3568 #endif
3569                 if( depth == CV_8U )
3570                     CvtColorLoop(src, dst, RGB2HSV_b(scn, bidx, hrange));
3571                 else
3572                     CvtColorLoop(src, dst, RGB2HSV_f(scn, bidx, (float)hrange));
3573             }
3574             else
3575             {
3576                 if( depth == CV_8U )
3577                     CvtColorLoop(src, dst, RGB2HLS_b(scn, bidx, hrange));
3578                 else
3579                     CvtColorLoop(src, dst, RGB2HLS_f(scn, bidx, (float)hrange));
3580             }
3581             }
3582             break;
3583
3584         case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
3585         case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
3586             {
3587             if( dcn <= 0 ) dcn = 3;
3588             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
3589             bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||
3590                 code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;
3591             int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||
3592                 code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;
3593
3594             _dst.create(sz, CV_MAKETYPE(depth, dcn));
3595             dst = _dst.getMat();
3596
3597             if( code == CV_HSV2BGR || code == CV_HSV2RGB ||
3598                 code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL )
3599             {
3600                 if( depth == CV_8U )
3601                     CvtColorLoop(src, dst, HSV2RGB_b(dcn, bidx, hrange));
3602                 else
3603                     CvtColorLoop(src, dst, HSV2RGB_f(dcn, bidx, (float)hrange));
3604             }
3605             else
3606             {
3607                 if( depth == CV_8U )
3608                     CvtColorLoop(src, dst, HLS2RGB_b(dcn, bidx, hrange));
3609                 else
3610                     CvtColorLoop(src, dst, HLS2RGB_f(dcn, bidx, (float)hrange));
3611             }
3612             }
3613             break;
3614
3615         case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
3616         case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
3617             {
3618             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
3619             bidx = code == CV_BGR2Lab || code == CV_BGR2Luv ||
3620                    code == CV_LBGR2Lab || code == CV_LBGR2Luv ? 0 : 2;
3621             bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab ||
3622                         code == CV_BGR2Luv || code == CV_RGB2Luv;
3623
3624             _dst.create(sz, CV_MAKETYPE(depth, 3));
3625             dst = _dst.getMat();
3626
3627             if( code == CV_BGR2Lab || code == CV_RGB2Lab ||
3628                 code == CV_LBGR2Lab || code == CV_LRGB2Lab )
3629             {
3630                 if( depth == CV_8U )
3631                     CvtColorLoop(src, dst, RGB2Lab_b(scn, bidx, 0, 0, srgb));
3632                 else
3633                     CvtColorLoop(src, dst, RGB2Lab_f(scn, bidx, 0, 0, srgb));
3634             }
3635             else
3636             {
3637                 if( depth == CV_8U )
3638                     CvtColorLoop(src, dst, RGB2Luv_b(scn, bidx, 0, 0, srgb));
3639                 else
3640                     CvtColorLoop(src, dst, RGB2Luv_f(scn, bidx, 0, 0, srgb));
3641             }
3642             }
3643             break;
3644
3645         case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
3646         case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
3647             {
3648             if( dcn <= 0 ) dcn = 3;
3649             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
3650             bidx = code == CV_Lab2BGR || code == CV_Luv2BGR ||
3651                    code == CV_Lab2LBGR || code == CV_Luv2LBGR ? 0 : 2;
3652             bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB ||
3653                     code == CV_Luv2BGR || code == CV_Luv2RGB;
3654
3655             _dst.create(sz, CV_MAKETYPE(depth, dcn));
3656             dst = _dst.getMat();
3657
3658             if( code == CV_Lab2BGR || code == CV_Lab2RGB ||
3659                 code == CV_Lab2LBGR || code == CV_Lab2LRGB )
3660             {
3661                 if( depth == CV_8U )
3662                     CvtColorLoop(src, dst, Lab2RGB_b(dcn, bidx, 0, 0, srgb));
3663                 else
3664                     CvtColorLoop(src, dst, Lab2RGB_f(dcn, bidx, 0, 0, srgb));
3665             }
3666             else
3667             {
3668                 if( depth == CV_8U )
3669                     CvtColorLoop(src, dst, Luv2RGB_b(dcn, bidx, 0, 0, srgb));
3670                 else
3671                     CvtColorLoop(src, dst, Luv2RGB_f(dcn, bidx, 0, 0, srgb));
3672             }
3673             }
3674             break;
3675
3676         case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
3677             if(dcn <= 0) dcn = 1;
3678             CV_Assert( scn == 1 && dcn == 1 );
3679
3680             _dst.create(sz, CV_MAKETYPE(depth, dcn));
3681             dst = _dst.getMat();
3682
3683             if( depth == CV_8U )
3684                 Bayer2Gray_<uchar, SIMDBayerInterpolator_8u>(src, dst, code);
3685             else if( depth == CV_16U )
3686                 Bayer2Gray_<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst, code);
3687             else
3688                 CV_Error(CV_StsUnsupportedFormat, "Bayer->Gray demosaicing only supports 8u and 16u types");
3689             break;
3690
3691         case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
3692         case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:
3693             {
3694                 if (dcn <= 0)
3695                     dcn = 3;
3696                 CV_Assert( scn == 1 && dcn == 3 );
3697
3698                 _dst.create(sz, CV_MAKE_TYPE(depth, dcn));
3699                 Mat dst_ = _dst.getMat();
3700
3701                 if( code == CV_BayerBG2BGR || code == CV_BayerGB2BGR ||
3702                     code == CV_BayerRG2BGR || code == CV_BayerGR2BGR )
3703                 {
3704                     if( depth == CV_8U )
3705                         Bayer2RGB_<uchar, SIMDBayerInterpolator_8u>(src, dst_, code);
3706                     else if( depth == CV_16U )
3707                         Bayer2RGB_<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst_, code);
3708                     else
3709                         CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB demosaicing only supports 8u and 16u types");
3710                 }
3711                 else
3712                 {
3713                     CV_Assert( depth == CV_8U );
3714                     Bayer2RGB_VNG_8u(src, dst_, code);
3715                 }
3716             }
3717             break;
3718         case CV_YUV2BGR_NV21:  case CV_YUV2RGB_NV21:  case CV_YUV2BGR_NV12:  case CV_YUV2RGB_NV12:
3719         case CV_YUV2BGRA_NV21: case CV_YUV2RGBA_NV21: case CV_YUV2BGRA_NV12: case CV_YUV2RGBA_NV12:
3720             {
3721                 // http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
3722                 // http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
3723
3724                 if (dcn <= 0) dcn = (code==CV_YUV420sp2BGRA || code==CV_YUV420sp2RGBA || code==CV_YUV2BGRA_NV12 || code==CV_YUV2RGBA_NV12) ? 4 : 3;
3725                 const int bIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2BGR_NV12 || code==CV_YUV2BGRA_NV12) ? 0 : 2;
3726                 const int uIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2RGB_NV21 || code==CV_YUV2RGBA_NV21) ? 1 : 0;
3727
3728                 CV_Assert( dcn == 3 || dcn == 4 );
3729                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
3730
3731                 Size dstSz(sz.width, sz.height * 2 / 3);
3732                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
3733                 dst = _dst.getMat();
3734
3735                 int srcstep = (int)src.step;
3736                 const uchar* y = src.ptr();
3737                 const uchar* uv = y + srcstep * dstSz.height;
3738
3739                 switch(dcn*100 + bIdx * 10 + uIdx)
3740                 {
3741                     case 300: cvtYUV420sp2RGB<0, 0> (dst, srcstep, y, uv); break;
3742                     case 301: cvtYUV420sp2RGB<0, 1> (dst, srcstep, y, uv); break;
3743                     case 320: cvtYUV420sp2RGB<2, 0> (dst, srcstep, y, uv); break;
3744                     case 321: cvtYUV420sp2RGB<2, 1> (dst, srcstep, y, uv); break;
3745                     case 400: cvtYUV420sp2RGBA<0, 0>(dst, srcstep, y, uv); break;
3746                     case 401: cvtYUV420sp2RGBA<0, 1>(dst, srcstep, y, uv); break;
3747                     case 420: cvtYUV420sp2RGBA<2, 0>(dst, srcstep, y, uv); break;
3748                     case 421: cvtYUV420sp2RGBA<2, 1>(dst, srcstep, y, uv); break;
3749                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
3750                 };
3751             }
3752             break;
3753         case CV_YUV2BGR_YV12: case CV_YUV2RGB_YV12: case CV_YUV2BGRA_YV12: case CV_YUV2RGBA_YV12:
3754         case CV_YUV2BGR_IYUV: case CV_YUV2RGB_IYUV: case CV_YUV2BGRA_IYUV: case CV_YUV2RGBA_IYUV:
3755             {
3756                 //http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
3757                 //http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxN Y plane followed by (N/2)x(N/2) U and V planes
3758
3759                 if (dcn <= 0) dcn = (code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12 || code==CV_YUV2RGBA_IYUV || code==CV_YUV2BGRA_IYUV) ? 4 : 3;
3760                 const int bIdx = (code==CV_YUV2BGR_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2BGR_IYUV || code==CV_YUV2BGRA_IYUV) ? 0 : 2;
3761                 const int uIdx  = (code==CV_YUV2BGR_YV12 || code==CV_YUV2RGB_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12) ? 1 : 0;
3762
3763                 CV_Assert( dcn == 3 || dcn == 4 );
3764                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
3765
3766                 Size dstSz(sz.width, sz.height * 2 / 3);
3767                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
3768                 dst = _dst.getMat();
3769
3770                 int srcstep = (int)src.step;
3771                 const uchar* y = src.ptr();
3772                 const uchar* u = y + srcstep * dstSz.height;
3773                 const uchar* v = y + srcstep * (dstSz.height + dstSz.height/4) + (dstSz.width/2) * ((dstSz.height % 4)/2);
3774
3775                 int ustepIdx = 0;
3776                 int vstepIdx = dstSz.height % 4 == 2 ? 1 : 0;
3777
3778                 if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); };
3779
3780                 switch(dcn*10 + bIdx)
3781                 {
3782                     case 30: cvtYUV420p2RGB<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
3783                     case 32: cvtYUV420p2RGB<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
3784                     case 40: cvtYUV420p2RGBA<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
3785                     case 42: cvtYUV420p2RGBA<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
3786                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
3787                 };
3788             }
3789             break;
3790         case CV_YUV2GRAY_420:
3791             {
3792                 if (dcn <= 0) dcn = 1;
3793
3794                 CV_Assert( dcn == 1 );
3795                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
3796
3797                 Size dstSz(sz.width, sz.height * 2 / 3);
3798                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
3799                 dst = _dst.getMat();
3800
3801                 src(Range(0, dstSz.height), Range::all()).copyTo(dst);
3802             }
3803             break;
3804         case CV_RGB2YUV_YV12: case CV_BGR2YUV_YV12: case CV_RGBA2YUV_YV12: case CV_BGRA2YUV_YV12:
3805         case CV_RGB2YUV_IYUV: case CV_BGR2YUV_IYUV: case CV_RGBA2YUV_IYUV: case CV_BGRA2YUV_IYUV:
3806             {
3807                 if (dcn <= 0) dcn = 1;
3808                 const int bIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_BGR2YUV_YV12 || code == CV_BGRA2YUV_YV12) ? 0 : 2;
3809                 const int uIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_RGB2YUV_IYUV || code == CV_RGBA2YUV_IYUV) ? 1 : 2;
3810
3811                 CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
3812                 CV_Assert( dcn == 1 );
3813                 CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
3814
3815                 Size dstSz(sz.width, sz.height / 2 * 3);
3816                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
3817                 dst = _dst.getMat();
3818
3819                 switch(bIdx + uIdx*10)
3820                 {
3821                     case 10: cvtRGBtoYUV420p<0, 1>(src, dst); break;
3822                     case 12: cvtRGBtoYUV420p<2, 1>(src, dst); break;
3823                     case 20: cvtRGBtoYUV420p<0, 2>(src, dst); break;
3824                     case 22: cvtRGBtoYUV420p<2, 2>(src, dst); break;
3825                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
3826                 };
3827             }
3828             break;
3829         case CV_YUV2RGB_UYVY: case CV_YUV2BGR_UYVY: case CV_YUV2RGBA_UYVY: case CV_YUV2BGRA_UYVY:
3830         case CV_YUV2RGB_YUY2: case CV_YUV2BGR_YUY2: case CV_YUV2RGB_YVYU: case CV_YUV2BGR_YVYU:
3831         case CV_YUV2RGBA_YUY2: case CV_YUV2BGRA_YUY2: case CV_YUV2RGBA_YVYU: case CV_YUV2BGRA_YVYU:
3832             {
3833                 //http://www.fourcc.org/yuv.php#UYVY
3834                 //http://www.fourcc.org/yuv.php#YUY2
3835                 //http://www.fourcc.org/yuv.php#YVYU
3836
3837                 if (dcn <= 0) dcn = (code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2RGBA_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 4 : 3;
3838                 const int bIdx = (code==CV_YUV2BGR_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2BGR_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2BGR_YVYU || code==CV_YUV2BGRA_YVYU) ? 0 : 2;
3839                 const int ycn  = (code==CV_YUV2RGB_UYVY || code==CV_YUV2BGR_UYVY || code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY) ? 1 : 0;
3840                 const int uIdx = (code==CV_YUV2RGB_YVYU || code==CV_YUV2BGR_YVYU || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 1 : 0;
3841
3842                 CV_Assert( dcn == 3 || dcn == 4 );
3843                 CV_Assert( scn == 2 && depth == CV_8U );
3844
3845                 _dst.create(sz, CV_8UC(dcn));
3846                 dst = _dst.getMat();
3847
3848                 switch(dcn*1000 + bIdx*100 + uIdx*10 + ycn)
3849                 {
3850                     case 3000: cvtYUV422toRGB<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
3851                     case 3001: cvtYUV422toRGB<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
3852                     case 3010: cvtYUV422toRGB<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
3853                     case 3011: cvtYUV422toRGB<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
3854                     case 3200: cvtYUV422toRGB<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
3855                     case 3201: cvtYUV422toRGB<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
3856                     case 3210: cvtYUV422toRGB<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
3857                     case 3211: cvtYUV422toRGB<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
3858                     case 4000: cvtYUV422toRGBA<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
3859                     case 4001: cvtYUV422toRGBA<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
3860                     case 4010: cvtYUV422toRGBA<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
3861                     case 4011: cvtYUV422toRGBA<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
3862                     case 4200: cvtYUV422toRGBA<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
3863                     case 4201: cvtYUV422toRGBA<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
3864                     case 4210: cvtYUV422toRGBA<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
3865                     case 4211: cvtYUV422toRGBA<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
3866                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
3867                 };
3868             }
3869             break;
3870         case CV_YUV2GRAY_UYVY: case CV_YUV2GRAY_YUY2:
3871             {
3872                 if (dcn <= 0) dcn = 1;
3873
3874                 CV_Assert( dcn == 1 );
3875                 CV_Assert( scn == 2 && depth == CV_8U );
3876
3877                 extractChannel(_src, _dst, code == CV_YUV2GRAY_UYVY ? 1 : 0);
3878             }
3879             break;
3880         case CV_RGBA2mRGBA:
3881             {
3882                 if (dcn <= 0) dcn = 4;
3883                 CV_Assert( scn == 4 && dcn == 4 );
3884
3885                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
3886                 dst = _dst.getMat();
3887
3888                 if( depth == CV_8U )
3889                 {
3890                     CvtColorLoop(src, dst, RGBA2mRGBA<uchar>());
3891                 } else {
3892                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
3893                 }
3894             }
3895             break;
3896         case CV_mRGBA2RGBA:
3897             {
3898                 if (dcn <= 0) dcn = 4;
3899                 CV_Assert( scn == 4 && dcn == 4 );
3900
3901                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
3902                 dst = _dst.getMat();
3903
3904                 if( depth == CV_8U )
3905                 {
3906                     CvtColorLoop(src, dst, mRGBA2RGBA<uchar>());
3907                 } else {
3908                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
3909                 }
3910             }
3911             break;   
3912         default:
3913             CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
3914     }
3915 }
3916
3917 CV_IMPL void
3918 cvCvtColor( const CvArr* srcarr, CvArr* dstarr, int code )
3919 {
3920     cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
3921     CV_Assert( src.depth() == dst.depth() );
3922
3923     cv::cvtColor(src, dst, code, dst.channels());
3924     CV_Assert( dst.data == dst0.data );
3925 }
3926
3927
3928 /* End of file. */