modules/imgproc/src/opencl/cvtcolor.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Jia Haipeng, jiahaipeng95@gmail.com
  19 //    Peng Xiao, pengxiao@multicorewareinc.com
  20 //
  21 // Redistribution and use in source and binary forms, with or without modification,
  22 // are permitted provided that the following conditions are met:
  23 //
  24 //   * Redistribution's of source code must retain the above copyright notice,
  25 //     this list of conditions and the following disclaimer.
  26 //
  27 //   * Redistribution's in binary form must reproduce the above copyright notice,
  28 //     this list of conditions and the following disclaimer in the documentation
  29 //     and/or other materials provided with the distribution.
  30 //
  31 //   * The name of the copyright holders may not be used to endorse or promote products
  32 //     derived from this software without specific prior written permission.
  33 //
  34 // This software is provided by the copyright holders and contributors as is and
  35 // any express or implied warranties, including, but not limited to, the implied
  36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  37 // In no event shall the Intel Corporation or contributors be liable for any direct,
  38 // indirect, incidental, special, exemplary, or consequential damages
  39 // (including, but not limited to, procurement of substitute goods or services;
  40 // loss of use, data, or profits; or business interruption) however caused
  41 // and on any theory of liability, whether in contract, strict liability,
  42 // or tort (including negligence or otherwise) arising in any way out of
  43 // the use of this software, even if advised of the possibility of such damage.
  44 //
  45 //M*/
  46
  47 /**************************************PUBLICFUNC*************************************/
  48
  49 #if depth == 0
  50     #define DATA_TYPE uchar
  51     #define MAX_NUM  255
  52     #define HALF_MAX 128
  53     #define COEFF_TYPE int
  54     #define SAT_CAST(num) convert_uchar_sat(num)
  55     #define DEPTH_0
  56 #elif depth == 2
  57     #define DATA_TYPE ushort
  58     #define MAX_NUM  65535
  59     #define HALF_MAX 32768
  60     #define COEFF_TYPE int
  61     #define SAT_CAST(num) convert_ushort_sat(num)
  62     #define DEPTH_2
  63 #elif depth == 5
  64     #define DATA_TYPE float
  65     #define MAX_NUM  1.0f
  66     #define HALF_MAX 0.5f
  67     #define COEFF_TYPE float
  68     #define SAT_CAST(num) (num)
  69     #define DEPTH_5
  70 #else
  71     #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
  72 #endif
  73
  74 #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
  75
  76 enum
  77 {
  78     yuv_shift  = 14,
  79     xyz_shift  = 12,
  80     hsv_shift  = 12,
  81     R2Y        = 4899,
  82     G2Y        = 9617,
  83     B2Y        = 1868,
  84     BLOCK_SIZE = 256
  85 };
  86
  87 #define scnbytes ((int)sizeof(DATA_TYPE)*scn)
  88 #define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
  89
  90 #ifndef hscale
  91 #define hscale 0
  92 #endif
  93
  94 #ifndef hrange
  95 #define hrange 0
  96 #endif
  97
  98 #if bidx == 0
  99 #define R_COMP z
 100 #define G_COMP y
 101 #define B_COMP x
 102 #elif bidx == 2
 103 #define R_COMP x
 104 #define G_COMP y
 105 #define B_COMP z
 106 #elif bidx == 3
 107 // The only kernel that uses bidx == 3 doesn't use these macros.
 108 // But we still need to make the compiler happy.
 109 #define R_COMP w
 110 #define G_COMP w
 111 #define B_COMP w
 112 #endif
 113
 114 #ifndef uidx
 115 #define uidx 0
 116 #endif
 117
 118 #ifndef yidx
 119 #define yidx 0
 120 #endif
 121
 122 #ifndef PIX_PER_WI_X
 123 #define PIX_PER_WI_X 1
 124 #endif
 125
 126 #define __CAT(x, y) x##y
 127 #define CAT(x, y) __CAT(x, y)
 128
 129 #define DATA_TYPE_4 CAT(DATA_TYPE, 4)
 130
 131 ///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
 132
 133 __kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
 134                        __global uchar * dstptr, int dst_step, int dst_offset,
 135                        int rows, int cols)
 136 {
 137     int x = get_global_id(0);
 138     int y = get_global_id(1) * PIX_PER_WI_Y;
 139
 140     if (x < cols)
 141     {
 142         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 143         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
 144
 145         #pragma unroll
 146         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 147         {
 148             if (y < rows)
 149             {
 150                 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
 151                 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
 152                 DATA_TYPE_4 src_pix = vload4(0, src);
 153 #ifdef DEPTH_5
 154                 dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f));
 155 #else
 156                 dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
 157 #endif
 158                 ++y;
 159                 src_index += src_step;
 160                 dst_index += dst_step;
 161             }
 162         }
 163     }
 164 }
 165
 166 __kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
 167                        __global uchar * dstptr, int dst_step, int dst_offset,
 168                        int rows, int cols)
 169 {
 170     int x = get_global_id(0);
 171     int y = get_global_id(1) * PIX_PER_WI_Y;
 172
 173     if (x < cols)
 174     {
 175         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 176         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
 177
 178         #pragma unroll
 179         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 180         {
 181             if (y < rows)
 182             {
 183                 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
 184                 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
 185                 DATA_TYPE val = src[0];
 186 #if dcn == 3 || defined DEPTH_5
 187                 dst[0] = dst[1] = dst[2] = val;
 188 #if dcn == 4
 189                 dst[3] = MAX_NUM;
 190 #endif
 191 #else
 192                 *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
 193 #endif
 194                 ++y;
 195                 dst_index += dst_step;
 196                 src_index += src_step;
 197             }
 198         }
 199     }
 200 }
 201
 202 ///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
 203
 204 __constant float c_RGB2YUVCoeffs_f[5]  = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
 205 __constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, 8061, 14369 };
 206
 207 __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset,
 208                       __global uchar* dstptr, int dst_step, int dt_offset,
 209                       int rows, int cols)
 210 {
 211     int x = get_global_id(0);
 212     int y = get_global_id(1) * PIX_PER_WI_Y;
 213
 214     if (x < cols)
 215     {
 216         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 217         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
 218
 219         #pragma unroll
 220         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 221         {
 222             if (y < rows)
 223             {
 224                 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
 225                 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
 226                 DATA_TYPE_4 src_pix = vload4(0, src);
 227                 DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
 228
 229 #ifdef DEPTH_5
 230                 __constant float * coeffs = c_RGB2YUVCoeffs_f;
 231                 const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2]));
 232                 const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX);
 233                 const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX);
 234 #else
 235                 __constant int * coeffs = c_RGB2YUVCoeffs_i;
 236                 const int delta = HALF_MAX * (1 << yuv_shift);
 237                 const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift);
 238                 const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);
 239                 const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);
 240 #endif
 241
 242                 dst[0] = SAT_CAST( Y );
 243                 dst[1] = SAT_CAST( U );
 244                 dst[2] = SAT_CAST( V );
 245
 246                 ++y;
 247                 dst_index += dst_step;
 248                 src_index += src_step;
 249             }
 250         }
 251     }
 252 }
 253
 254 __constant float c_YUV2RGBCoeffs_f[4] = { 2.032f, -0.395f, -0.581f, 1.140f };
 255 __constant int   c_YUV2RGBCoeffs_i[4] = { 33292, -6472, -9519, 18678 };
 256
 257 __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
 258                       __global uchar* dstptr, int dst_step, int dt_offset,
 259                       int rows, int cols)
 260 {
 261     int x = get_global_id(0);
 262     int y = get_global_id(1) * PIX_PER_WI_Y;
 263
 264     if (x < cols)
 265     {
 266         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 267         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
 268
 269         #pragma unroll
 270         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 271         {
 272             if (y < rows)
 273             {
 274                 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
 275                 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
 276                 DATA_TYPE_4 src_pix = vload4(0, src);
 277                 DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;
 278
 279 #ifdef DEPTH_5
 280                 __constant float * coeffs = c_YUV2RGBCoeffs_f;
 281                 float r = fma(V - HALF_MAX, coeffs[3], Y);
 282                 float g = fma(V - HALF_MAX, coeffs[2], fma(U - HALF_MAX, coeffs[1], Y));
 283                 float b = fma(U - HALF_MAX, coeffs[0], Y);
 284 #else
 285                 __constant int * coeffs = c_YUV2RGBCoeffs_i;
 286                 const int r = Y + CV_DESCALE(mul24(V - HALF_MAX, coeffs[3]), yuv_shift);
 287                 const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], mul24(U - HALF_MAX, coeffs[1])), yuv_shift);
 288                 const int b = Y + CV_DESCALE(mul24(U - HALF_MAX, coeffs[0]), yuv_shift);
 289 #endif
 290
 291                 dst[bidx] = SAT_CAST( b );
 292                 dst[1] = SAT_CAST( g );
 293                 dst[bidx^2] = SAT_CAST( r );
 294 #if dcn == 4
 295                 dst[3] = MAX_NUM;
 296 #endif
 297                 ++y;
 298                 dst_index += dst_step;
 299                 src_index += src_step;
 300             }
 301         }
 302     }
 303 }
 304 __constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f,
 305                                             -0.812999725f, 1.5959997177f };
 306
 307 __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_offset,
 308                             __global uchar* dstptr, int dst_step, int dt_offset,
 309                             int rows, int cols)
 310 {
 311     int x = get_global_id(0);
 312     int y = get_global_id(1) * PIX_PER_WI_Y;
 313
 314     if (x < cols / 2)
 315     {
 316         #pragma unroll
 317         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 318         {
 319             if (y < rows / 2 )
 320             {
 321                 __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
 322                 __global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
 323                 __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));
 324                 __global uchar*       dst2 = dst1 + dst_step;
 325
 326                 float Y1 = ysrc[0];
 327                 float Y2 = ysrc[1];
 328                 float Y3 = ysrc[src_step];
 329                 float Y4 = ysrc[src_step + 1];
 330
 331                 float U  = ((float)usrc[uidx]) - HALF_MAX;
 332                 float V  = ((float)usrc[1-uidx]) - HALF_MAX;
 333
 334                 __constant float* coeffs = c_YUV2RGBCoeffs_420;
 335                 float ruv = fma(coeffs[4], V, 0.5f);
 336                 float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
 337                 float buv = fma(coeffs[1], U, 0.5f);
 338
 339                 Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
 340                 dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
 341                 dst1[1]        = convert_uchar_sat(Y1 + guv);
 342                 dst1[bidx]     = convert_uchar_sat(Y1 + buv);
 343 #if dcn == 4
 344                 dst1[3]        = 255;
 345 #endif
 346
 347                 Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
 348                 dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
 349                 dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);
 350                 dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);
 351 #if dcn == 4
 352                 dst1[7]        = 255;
 353 #endif
 354
 355                 Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
 356                 dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
 357                 dst2[1]        = convert_uchar_sat(Y3 + guv);
 358                 dst2[bidx]     = convert_uchar_sat(Y3 + buv);
 359 #if dcn == 4
 360                 dst2[3]        = 255;
 361 #endif
 362
 363                 Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
 364                 dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
 365                 dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);
 366                 dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);
 367 #if dcn == 4
 368                 dst2[7]        = 255;
 369 #endif
 370             }
 371             ++y;
 372         }
 373     }
 374 }
 375
 376 __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
 377                                 __global uchar* dstptr, int dst_step, int dt_offset,
 378                                 int rows, int cols)
 379 {
 380     int x = get_global_id(0);
 381     int y = get_global_id(1) * PIX_PER_WI_Y;
 382
 383     if (x < cols / 2)
 384     {
 385         #pragma unroll
 386         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 387         {
 388             if (y < rows / 2 )
 389             {
 390                 __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
 391                 __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
 392                 __global uchar*       dst2 = dst1 + dst_step;
 393
 394                 float Y1 = ysrc[0];
 395                 float Y2 = ysrc[1];
 396                 float Y3 = ysrc[src_step];
 397                 float Y4 = ysrc[src_step + 1];
 398
 399 #ifdef SRC_CONT
 400                 __global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);
 401                 int u_ind = mad24(y, cols >> 1, x);
 402                 float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX };
 403 #else
 404                 int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};
 405                 __global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);
 406                 __global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
 407                 float uv[2] = { ((float)usrc[0]) - HALF_MAX, ((float)vsrc[0]) - HALF_MAX };
 408 #endif
 409                 float U = uv[uidx];
 410                 float V = uv[1-uidx];
 411
 412                 __constant float* coeffs = c_YUV2RGBCoeffs_420;
 413                 float ruv = fma(coeffs[4], V, 0.5f);
 414                 float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
 415                 float buv = fma(coeffs[1], U, 0.5f);
 416
 417                 Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
 418                 dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
 419                 dst1[1]        = convert_uchar_sat(Y1 + guv);
 420                 dst1[bidx]     = convert_uchar_sat(Y1 + buv);
 421 #if dcn == 4
 422                 dst1[3]        = 255;
 423 #endif
 424
 425                 Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
 426                 dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
 427                 dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);
 428                 dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);
 429 #if dcn == 4
 430                 dst1[7]        = 255;
 431 #endif
 432
 433                 Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
 434                 dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
 435                 dst2[1]        = convert_uchar_sat(Y3 + guv);
 436                 dst2[bidx]     = convert_uchar_sat(Y3 + buv);
 437 #if dcn == 4
 438                 dst2[3]        = 255;
 439 #endif
 440
 441                 Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
 442                 dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
 443                 dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);
 444                 dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);
 445 #if dcn == 4
 446                 dst2[7]        = 255;
 447 #endif
 448             }
 449             ++y;
 450         }
 451     }
 452 }
 453
 454 __constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
 455                                             0.438999176f, -0.3679990768f, -0.0709991455f };
 456
 457 __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
 458                                 __global uchar* dstptr, int dst_step, int dst_offset,
 459                                 int rows, int cols)
 460 {
 461     int x = get_global_id(0) * PIX_PER_WI_X;
 462     int y = get_global_id(1) * PIX_PER_WI_Y;
 463
 464     if (x < cols/2)
 465     {
 466         int src_index  = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));
 467         int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
 468         int y_rows = rows / 3 * 2;
 469         int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
 470         __constant float* coeffs = c_RGB2YUVCoeffs_420;
 471
 472         #pragma unroll
 473         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 474         {
 475             if (y < rows / 3)
 476             {
 477                 __global const uchar* src1 = srcptr + src_index;
 478                 __global const uchar* src2 = src1 + src_step;
 479                 __global uchar* ydst1 = dstptr + ydst_index;
 480                 __global uchar* ydst2 = ydst1 + dst_step;
 481
 482                 __global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);
 483                 __global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);
 484
 485 #if PIX_PER_WI_X == 2
 486                 int s11 = *((__global const int*) src1);
 487                 int s12 = *((__global const int*) src1 + 1);
 488                 int s13 = *((__global const int*) src1 + 2);
 489 #if scn == 4
 490                 int s14 = *((__global const int*) src1 + 3);
 491 #endif
 492                 int s21 = *((__global const int*) src2);
 493                 int s22 = *((__global const int*) src2 + 1);
 494                 int s23 = *((__global const int*) src2 + 2);
 495 #if scn == 4
 496                 int s24 = *((__global const int*) src2 + 3);
 497 #endif
 498                 float src_pix1[scn * 4], src_pix2[scn * 4];
 499
 500                 *((float4*) src_pix1)     = convert_float4(as_uchar4(s11));
 501                 *((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
 502                 *((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
 503 #if scn == 4
 504                 *((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
 505 #endif
 506                 *((float4*) src_pix2)     = convert_float4(as_uchar4(s21));
 507                 *((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
 508                 *((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
 509 #if scn == 4
 510                 *((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
 511 #endif
 512                 uchar4 y1, y2;
 513                 y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[      2-bidx], fma(coeffs[1], src_pix1[      1], fma(coeffs[2], src_pix1[      bidx], 16.5f))));
 514                 y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[  scn+2-bidx], fma(coeffs[1], src_pix1[  scn+1], fma(coeffs[2], src_pix1[  scn+bidx], 16.5f))));
 515                 y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
 516                 y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
 517                 y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[      2-bidx], fma(coeffs[1], src_pix2[      1], fma(coeffs[2], src_pix2[      bidx], 16.5f))));
 518                 y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[  scn+2-bidx], fma(coeffs[1], src_pix2[  scn+1], fma(coeffs[2], src_pix2[  scn+bidx], 16.5f))));
 519                 y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
 520                 y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
 521
 522                 *((__global int*) ydst1) = as_int(y1);
 523                 *((__global int*) ydst2) = as_int(y2);
 524
 525                 float uv[4] = { fma(coeffs[3], src_pix1[      2-bidx], fma(coeffs[4], src_pix1[      1], fma(coeffs[5], src_pix1[      bidx], 128.5f))),
 526                                 fma(coeffs[5], src_pix1[      2-bidx], fma(coeffs[6], src_pix1[      1], fma(coeffs[7], src_pix1[      bidx], 128.5f))),
 527                                 fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
 528                                 fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
 529
 530                 udst[0] = convert_uchar_sat(uv[uidx]    );
 531                 vdst[0] = convert_uchar_sat(uv[1 - uidx]);
 532                 udst[1] = convert_uchar_sat(uv[2 + uidx]);
 533                 vdst[1] = convert_uchar_sat(uv[3 - uidx]);
 534 #else
 535                 float4 src_pix1 = convert_float4(vload4(0, src1));
 536                 float4 src_pix2 = convert_float4(vload4(0, src1+scn));
 537                 float4 src_pix3 = convert_float4(vload4(0, src2));
 538                 float4 src_pix4 = convert_float4(vload4(0, src2+scn));
 539
 540                 ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
 541                 ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
 542                 ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));
 543                 ydst2[1] = convert_uchar_sat(fma(coeffs[0], src_pix4.R_COMP, fma(coeffs[1], src_pix4.G_COMP, fma(coeffs[2], src_pix4.B_COMP, 16.5f))));
 544
 545                 float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),
 546                                 fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };
 547
 548                 udst[0] = convert_uchar_sat(uv[uidx]  );
 549                 vdst[0] = convert_uchar_sat(uv[1-uidx]);
 550 #endif
 551                 ++y;
 552                 src_index += 2*src_step;
 553                 ydst_index += 2*dst_step;
 554             }
 555         }
 556     }
 557 }
 558
 559 __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,
 560                           __global uchar* dstptr, int dst_step, int dst_offset,
 561                           int rows, int cols)
 562 {
 563     int x = get_global_id(0);
 564     int y = get_global_id(1) * PIX_PER_WI_Y;
 565
 566     if (x < cols / 2)
 567     {
 568         __global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);
 569         __global uchar*       dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));
 570
 571         #pragma unroll
 572         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 573         {
 574             if (y < rows )
 575             {
 576                 float U = ((float) src[uidx]) - HALF_MAX;
 577                 float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX;
 578
 579                 __constant float* coeffs = c_YUV2RGBCoeffs_420;
 580                 float ruv = fma(coeffs[4], V, 0.5f);
 581                 float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
 582                 float buv = fma(coeffs[1], U, 0.5f);
 583
 584                 float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
 585                 dst[2 - bidx] = convert_uchar_sat(y00 + ruv);
 586                 dst[1]        = convert_uchar_sat(y00 + guv);
 587                 dst[bidx]     = convert_uchar_sat(y00 + buv);
 588 #if dcn == 4
 589                 dst[3]        = 255;
 590 #endif
 591                 float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
 592                 dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
 593                 dst[dcn + 1]        = convert_uchar_sat(y01 + guv);
 594                 dst[dcn + bidx]     = convert_uchar_sat(y01 + buv);
 595 #if dcn == 4
 596                 dst[7]        = 255;
 597 #endif
 598             }
 599             ++y;
 600             src += src_step;
 601             dst += dst_step;
 602         }
 603     }
 604 }
 605
 606 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
 607
 608 __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
 609 __constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
 610
 611 __kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,
 612                         __global uchar* dstptr, int dst_step, int dt_offset,
 613                         int rows, int cols)
 614 {
 615     int x = get_global_id(0);
 616     int y = get_global_id(1) * PIX_PER_WI_Y;
 617
 618     if (x < cols)
 619     {
 620         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 621         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
 622
 623         #pragma unroll
 624         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 625         {
 626             if (y < rows)
 627             {
 628                 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
 629                 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
 630                 DATA_TYPE_4 src_pix = vload4(0, src);
 631                 DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
 632
 633 #ifdef DEPTH_5
 634                 __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
 635                 DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));
 636                 DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX);
 637                 DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX);
 638 #else
 639                 __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
 640                 int delta = HALF_MAX * (1 << yuv_shift);
 641                 int Y =  CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);
 642                 int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);
 643                 int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);
 644 #endif
 645
 646                 dst[0] = SAT_CAST( Y );
 647                 dst[1] = SAT_CAST( Cr );
 648                 dst[2] = SAT_CAST( Cb );
 649
 650                 ++y;
 651                 dst_index += dst_step;
 652                 src_index += src_step;
 653             }
 654         }
 655     }
 656 }
 657
 658 __constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };
 659 __constant int   c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };
 660
 661 __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
 662                         __global uchar* dst, int dst_step, int dst_offset,
 663                         int rows, int cols)
 664 {
 665     int x = get_global_id(0);
 666     int y = get_global_id(1) * PIX_PER_WI_Y;
 667
 668     if (x < cols)
 669     {
 670         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 671         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
 672
 673         #pragma unroll
 674         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 675         {
 676             if (y < rows)
 677             {
 678                 __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index);
 679                 __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index);
 680
 681                 DATA_TYPE_4 src_pix = vload4(0, srcptr);
 682                 DATA_TYPE yp = src_pix.x, cr = src_pix.y, cb = src_pix.z;
 683
 684 #ifdef DEPTH_5
 685                 __constant float * coeff = c_YCrCb2RGBCoeffs_f;
 686                 float r = fma(coeff[0], cr - HALF_MAX, yp);
 687                 float g = fma(coeff[1], cr - HALF_MAX, fma(coeff[2], cb - HALF_MAX, yp));
 688                 float b = fma(coeff[3], cb - HALF_MAX, yp);
 689 #else
 690                 __constant int * coeff = c_YCrCb2RGBCoeffs_i;
 691                 int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
 692                 int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX, coeff[2] * (cb - HALF_MAX)), yuv_shift);
 693                 int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
 694 #endif
 695
 696                 dstptr[(bidx^2)] = SAT_CAST(r);
 697                 dstptr[1] = SAT_CAST(g);
 698                 dstptr[bidx] = SAT_CAST(b);
 699 #if dcn == 4
 700                 dstptr[3] = MAX_NUM;
 701 #endif
 702
 703                 ++y;
 704                 dst_index += dst_step;
 705                 src_index += src_step;
 706             }
 707         }
 708     }
 709 }
 710
 711 ///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
 712
 713 __kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
 714                       __global uchar * dstptr, int dst_step, int dst_offset,
 715                       int rows, int cols, __constant COEFF_TYPE * coeffs)
 716 {
 717     int dx = get_global_id(0);
 718     int dy = get_global_id(1) * PIX_PER_WI_Y;
 719
 720     if (dx < cols)
 721     {
 722         int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
 723         int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
 724
 725         #pragma unroll
 726         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 727         {
 728             if (dy < rows)
 729             {
 730                 __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
 731                 __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
 732
 733                 DATA_TYPE_4 src_pix = vload4(0, src);
 734                 DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;
 735
 736 #ifdef DEPTH_5
 737                 float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
 738                 float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
 739                 float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
 740 #else
 741                 int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
 742                 int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
 743                 int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
 744 #endif
 745                 dst[0] = SAT_CAST(x);
 746                 dst[1] = SAT_CAST(y);
 747                 dst[2] = SAT_CAST(z);
 748
 749                 ++dy;
 750                 dst_index += dst_step;
 751                 src_index += src_step;
 752             }
 753         }
 754     }
 755 }
 756
 757 __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
 758                       __global uchar * dstptr, int dst_step, int dst_offset,
 759                       int rows, int cols, __constant COEFF_TYPE * coeffs)
 760 {
 761     int dx = get_global_id(0);
 762     int dy = get_global_id(1) * PIX_PER_WI_Y;
 763
 764     if (dx < cols)
 765     {
 766         int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
 767         int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
 768
 769         #pragma unroll
 770         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 771         {
 772             if (dy < rows)
 773             {
 774                 __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
 775                 __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
 776
 777                 DATA_TYPE_4 src_pix = vload4(0, src);
 778                 DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;
 779
 780 #ifdef DEPTH_5
 781                 float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
 782                 float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
 783                 float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
 784 #else
 785                 int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
 786                 int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
 787                 int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
 788 #endif
 789
 790                 DATA_TYPE dst0 = SAT_CAST(b);
 791                 DATA_TYPE dst1 = SAT_CAST(g);
 792                 DATA_TYPE dst2 = SAT_CAST(r);
 793 #if dcn == 3 || defined DEPTH_5
 794                 dst[0] = dst0;
 795                 dst[1] = dst1;
 796                 dst[2] = dst2;
 797 #if dcn == 4
 798                 dst[3] = MAX_NUM;
 799 #endif
 800 #else
 801                 *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
 802 #endif
 803
 804                 ++dy;
 805                 dst_index += dst_step;
 806                 src_index += src_step;
 807             }
 808         }
 809     }
 810 }
 811
 812 ///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
 813
 814 __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
 815                   __global uchar* dstptr, int dst_step, int dst_offset,
 816                   int rows, int cols)
 817 {
 818     int x = get_global_id(0);
 819     int y = get_global_id(1) * PIX_PER_WI_Y;
 820
 821     if (x < cols)
 822     {
 823         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 824         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
 825
 826         #pragma unroll
 827         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 828         {
 829             if (y < rows)
 830             {
 831                 __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
 832                 __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
 833                 DATA_TYPE_4 src_pix = vload4(0, src);
 834
 835 #ifdef REVERSE
 836                 dst[0] = src_pix.z;
 837                 dst[1] = src_pix.y;
 838                 dst[2] = src_pix.x;
 839 #else
 840                 dst[0] = src_pix.x;
 841                 dst[1] = src_pix.y;
 842                 dst[2] = src_pix.z;
 843 #endif
 844
 845 #if dcn == 4
 846 #if scn == 3
 847                 dst[3] = MAX_NUM;
 848 #else
 849                 dst[3] = src[3];
 850 #endif
 851 #endif
 852
 853                 ++y;
 854                 dst_index += dst_step;
 855                 src_index += src_step;
 856             }
 857         }
 858     }
 859 }
 860
 861 ///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
 862
 863 __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
 864                          __global uchar* dst, int dst_step, int dst_offset,
 865                          int rows, int cols)
 866 {
 867     int x = get_global_id(0);
 868     int y = get_global_id(1) * PIX_PER_WI_Y;
 869
 870     if (x < cols)
 871     {
 872         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 873         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
 874
 875         #pragma unroll
 876         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 877         {
 878             if (y < rows)
 879             {
 880                 ushort t = *((__global const ushort*)(src + src_index));
 881
 882 #if greenbits == 6
 883                 dst[dst_index + bidx] = (uchar)(t << 3);
 884                 dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
 885                 dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
 886 #else
 887                 dst[dst_index + bidx] = (uchar)(t << 3);
 888                 dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
 889                 dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
 890 #endif
 891
 892 #if dcn == 4
 893 #if greenbits == 6
 894                 dst[dst_index + 3] = 255;
 895 #else
 896                 dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
 897 #endif
 898 #endif
 899
 900                 ++y;
 901                 dst_index += dst_step;
 902                 src_index += src_step;
 903             }
 904         }
 905     }
 906 }
 907
 908 __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
 909                          __global uchar* dst, int dst_step, int dst_offset,
 910                          int rows, int cols)
 911 {
 912     int x = get_global_id(0);
 913     int y = get_global_id(1) * PIX_PER_WI_Y;
 914
 915     if (x < cols)
 916     {
 917         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 918         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
 919
 920         #pragma unroll
 921         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 922         {
 923             if (y < rows)
 924             {
 925                 uchar4 src_pix = vload4(0, src + src_index);
 926
 927 #if greenbits == 6
 928                     *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
 929 #elif scn == 3
 930                     *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
 931 #else
 932                     *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
 933                         ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
 934 #endif
 935
 936                 ++y;
 937                 dst_index += dst_step;
 938                 src_index += src_step;
 939             }
 940         }
 941     }
 942 }
 943
 944 ///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////
 945
 946 __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
 947                           __global uchar* dst, int dst_step, int dst_offset,
 948                           int rows, int cols)
 949 {
 950     int x = get_global_id(0);
 951     int y = get_global_id(1) * PIX_PER_WI_Y;
 952
 953     if (x < cols)
 954     {
 955         int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
 956         int dst_index = mad24(y, dst_step, dst_offset + x);
 957
 958         #pragma unroll
 959         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 960         {
 961             if (y < rows)
 962             {
 963                 int t = *((__global const ushort*)(src + src_index));
 964
 965 #if greenbits == 6
 966                 dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
 967 #else
 968                 dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
 969 #endif
 970                 ++y;
 971                 dst_index += dst_step;
 972                 src_index += src_step;
 973             }
 974         }
 975     }
 976 }
 977
 978 __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
 979                           __global uchar* dst, int dst_step, int dst_offset,
 980                           int rows, int cols)
 981 {
 982     int x = get_global_id(0);
 983     int y = get_global_id(1) * PIX_PER_WI_Y;
 984
 985     if (x < cols)
 986     {
 987         int src_index = mad24(y, src_step, src_offset + x);
 988         int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
 989
 990         #pragma unroll
 991         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
 992         {
 993             if (y < rows)
 994             {
 995                 int t = src[src_index];
 996
 997 #if greenbits == 6
 998                 *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
 999 #else
1000                 t >>= 3;
1001                 *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
1002 #endif
1003                 ++y;
1004                 dst_index += dst_step;
1005                 src_index += src_step;
1006             }
1007         }
1008     }
1009 }
1010
1011 //////////////////////////////////// RGB <-> HSV //////////////////////////////////////
1012
// Per-sector lookup used by the HSV2RGB and HLS2RGB kernels. The hue circle is
// split into six sectors; row `sector` holds the indices into the kernel's
// local tab[4] array from which the blue, green and red values are taken
// (columns 0, 1 and 2 respectively).
__constant int sector_data[6][3] =
{
    { 1, 3, 0 },
    { 1, 0, 2 },
    { 3, 0, 1 },
    { 0, 2, 1 },
    { 0, 1, 3 },
    { 2, 1, 0 }
};
1019
1020 #ifdef DEPTH_0
1021
// Converts 8-bit RGB/BGR pixels to 8-bit HSV using integer arithmetic and two
// precomputed division tables.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//   sdiv_table             - fixed-point factors indexed by the maximum channel
//                            value, used to derive saturation
//   hdiv_table             - fixed-point factors indexed by max - min, used to
//                            derive hue
//
// Both table products are rounded and scaled back by hsv_shift bits.
__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols,
                      __constant int * sdiv_table, __constant int * hdiv_table)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (col >= cols)
        return;

    int src_off = mad24(row, src_step, mad24(col, scnbytes, src_offset));
    int dst_off = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (row >= rows)
            break;

        // NOTE(review): vload4 reads four bytes even if scn == 3 - confirm the
        // extra byte is always inside the source buffer.
        const uchar4 src_pix = vload4(0, src + src_off);
        const int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;

        const int v = max(max(b, g), r);
        const int vmin = min(min(b, g), r);
        const int diff = v - vmin;

        const int s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;

        // The hue numerator depends on which channel holds the maximum; red is
        // checked first, then green, and blue is the fallback.
        int h;
        if (v == r)
            h = g - b;
        else if (v == g)
            h = mad24(diff, 2, b - r);
        else
            h = mad24(4, diff, r - g);

        h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
        if (h < 0)
            h += hrange;   // wrap negative hues into the valid range

        dst[dst_off] = convert_uchar_sat_rte(h);
        dst[dst_off + 1] = (uchar)s;
        dst[dst_off + 2] = (uchar)v;

        ++row;
        src_off += src_step;
        dst_off += dst_step;
    }
}
1073
// Converts 8-bit HSV pixels to 8-bit RGB/BGR.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//
// Saturation and value are normalised by 1/255; hue is multiplied by hscale
// and folded into [0, 6) to pick one of six sectors. Blue is written to
// channel bidx and red to channel bidx ^ 2; with dcn == 4 the fourth channel
// is MAX_NUM.
__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (col >= cols)
        return;

    int src_off = mad24(row, src_step, mad24(col, scnbytes, src_offset));
    int dst_off = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (row >= rows)
            break;

        // NOTE(review): vload4 reads four bytes even if scn == 3 - confirm the
        // extra byte is always inside the source buffer.
        const uchar4 src_pix = vload4(0, src + src_off);

        float h = src_pix.x;
        const float s = src_pix.y*(1/255.f);
        const float v = src_pix.z*(1/255.f);

        // Zero saturation means a grey pixel: all channels equal the value.
        float b = v, g = v, r = v;

        if (s != 0)
        {
            h *= hscale;
            if (h < 0)
            {
                while (h < 0)
                    h += 6;
            }
            else
            {
                while (h >= 6)
                    h -= 6;
            }

            int sector = convert_int_sat_rtn(h);
            h -= sector;
            // Guard against a sector outside the lookup table.
            if ((unsigned)sector >= 6u)
            {
                sector = 0;
                h = 0.f;
            }

            const float tab[4] = { v,
                                   v*(1.f - s),
                                   v*(1.f - s*h),
                                   v*(1.f - s*(1.f - h)) };

            __constant int * pick = sector_data[sector];
            b = tab[pick[0]];
            g = tab[pick[1]];
            r = tab[pick[2]];
        }

        dst[dst_off + bidx] = convert_uchar_sat_rte(b*255.f);
        dst[dst_off + 1] = convert_uchar_sat_rte(g*255.f);
        dst[dst_off + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
        dst[dst_off + 3] = MAX_NUM;
#endif

        ++row;
        src_off += src_step;
        dst_off += dst_step;
    }
}
1139
1140 #elif defined DEPTH_5
1141
// Converts float RGB/BGR pixels to float HSV.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   srcptr, dstptr         - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//
// Output channels are hue (degrees in [0, 360) multiplied by hscale),
// saturation and value. FLT_EPSILON is added to both divisors so that black
// and grey pixels do not divide by zero.
__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (col >= cols)
        return;

    int src_off = mad24(row, src_step, mad24(col, scnbytes, src_offset));
    int dst_off = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (row >= rows)
            break;

        __global const float * in_px = (__global const float *)(srcptr + src_off);
        __global float * out_px = (__global float *)(dstptr + dst_off);

        // NOTE(review): vload4 reads four floats even if scn == 3 - confirm the
        // extra element is always inside the source buffer.
        const float4 src_pix = vload4(0, in_px);
        const float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;

        // Maximum and minimum of the three channels, seeded with red.
        float v = r;
        v = v < g ? g : v;
        v = v < b ? b : v;

        float vmin = r;
        vmin = vmin > g ? g : vmin;
        vmin = vmin > b ? b : vmin;

        const float range = v - vmin;
        const float s = range/(float)(fabs(v) + FLT_EPSILON);
        const float k = (float)(60.f/(range + FLT_EPSILON));

        // Red is tested first, then green; blue is the fallback.
        float h;
        if (v == r)
            h = (g - b)*k;
        else if (v == g)
            h = fma(b - r, k, 120.f);
        else
            h = fma(r - g, k, 240.f);

        if (h < 0)
            h += 360.f;

        out_px[0] = h*hscale;
        out_px[1] = s;
        out_px[2] = v;

        ++row;
        src_off += src_step;
        dst_off += dst_step;
    }
}
1198
// Converts float HSV pixels to float RGB/BGR.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   srcptr, dstptr         - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//
// Hue is multiplied by hscale and folded into [0, 6) to pick one of six
// sectors. Blue is written to channel bidx and red to channel bidx ^ 2; with
// dcn == 4 the fourth channel is MAX_NUM.
__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (col >= cols)
        return;

    int src_off = mad24(row, src_step, mad24(col, scnbytes, src_offset));
    int dst_off = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (row >= rows)
            break;

        __global const float * in_px = (__global const float *)(srcptr + src_off);
        __global float * out_px = (__global float *)(dstptr + dst_off);

        // NOTE(review): vload4 reads four floats even if scn == 3 - confirm the
        // extra element is always inside the source buffer.
        const float4 src_pix = vload4(0, in_px);

        float h = src_pix.x;
        const float s = src_pix.y;
        const float v = src_pix.z;

        // Zero saturation means a grey pixel: all channels equal the value.
        float b = v, g = v, r = v;

        if (s != 0)
        {
            h *= hscale;
            if (h < 0)
            {
                while (h < 0)
                    h += 6;
            }
            else
            {
                while (h >= 6)
                    h -= 6;
            }

            int sector = convert_int_sat_rtn(h);
            h -= sector;
            // Guard against a sector outside the lookup table.
            if ((unsigned)sector >= 6u)
            {
                sector = 0;
                h = 0.f;
            }

            const float tab[4] = { v,
                                   v*(1.f - s),
                                   v*(1.f - s*h),
                                   v*(1.f - s*(1.f - h)) };

            __constant int * pick = sector_data[sector];
            b = tab[pick[0]];
            g = tab[pick[1]];
            r = tab[pick[2]];
        }

        out_px[bidx] = b;
        out_px[1] = g;
        out_px[bidx^2] = r;
#if dcn == 4
        out_px[3] = MAX_NUM;
#endif

        ++row;
        src_off += src_step;
        dst_off += dst_step;
    }
}
1267
1268 #endif
1269
1270 ///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
1271
1272 #ifdef DEPTH_0
1273
// Converts 8-bit RGB/BGR pixels to 8-bit HLS.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//
// Channels are normalised by 1/255 and processed in float. The output bytes
// are hue (degrees multiplied by hscale), lightness * 255 and
// saturation * 255, each rounded to nearest and saturated.
__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (col >= cols)
        return;

    int src_off = mad24(row, src_step, mad24(col, scnbytes, src_offset));
    int dst_off = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (row >= rows)
            break;

        // NOTE(review): vload4 reads four bytes even if scn == 3 - confirm the
        // extra byte is always inside the source buffer.
        const uchar4 src_pix = vload4(0, src + src_off);

        const float b = src_pix.B_COMP*(1/255.f);
        const float g = src_pix.G_COMP*(1/255.f);
        const float r = src_pix.R_COMP*(1/255.f);

        // Maximum and minimum of the three channels, seeded with red.
        float vmax = r;
        vmax = vmax < g ? g : vmax;
        vmax = vmax < b ? b : vmax;

        float vmin = r;
        vmin = vmin > g ? g : vmin;
        vmin = vmin > b ? b : vmin;

        float diff = vmax - vmin;
        const float l = (vmax + vmin)*0.5f;

        // Hue and saturation stay at zero for (nearly) grey pixels.
        float h = 0.f, s = 0.f;

        if (diff > FLT_EPSILON)
        {
            if (l < 0.5f)
                s = diff/(vmax + vmin);
            else
                s = diff/(2 - vmax - vmin);

            diff = 60.f/diff;

            // Red is tested first, then green; blue is the fallback.
            if (vmax == r)
                h = (g - b)*diff;
            else if (vmax == g)
                h = fma(b - r, diff, 120.f);
            else
                h = fma(r - g, diff, 240.f);

            if (h < 0.f)
                h += 360.f;
        }

        dst[dst_off] = convert_uchar_sat_rte(h*hscale);
        dst[dst_off + 1] = convert_uchar_sat_rte(l*255.f);
        dst[dst_off + 2] = convert_uchar_sat_rte(s*255.f);

        ++row;
        src_off += src_step;
        dst_off += dst_step;
    }
}
1333
// Converts 8-bit HLS pixels to 8-bit RGB/BGR.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//
// Lightness and saturation are normalised by 1/255; hue is multiplied by
// hscale and folded into [0, 6) to pick one of six sectors. Blue is written to
// channel bidx and red to channel bidx ^ 2; with dcn == 4 the fourth channel
// is MAX_NUM.
__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                      __global uchar* dst, int dst_step, int dst_offset,
                      int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (col >= cols)
        return;

    int src_off = mad24(row, src_step, mad24(col, scnbytes, src_offset));
    int dst_off = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (row >= rows)
            break;

        // NOTE(review): vload4 reads four bytes even if scn == 3 - confirm the
        // extra byte is always inside the source buffer.
        const uchar4 src_pix = vload4(0, src + src_off);

        float h = src_pix.x;
        const float l = src_pix.y*(1.f/255.f);
        const float s = src_pix.z*(1.f/255.f);

        // Zero saturation means a grey pixel: all channels equal the lightness.
        float b = l, g = l, r = l;

        if (s != 0)
        {
            const float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
            const float p1 = 2*l - p2;

            h *= hscale;
            if (h < 0)
            {
                while (h < 0)
                    h += 6;
            }
            else
            {
                while (h >= 6)
                    h -= 6;
            }

            const int sector = convert_int_sat_rtn(h);
            h -= sector;

            const float tab[4] = { p2,
                                   p1,
                                   fma(p2 - p1, 1-h, p1),
                                   fma(p2 - p1, h, p1) };

            __constant int * pick = sector_data[sector];
            b = tab[pick[0]];
            g = tab[pick[1]];
            r = tab[pick[2]];
        }

        dst[dst_off + bidx] = convert_uchar_sat_rte(b*255.f);
        dst[dst_off + 1] = convert_uchar_sat_rte(g*255.f);
        dst[dst_off + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
        dst[dst_off + 3] = MAX_NUM;
#endif

        ++row;
        src_off += src_step;
        dst_off += dst_step;
    }
}
1398
1399 #elif defined DEPTH_5
1400
// Converts float RGB/BGR pixels to float HLS.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   srcptr, dstptr         - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//
// Output channels are hue (degrees in [0, 360) multiplied by hscale),
// lightness and saturation.
__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (col >= cols)
        return;

    int src_off = mad24(row, src_step, mad24(col, scnbytes, src_offset));
    int dst_off = mad24(row, dst_step, mad24(col, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (row >= rows)
            break;

        __global const float * in_px = (__global const float *)(srcptr + src_off);
        __global float * out_px = (__global float *)(dstptr + dst_off);

        // NOTE(review): vload4 reads four floats even if scn == 3 - confirm the
        // extra element is always inside the source buffer.
        const float4 src_pix = vload4(0, in_px);
        const float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;

        // Maximum and minimum of the three channels, seeded with red.
        float vmax = r;
        vmax = vmax < g ? g : vmax;
        vmax = vmax < b ? b : vmax;

        float vmin = r;
        vmin = vmin > g ? g : vmin;
        vmin = vmin > b ? b : vmin;

        float diff = vmax - vmin;
        const float l = (vmax + vmin)*0.5f;

        // Hue and saturation stay at zero for (nearly) grey pixels.
        float h = 0.f, s = 0.f;

        if (diff > FLT_EPSILON)
        {
            if (l < 0.5f)
                s = diff/(vmax + vmin);
            else
                s = diff/(2 - vmax - vmin);

            diff = 60.f/diff;

            // Red is tested first, then green; blue is the fallback.
            if (vmax == r)
                h = (g - b)*diff;
            else if (vmax == g)
                h = fma(b - r, diff, 120.f);
            else
                h = fma(r - g, diff, 240.f);

            if (h < 0.f)
                h += 360.f;
        }

        out_px[0] = h*hscale;
        out_px[1] = l;
        out_px[2] = s;

        ++row;
        src_off += src_step;
        dst_off += dst_step;
    }
}
1461
// Converts float HLS pixels to float RGB/BGR.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   srcptr, dstptr         - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//
// Hue is multiplied by hscale and folded into [0, 6) to pick one of six
// sectors. Blue is written to channel bidx and red to channel bidx ^ 2; with
// dcn == 4 the fourth channel is MAX_NUM.
__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
                      __global uchar* dstptr, int dst_step, int dst_offset,
                      int rows, int cols)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (x >= cols)
        return;

    int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
    int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (y >= rows)
            break;

        __global const float * src = (__global const float *)(srcptr + src_index);
        __global float * dst = (__global float *)(dstptr + dst_index);

        // NOTE(review): vload4 reads four floats even if scn == 3 - confirm the
        // extra element is always inside the source buffer.
        float4 src_pix = vload4(0, src);

        float h = src_pix.x, l = src_pix.y, s = src_pix.z;
        float b, g, r;

        if (s != 0)
        {
            float tab[4];
            int sector;

            float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
            float p1 = 2*l - p2;

            h *= hscale;
            if( h < 0 )
                do h += 6; while( h < 0 );
            else if( h >= 6 )
                do h -= 6; while( h >= 6 );

            sector = convert_int_sat_rtn(h);
            h -= sector;

            // A tiny negative hue rounds to exactly 6.0f after the wrap above,
            // which would index one row past the end of sector_data. Clamp to
            // the first sector, exactly as the HSV2RGB kernels do.
            if ((unsigned)sector >= 6u)
            {
                sector = 0;
                h = 0.f;
            }

            tab[0] = p2;
            tab[1] = p1;
            tab[2] = fma(p2 - p1, 1-h, p1);
            tab[3] = fma(p2 - p1, h, p1);

            b = tab[sector_data[sector][0]];
            g = tab[sector_data[sector][1]];
            r = tab[sector_data[sector][2]];
        }
        else
        {
            // Zero saturation means a grey pixel: all channels equal the lightness.
            b = g = r = l;
        }

        dst[bidx] = b;
        dst[1] = g;
        dst[bidx^2] = r;
#if dcn == 4
        dst[3] = MAX_NUM;
#endif

        ++y;
        dst_index += dst_step;
        src_index += src_step;
    }
}
1529
1530 #endif
1531
1532 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
1533
1534 #ifdef DEPTH_0
1535
// Premultiplies the colour channels of 8-bit 4-channel pixels by their alpha.
//
// Each work-item owns column get_global_id(0) and walks up to PIX_PER_WI_Y
// consecutive rows, starting at row get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image base pointers, addressed in bytes
//   src_step, dst_step     - byte distance between two consecutive rows
//   src_offset, dst_offset - byte offset of the first pixel
//   rows, cols             - image size in pixels
//
// Every colour channel becomes (c * alpha + HALF_MAX) / MAX_NUM; alpha itself
// is copied unchanged. Pixels are accessed as whole uchar4 values.
__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * PIX_PER_WI_Y;

    // Work-items launched past the right edge have nothing to do.
    if (col >= cols)
        return;

    // Four bytes per pixel on both sides.
    int src_off = mad24(row, src_step, src_offset + (col << 2));
    int dst_off = mad24(row, dst_step, dst_offset + (col << 2));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        // Rows only grow, so the first one past the bottom ends the walk.
        if (row >= rows)
            break;

        const uchar4 src_pix = *(__global const uchar4 *)(src + src_off);
        __global uchar4 * out_px = (__global uchar4 *)(dst + dst_off);

        *out_px = (uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX) / MAX_NUM,
                           mad24(src_pix.y, src_pix.w, HALF_MAX) / MAX_NUM,
                           mad24(src_pix.z, src_pix.w, HALF_MAX) / MAX_NUM,
                           src_pix.w);

        ++row;
        src_off += src_step;
        dst_off += dst_step;
    }
}
1567
// Converts premultiplied-alpha 8-bit pixels (mRGBA) back to straight-alpha RGBA.
//
// Each colour channel c is replaced by (c * MAX_NUM + alpha / 2) / alpha and
// the alpha channel is copied through. A pixel whose alpha is 0 is written as
// all zeros (this also avoids the division by zero). A work-item handles the
// column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows starting at
// get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image buffers, 4 bytes per pixel
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
                         __global uchar* dst, int dst_step, int dst_offset,
                         int rows, int cols)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    int src_index = mad24(y, src_step, mad24(x, 4, src_offset));
    int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        const uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
        const uchar v3 = src_pix.w, v3_half = v3 / 2;
        __global uchar4 * out = (__global uchar4 *)(dst + dst_index);

        if (v3 == 0)
        {
            *out = (uchar4)(0, 0, 0, 0);
        }
        else
        {
            // A colour channel larger than its alpha is not a valid
            // premultiplied value, but it can occur in arbitrary input; the
            // quotient then exceeds the 8-bit range and a plain conversion to
            // uchar would wrap around. SAT_CAST (the cast the other 8-bit
            // kernels in this file use for their outputs) clamps it instead.
            *out = (uchar4)(SAT_CAST(mad24(src_pix.x, MAX_NUM, v3_half) / v3),
                            SAT_CAST(mad24(src_pix.y, MAX_NUM, v3_half) / v3),
                            SAT_CAST(mad24(src_pix.z, MAX_NUM, v3_half) / v3),
                            v3);
        }

        ++y;
        dst_index += dst_step;
        src_index += src_step;
    }
}
1603
1604 #endif
1605
1606 /////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////
1607
// Constants shared by the Lab/Luv kernels.
//   lab_shift      - descale shift applied to the fixed-point XYZ sums
//   gamma_shift    - extra bits added on top of lab_shift for lab_shift2
//   lab_shift2     - descale shift applied to the final fixed-point L/a/b values
//   GAMMA_TAB_SIZE - segment count passed to splineInterpolate for gamma tables
//   GammaTabScale  - factor that maps a channel value to a gamma-table coordinate
#define lab_shift       xyz_shift
#define gamma_shift     3
#define lab_shift2      (lab_shift + gamma_shift)
#define GAMMA_TAB_SIZE  1024
#define GammaTabScale   ((float)GAMMA_TAB_SIZE)
1613
// Evaluates a piecewise cubic stored as four coefficients per segment.
//
//   x   - table coordinate; its floor selects the segment, clamped to [0, n-1]
//   tab - coefficient table, segment i occupying tab[4*i .. 4*i+3]
//   n   - number of segments
//
// Returns c0 + c1*t + c2*t^2 + c3*t^3, where t is x minus the selected segment
// index. Because the index is clamped, t falls outside [0, 1) whenever x lies
// outside [0, n).
inline float splineInterpolate(float x, __global const float * tab, int n)
{
    const int seg = clamp(convert_int_sat_rtn(x), 0, n-1);
    const float t = x - seg;
    __global const float * c = tab + (seg << 2);

    // Horner evaluation, highest-order coefficient first.
    return fma(fma(fma(c[3], t, c[2]), t, c[1]), t, c[0]);
}
1621
1622 #ifdef DEPTH_0
1623
// 8-bit colour -> Lab conversion using integer lookup tables.
//
// For every pixel the first three source bytes are mapped through gammaTab,
// combined into three fixed-point sums with the 3x3 matrix in coeffs, descaled
// by lab_shift and mapped through LabCbrtTab_b. L, a and b are then derived
// from those table values, descaled by lab_shift2 and written saturated to the
// first three destination bytes. A work-item handles the column
// get_global_id(0) and up to PIX_PER_WI_Y consecutive rows starting at
// get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image buffers (scnbytes / dcnbytes bytes per pixel)
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
//   gammaTab               - table indexed by the 8-bit channel value
//   LabCbrtTab_b           - table indexed by the descaled matrix sums
//   coeffs                 - 9 integer matrix coefficients, row-major
//   Lscale, Lshift         - scale and offset applied to the Y table value to get L
//
// NOTE(review): channels are consumed in memory order regardless of where blue
// sits; presumably the host orders coeffs accordingly - confirm.
__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
                      __constant int * coeffs, int Lscale, int Lshift)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
    int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    // The matrix does not depend on the pixel, so read it once per work-item.
    const int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        __global const uchar* src_ptr = src + src_index;
        __global uchar* dst_ptr = dst + dst_index;

        // Only three channels are used, so load exactly three bytes. A 4-wide
        // vector load would touch one byte beyond a 3-channel pixel, which for
        // the last pixel of the buffer lies outside the allocation.
        const int R = gammaTab[src_ptr[0]];
        const int G = gammaTab[src_ptr[1]];
        const int B = gammaTab[src_ptr[2]];

        const int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
        const int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
        const int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];

        // a and b are biased by 128 so that they fit the unsigned 8-bit range.
        const int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
        const int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
        const int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );

        dst_ptr[0] = SAT_CAST(L);
        dst_ptr[1] = SAT_CAST(a);
        dst_ptr[2] = SAT_CAST(b);

        ++y;
        dst_index += dst_step;
        src_index += src_step;
    }
}
1670
1671 #elif defined DEPTH_5
1672
// Floating-point colour -> Lab conversion.
//
// For every pixel the first three source channels are clamped to [0, 1],
// optionally passed through the gamma spline (SRGB builds only), multiplied by
// the 3x3 matrix in coeffs to obtain X, Y, Z, and converted to L, a, b, which
// are written to the first three destination floats. A work-item handles the
// column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows starting at
// get_global_id(1) * PIX_PER_WI_Y.
//
//   srcptr, dstptr         - image buffers addressed in bytes
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
//   gammaTab               - (SRGB only) spline table with GAMMA_TAB_SIZE segments
//   coeffs                 - 9 matrix coefficients, row-major
//   _1_3                   - not used by this kernel
//   _a                     - constant term of the linear branch of f(t)
__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _1_3, float _a)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
    int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    // The matrix does not depend on the pixel, so read it once per work-item.
    const float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
                C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
                C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        __global const float * src = (__global const float *)(srcptr + src_index);
        __global float * dst = (__global float *)(dstptr + dst_index);

        // Only three channels are used, so load exactly three floats. A 4-wide
        // vector load would touch one float beyond a 3-channel pixel, which
        // for the last pixel of the buffer lies outside the allocation.
        float R = clamp(src[0], 0.0f, 1.0f);
        float G = clamp(src[1], 0.0f, 1.0f);
        float B = clamp(src[2], 0.0f, 1.0f);

#ifdef SRGB
        R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

        const float X = fma(R, C0, fma(G, C1, B*C2));
        const float Y = fma(R, C3, fma(G, C4, B*C5));
        const float Z = fma(R, C6, fma(G, C7, B*C8));

        // f(t): cube root above the threshold, linear segment below it.
        const float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
        const float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
        const float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);

        const float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
        const float a = 500.f * (FX - FY);
        const float b = 200.f * (FY - FZ);

        dst[0] = L;
        dst[1] = a;
        dst[2] = b;

        ++y;
        dst_index += dst_step;
        src_index += src_step;
    }
}
1734
1735 #endif
1736
// Converts one Lab triple to three colour channels in floating point.
//
//   srcbuf   - input: srcbuf[0] = L, srcbuf[1] = a, srcbuf[2] = b
//   dstbuf   - output: three channels, each clamped to [0, 1] and then, in SRGB
//              builds, passed through the gamma spline
//   gammaTab - (SRGB only) spline table with GAMMA_TAB_SIZE segments
//   coeffs   - 9 matrix coefficients, row-major, applied to (X, Y, Z)
//   lThresh  - L values up to this threshold use the linear branch for Y
//   fThresh  - f values up to this threshold use the linear branch for X and Z
inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    const float L = srcbuf[0], a = srcbuf[1], b = srcbuf[2];

    // Recover Y together with its f(Y) value from L.
    float Y, fY;
    if (L <= lThresh)
    {
        Y = L / 903.3f;
        fY = fma(7.787f, Y, 16.0f / 116.0f);
    }
    else
    {
        fY = (L + 16.0f) / 116.0f;
        Y = fY * fY * fY;
    }

    // f(X) and f(Z) follow from a and b; invert f() on each of them.
    float X = a / 500.0f + fY;
    float Z = fY - b / 200.0f;

    if (X <= fThresh)
        X = (X - 16.0f / 116.0f) / 7.787f;
    else
        X = X * X * X;

    if (Z <= fThresh)
        Z = (Z - 16.0f / 116.0f) / 7.787f;
    else
        Z = Z * Z * Z;

    float c0 = clamp(fma(coeffs[0], X, fma(coeffs[1], Y, coeffs[2] * Z)), 0.0f, 1.0f);
    float c1 = clamp(fma(coeffs[3], X, fma(coeffs[4], Y, coeffs[5] * Z)), 0.0f, 1.0f);
    float c2 = clamp(fma(coeffs[6], X, fma(coeffs[7], Y, coeffs[8] * Z)), 0.0f, 1.0f);

#ifdef SRGB
    c0 = splineInterpolate(c0 * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    c1 = splineInterpolate(c1 * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
    c2 = splineInterpolate(c2 * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

    dstbuf[0] = c0;
    dstbuf[1] = c1;
    dstbuf[2] = c2;
}
1783
1784 #ifdef DEPTH_0
1785
// 8-bit Lab -> colour conversion.
//
// Each pixel's three Lab bytes are mapped to floating point (L scaled by
// 100/255, a and b shifted by -128), converted with Lab2BGR_f, scaled by 255
// and stored saturated. When dcn is not 3 a fourth channel set to MAX_NUM is
// written as well. A work-item handles the column get_global_id(0) and up to
// PIX_PER_WI_Y consecutive rows starting at get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image buffers (scnbytes / dcnbytes bytes per pixel)
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
//   gammaTab               - (SRGB only) spline table forwarded to Lab2BGR_f
//   coeffs, lThresh, fThresh - forwarded unchanged to Lab2BGR_f
__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
    int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        __global const uchar* src_ptr = src + src_index;
        __global uchar * dst_ptr = dst + dst_index;

        // Only three channels are used, so load exactly three bytes. A 4-wide
        // vector load would touch one byte beyond a 3-channel pixel, which for
        // the last pixel of the buffer lies outside the allocation.
        float srcbuf[3], dstbuf[3];
        srcbuf[0] = src_ptr[0]*(100.f/255.f);
        srcbuf[1] = convert_float(src_ptr[1] - 128);
        srcbuf[2] = convert_float(src_ptr[2] - 128);

        Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
            gammaTab,
#endif
            coeffs, lThresh, fThresh);

#if dcn == 3
        dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
        dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
        dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
#else
        *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
            SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
#endif
        ++y;
        dst_index += dst_step;
        src_index += src_step;
    }
}
1836
1837 #elif defined DEPTH_5
1838
// Floating-point Lab -> colour conversion.
//
// Each pixel's three Lab floats are converted with Lab2BGR_f and the result is
// written to the first three destination floats; when dcn is 4 a fourth
// channel set to MAX_NUM is written as well. A work-item handles the column
// get_global_id(0) and up to PIX_PER_WI_Y consecutive rows starting at
// get_global_id(1) * PIX_PER_WI_Y.
//
//   srcptr, dstptr         - image buffers addressed in bytes
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
//   gammaTab               - (SRGB only) spline table forwarded to Lab2BGR_f
//   coeffs, lThresh, fThresh - forwarded unchanged to Lab2BGR_f
__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float lThresh, float fThresh)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
    int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        __global const float * src = (__global const float *)(srcptr + src_index);
        __global float * dst = (__global float *)(dstptr + dst_index);

        // Only three channels are used, so load exactly three floats. A 4-wide
        // vector load would touch one float beyond a 3-channel pixel, which
        // for the last pixel of the buffer lies outside the allocation.
        float srcbuf[3], dstbuf[3];
        srcbuf[0] = src[0];
        srcbuf[1] = src[1];
        srcbuf[2] = src[2];

        Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
            gammaTab,
#endif
            coeffs, lThresh, fThresh);

        dst[0] = dstbuf[0];
        dst[1] = dstbuf[1];
        dst[2] = dstbuf[2];
#if dcn == 4
        dst[3] = MAX_NUM;
#endif
        ++y;
        dst_index += dst_step;
        src_index += src_step;
    }
}
1883
1884 #endif
1885
1886 /////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////
1887
// Table constants used by the Luv kernels.
//   LAB_CBRT_TAB_SIZE   - segment count passed to splineInterpolate for LabCbrtTab
//   LAB_CBRT_TAB_SIZE_B - integer-table size; not referenced by the Luv kernels below
//   LabCbrtTabScale     - factor that maps Y to a LabCbrtTab coordinate
#define LAB_CBRT_TAB_SIZE   1024
#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))

__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE / 1.5f;
1892
1893 #ifdef DEPTH_5
1894
// Floating-point colour -> Luv conversion.
//
// For every pixel the first three source floats are optionally passed through
// the gamma spline (SRGB builds only) and multiplied by the 3x3 matrix in
// coeffs to obtain X, Y, Z. L comes from the LabCbrtTab spline evaluated at Y;
// u and v are L times the offset of the pixel's chromaticity terms from
// (_un, _vn). The result is written to the first three destination floats.
// A work-item handles the column get_global_id(0) and up to PIX_PER_WI_Y
// consecutive rows starting at get_global_id(1) * PIX_PER_WI_Y.
//
//   srcptr, dstptr         - image buffers addressed in bytes
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
//   gammaTab               - (SRGB only) spline table with GAMMA_TAB_SIZE segments
//   LabCbrtTab             - spline table with LAB_CBRT_TAB_SIZE segments
//   coeffs                 - 9 matrix coefficients, row-major
//   _un, _vn               - reference chromaticity offsets subtracted for u and v
__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
    int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        __global const float * src = (__global const float *)(srcptr + src_index);
        __global float * dst = (__global float *)(dstptr + dst_index);

        float R = src[0], G = src[1], B = src[2];

#ifdef SRGB
        R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
        const float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
        const float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
        const float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));

        const float tabY = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
        const float L = fma(116.f, tabY, -16.f);

        // The denominator is kept away from zero so that d stays finite.
        const float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);

        dst[0] = L;
        dst[1] = L*fma(X, d, -_un);
        dst[2] = L*fma(2.25f, Y*d, -_vn);

        ++y;
        dst_index += dst_step;
        src_index += src_step;
    }
}
1945
1946 #elif defined DEPTH_0
1947
// 8-bit colour -> Luv conversion.
//
// For every pixel the first three source bytes are scaled by 1/255, optionally
// passed through the gamma spline (SRGB builds only) and multiplied by the 3x3
// matrix in coeffs to obtain X, Y, Z. L, u and v are computed in floating point
// and then mapped to bytes with fixed scale/offset pairs and a saturating cast.
// A work-item handles the column get_global_id(0) and up to PIX_PER_WI_Y
// consecutive rows starting at get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image buffers (scnbytes / dcnbytes bytes per pixel)
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
//   gammaTab               - (SRGB only) spline table with GAMMA_TAB_SIZE segments
//   LabCbrtTab             - spline table with LAB_CBRT_TAB_SIZE segments
//   coeffs                 - 9 matrix coefficients, row-major
//   _un, _vn               - reference chromaticity offsets subtracted for u and v
__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    // Per-row cursors into the source and destination images.
    __global const uchar * in = src + mad24(y, src_step, mad24(x, scnbytes, src_offset));
    __global uchar * out = dst + mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        const float scale = 1.0f / 255.0f;
        float R = in[0]*scale, G = in[1]*scale, B = in[2]*scale;

#ifdef SRGB
        R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
        const float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
        const float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
        const float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));

        const float tabY = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
        const float L = 116.f*tabY - 16.f;

        // The denominator is kept away from zero so that d stays finite.
        const float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
        const float u = L*(X*d - _un);
        const float v = L*fma(2.25f, Y*d, -_vn);

        // Map L, u, v onto the 8-bit range.
        out[0] = SAT_CAST(L * 2.55f);
        out[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
        out[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));

        ++y;
        out += dst_step;
        in += src_step;
    }
}
1996
1997 #endif
1998
1999 #ifdef DEPTH_5
2000
// Floating-point Luv -> colour conversion.
//
// For every pixel X, Y, Z are reconstructed from L, u, v, multiplied by the
// 3x3 matrix in coeffs, clamped to [0, 1] and, in SRGB builds, passed through
// the gamma spline. The result is written to the first three destination
// floats; when dcn is 4 a fourth channel set to MAX_NUM is written as well.
// A work-item handles the column get_global_id(0) and up to PIX_PER_WI_Y
// consecutive rows starting at get_global_id(1) * PIX_PER_WI_Y.
//
//   srcptr, dstptr         - image buffers addressed in bytes
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
//   gammaTab               - (SRGB only) spline table with GAMMA_TAB_SIZE segments
//   coeffs                 - 9 matrix coefficients, row-major, applied to (X, Y, Z)
//   _un, _vn               - reference chromaticity offsets added back to u and v
//
// NOTE(review): d is computed as (1/13)/L without a guard, so L == 0 gives a
// non-finite d that propagates into the channel values - confirm whether a
// guard is wanted.
__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
    int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        __global const float * src = (__global const float *)(srcptr + src_index);
        __global float * dst = (__global float *)(dstptr + dst_index);

        const float L = src[0];

        float Y = (L + 16.f) * (1.f/116.f);
        Y = Y*Y*Y;

        // Undo the scaling by 13*L and add the reference offsets back.
        const float d = (1.f/13.f)/L;
        const float up = fma(src[1], d, _un);
        const float vp = fma(src[2], d, _vn);
        const float iv = 1.f/vp;

        const float X = 2.25f * up * Y * iv;
        const float Z = (12 - fma(3.0f, up, 20.0f * vp)) * Y * 0.25f * iv;

        float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
        float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
        float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));

        R = clamp(R, 0.f, 1.f);
        G = clamp(G, 0.f, 1.f);
        B = clamp(B, 0.f, 1.f);

#ifdef SRGB
        R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

        dst[0] = R;
        dst[1] = G;
        dst[2] = B;
#if dcn == 4
        dst[3] = MAX_NUM;
#endif
        ++y;
        dst_index += dst_step;
        src_index += src_step;
    }
}
2059
2060 #elif defined DEPTH_0
2061
// 8-bit Luv -> colour conversion.
//
// Each pixel's three Luv bytes are mapped back to floating point with fixed
// scale/offset pairs. X, Y, Z are reconstructed, multiplied by the 3x3 matrix
// in coeffs, clamped to [0, 1] and, in SRGB builds, passed through the gamma
// spline. The channels are then scaled by 255 and stored saturated; when dcn
// is 4 a fourth channel set to MAX_NUM is written as well. A work-item handles
// the column get_global_id(0) and up to PIX_PER_WI_Y consecutive rows starting
// at get_global_id(1) * PIX_PER_WI_Y.
//
//   src, dst               - image buffers (scnbytes / dcnbytes bytes per pixel)
//   src_step, dst_step     - distance between rows, in bytes
//   src_offset, dst_offset - byte offset of the first pixel in each buffer
//   rows, cols             - image size in pixels
//   gammaTab               - (SRGB only) spline table with GAMMA_TAB_SIZE segments
//   coeffs                 - 9 matrix coefficients, row-major, applied to (X, Y, Z)
//   _un, _vn               - reference chromaticity offsets added back to u and v
//
// NOTE(review): d is computed as (1/13)/L without a guard, so a source L byte
// of 0 gives a non-finite d that propagates into the channel values - confirm
// whether a guard is wanted.
__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
#ifdef SRGB
                      __global const float * gammaTab,
#endif
                      __constant float * coeffs, float _un, float _vn)
{
    const int x = get_global_id(0);
    int y = get_global_id(1) * PIX_PER_WI_Y;

    if (x >= cols)
        return;

    // Per-row cursors into the source and destination images.
    __global const uchar * in = src + mad24(y, src_step, mad24(x, scnbytes, src_offset));
    __global uchar * out = dst + mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));

    #pragma unroll
    for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
    {
        if (y >= rows)
            break;

        // Map the stored bytes back to floating-point L, u, v.
        const float L = in[0]*(100.f/255.f);
        const float u = fma(convert_float(in[1]), 1.388235294117647f, -134.f);
        const float v = fma(convert_float(in[2]), 1.027450980392157f, - 140.f);

        float Y = (L + 16.f) * (1.f/116.f);
        Y = Y*Y*Y;

        // Undo the scaling by 13*L and add the reference offsets back.
        const float d = (1.f/13.f)/L;
        const float up = fma(u, d, _un);
        const float vp = fma(v, d, _vn);
        const float iv = 1.f/vp;

        const float X = 2.25f * up * Y * iv ;
        const float Z = (12 - fma(3.0f, up, 20.0f * vp)) * Y * 0.25f * iv;

        float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
        float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
        float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));

        R = clamp(R, 0.f, 1.f);
        G = clamp(G, 0.f, 1.f);
        B = clamp(B, 0.f, 1.f);

#ifdef SRGB
        R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
        B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif

        const uchar c0 = SAT_CAST(R * 255.0f);
        const uchar c1 = SAT_CAST(G * 255.0f);
        const uchar c2 = SAT_CAST(B * 255.0f);

#if dcn == 4
        *(__global uchar4 *)out = (uchar4)(c0, c1, c2, MAX_NUM);
#else
        out[0] = c0;
        out[1] = c1;
        out[2] = c2;
#endif

        ++y;
        out += dst_step;
        in += src_step;
    }
}
2126
2127 #endif