1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Jia Haipeng, jiahaipeng95@gmail.com
19 // Peng Xiao, pengxiao@multicorewareinc.com
21 // Redistribution and use in source and binary forms, with or without modification,
22 // are permitted provided that the following conditions are met:
24 // * Redistribution's of source code must retain the above copyright notice,
25 // this list of conditions and the following disclaimer.
27 // * Redistribution's in binary form must reproduce the above copyright notice,
28 // this list of conditions and the following disclaimer in the documentation
29 // and/or other materials provided with the distribution.
31 // * The name of the copyright holders may not be used to endorse or promote products
32 // derived from this software without specific prior written permission.
34 // This software is provided by the copyright holders and contributors as is and
35 // any express or implied warranties, including, but not limited to, the implied
36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
37 // In no event shall the Intel Corporation or contributors be liable for any direct,
38 // indirect, incidental, special, exemplary, or consequential damages
39 // (including, but not limited to, procurement of substitute goods or services;
40 // loss of use, data, or profits; or business interruption) however caused
41 // and on any theory of liability, whether in contract, strict liability,
42 // or tort (including negligence or otherwise) arising in any way out of
43 // the use of this software, even if advised of the possibility of such damage.
// Depth-dependent type configuration. NOTE(review): the #if/#elif/#else/#endif
// directives that select exactly one of the three branches below are missing
// from this excerpt; as written the defines would collide. Comments describe
// the intended per-depth selection only.
47 /**************************************PUBLICFUNC*************************************/
// CV_8U branch: 8-bit unsigned pixels, integer fixed-point coefficients,
// saturating cast back to uchar.
50 #define DATA_TYPE uchar
53 #define COEFF_TYPE int
54 #define SAT_CAST(num) convert_uchar_sat(num)
// CV_16U branch: 16-bit unsigned pixels; HALF_MAX is the chroma mid-point
// (half of the 16-bit range).
57 #define DATA_TYPE ushort
59 #define HALF_MAX 32768
60 #define COEFF_TYPE int
61 #define SAT_CAST(num) convert_ushort_sat(num)
// CV_32F branch: float pixels, float coefficients, no saturation needed.
64 #define DATA_TYPE float
67 #define COEFF_TYPE float
68 #define SAT_CAST(num) (num)
// Any other depth is rejected at OpenCL program build time.
71 #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
// Fixed-point rounding right-shift: adds half of 2^n before shifting so the
// result is rounded to nearest instead of truncated.
74 #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
// Bytes per source / destination pixel (scn and dcn are the channel counts,
// presumably injected as build options — TODO confirm against the host code).
87 #define scnbytes ((int)sizeof(DATA_TYPE)*scn)
88 #define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
107 // The only kernel that uses bidx == 3 doesn't use these macros.
108 // But we still need to make the compiler happy.
123 #define PIX_PER_WI_X 1
// Token-pasting helpers used to build the 4-element vector type name
// (e.g. DATA_TYPE_4 expands to uchar4 / ushort4 / float4).
126 #define __CAT(x, y) x##y
127 #define CAT(x, y) __CAT(x, y)
129 #define DATA_TYPE_4 CAT(DATA_TYPE, 4)
131 ///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
// RGB[A] -> single-channel gray using BT.601 luma weights.
// One work-item handles column x for PIX_PER_WI_Y consecutive rows.
// NOTE(review): braces, the rows/cols parameters, bounds checks and the
// #if/#else separating the two dst[0] stores are missing from this excerpt.
133 __kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
134 __global uchar * dstptr, int dst_step, int dst_offset,
137 int x = get_global_id(0);
138 int y = get_global_id(1) * PIX_PER_WI_Y;
// Byte offsets of the first pixel this work-item touches.
142 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
143 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
146 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
150 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
151 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
152 DATA_TYPE_4 src_pix = vload4(0, src);
// Float path: fused multiply-adds with 0.299/0.587/0.114 weights.
154 dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f));
// Integer path: fixed-point B2Y/G2Y/R2Y coefficients, rounded descale.
156 dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
// Advance one row for the next iteration.
159 src_index += src_step;
160 dst_index += dst_step;
// Single-channel gray -> RGB[A]: replicates the gray value into all color
// channels; the 4-channel integer path also writes an opaque alpha (MAX_NUM).
// NOTE(review): braces, rows/cols parameters and the #else/#endif around the
// two store variants are missing from this excerpt.
166 __kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
167 __global uchar * dstptr, int dst_step, int dst_offset,
170 int x = get_global_id(0);
171 int y = get_global_id(1) * PIX_PER_WI_Y;
175 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
176 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
179 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
183 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
184 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
185 DATA_TYPE val = src[0];
// 3-channel (or float) destination: scalar stores per channel.
186 #if dcn == 3 || defined DEPTH_5
187 dst[0] = dst[1] = dst[2] = val;
// 4-channel integer destination: single vector store with opaque alpha.
192 *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
195 dst_index += dst_step;
196 src_index += src_step;
202 ///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
// RGB -> YUV coefficients: float tables for DEPTH_5, fixed-point tables
// (B2Y/G2Y/R2Y plus U/V scale factors) for the integer depths.
204 __constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
205 __constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };
// RGB[A] -> YUV. Y is BT.601 luma; U/V are scaled (B-Y)/(R-Y) offsets
// centered at HALF_MAX. NOTE(review): braces and the #if/#else separating the
// float and integer computations are missing from this excerpt.
207 __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset,
208 __global uchar* dstptr, int dst_step, int dt_offset,
211 int x = get_global_id(0);
212 int y = get_global_id(1) * PIX_PER_WI_Y;
216 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
217 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
220 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
224 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
225 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
226 DATA_TYPE_4 src_pix = vload4(0, src);
227 DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
// Float path.
230 __constant float * coeffs = c_RGB2YUVCoeffs_f;
231 const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2]));
232 const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX);
233 const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX);
// Integer path: delta pre-scales the HALF_MAX offset by 2^yuv_shift so it
// survives the final descale.
235 __constant int * coeffs = c_RGB2YUVCoeffs_i;
236 const int delta = HALF_MAX * (1 << yuv_shift);
237 const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift);
238 const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);
239 const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);
242 dst[0] = SAT_CAST( Y );
243 dst[1] = SAT_CAST( U );
244 dst[2] = SAT_CAST( V );
247 dst_index += dst_step;
248 src_index += src_step;
// YUV -> RGB coefficients (float and fixed-point variants).
254 __constant float c_YUV2RGBCoeffs_f[4] = { 2.032f, -0.395f, -0.581f, 1.140f };
255 __constant int c_YUV2RGBCoeffs_i[4] = { 33292, -6472, -9519, 18678 };
// YUV -> RGB[A]. bidx selects BGR vs RGB channel order (b goes to dst[bidx],
// r to dst[bidx^2]). NOTE(review): braces, the #if/#else between the float
// and integer paths, and the alpha store for dcn==4 are missing from this
// excerpt.
257 __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
258 __global uchar* dstptr, int dst_step, int dt_offset,
261 int x = get_global_id(0);
262 int y = get_global_id(1) * PIX_PER_WI_Y;
266 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
267 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
270 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
274 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
275 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
276 DATA_TYPE_4 src_pix = vload4(0, src);
277 DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;
// Float path: chroma recentered around HALF_MAX before applying gains.
280 __constant float * coeffs = c_YUV2RGBCoeffs_f;
281 float r = fma(V - HALF_MAX, coeffs[3], Y);
282 float g = fma(V - HALF_MAX, coeffs[2], fma(U - HALF_MAX, coeffs[1], Y));
283 float b = fma(U - HALF_MAX, coeffs[0], Y);
// Integer path: same math in fixed point with rounded descale.
285 __constant int * coeffs = c_YUV2RGBCoeffs_i;
286 const int r = Y + CV_DESCALE(mul24(V - HALF_MAX, coeffs[3]), yuv_shift);
287 const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], mul24(U - HALF_MAX, coeffs[1])), yuv_shift);
288 const int b = Y + CV_DESCALE(mul24(U - HALF_MAX, coeffs[0]), yuv_shift);
291 dst[bidx] = SAT_CAST( b );
292 dst[1] = SAT_CAST( g );
293 dst[bidx^2] = SAT_CAST( r );
298 dst_index += dst_step;
299 src_index += src_step;
// Video-range (16..235) YUV 4:2:0 -> RGB coefficients shared by the 420
// kernels below: { Y gain, U->B, U->G, V->G, V->R }.
304 __constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f,
305 -0.812999725f, 1.5959997177f };
// Semi-planar NV12/NV21 -> RGB[A]. Each work-item expands one 2x2 luma block
// sharing a single interleaved UV pair; uidx selects U-first (NV12) vs
// V-first (NV21). NOTE(review): braces, rows/cols parameters, the Y1/Y2 loads
// and the alpha stores are missing from this excerpt.
307 __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_offset,
308 __global uchar* dstptr, int dst_step, int dt_offset,
311 int x = get_global_id(0);
312 int y = get_global_id(1) * PIX_PER_WI_Y;
317 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
// Luma plane at (2y, 2x); interleaved UV plane starts after `rows` luma rows.
321 __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
322 __global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
323 __global uchar* dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));
324 __global uchar* dst2 = dst1 + dst_step;
// Second output row's luma samples.
328 float Y3 = ysrc[src_step];
329 float Y4 = ysrc[src_step + 1];
331 float U = ((float)usrc[uidx]) - HALF_MAX;
332 float V = ((float)usrc[1-uidx]) - HALF_MAX;
// Shared chroma contributions (+0.5f for round-to-nearest on conversion).
334 __constant float* coeffs = c_YUV2RGBCoeffs_420;
335 float ruv = fma(coeffs[4], V, 0.5f);
336 float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
337 float buv = fma(coeffs[1], U, 0.5f);
// Each luma sample: clamp video-range black level (16) then scale.
339 Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
340 dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
341 dst1[1] = convert_uchar_sat(Y1 + guv);
342 dst1[bidx] = convert_uchar_sat(Y1 + buv);
347 Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
348 dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
349 dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);
350 dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);
355 Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
356 dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
357 dst2[1] = convert_uchar_sat(Y3 + guv);
358 dst2[bidx] = convert_uchar_sat(Y3 + buv);
363 Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
364 dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
365 dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);
366 dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);
// Planar YV12/IYUV (I420) -> RGB[A]. Like YUV2RGB_NVx but U and V live in
// separate quarter-size planes after the luma plane; uidx selects plane
// order. Two addressing schemes are visible below (contiguous-plane vs
// row-aligned with vsteps), presumably chosen by a missing #if on layout —
// TODO confirm. NOTE(review): braces, rows/cols parameters, Y1/Y2 loads and
// alpha stores are missing from this excerpt.
376 __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
377 __global uchar* dstptr, int dst_step, int dt_offset,
380 int x = get_global_id(0);
381 int y = get_global_id(1) * PIX_PER_WI_Y;
386 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
390 __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
391 __global uchar* dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
392 __global uchar* dst2 = dst1 + dst_step;
396 float Y3 = ysrc[src_step];
397 float Y4 = ysrc[src_step + 1];
// Variant A: U plane directly after luma, V plane rows*cols/4 bytes later.
400 __global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);
401 int u_ind = mad24(y, cols >> 1, x);
402 float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX };
// Variant B: half-width chroma rows packed two-per-src_step row; vsteps
// handles the odd-row offset into the V plane.
404 int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};
405 __global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);
406 __global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
407 float uv[2] = { ((float)usrc[0]) - HALF_MAX, ((float)vsrc[0]) - HALF_MAX };
410 float V = uv[1-uidx];
412 __constant float* coeffs = c_YUV2RGBCoeffs_420;
413 float ruv = fma(coeffs[4], V, 0.5f);
414 float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
415 float buv = fma(coeffs[1], U, 0.5f);
// Expand the 2x2 luma block: clamp black level (16), scale, add chroma.
417 Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
418 dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
419 dst1[1] = convert_uchar_sat(Y1 + guv);
420 dst1[bidx] = convert_uchar_sat(Y1 + buv);
425 Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
426 dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
427 dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);
428 dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);
433 Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
434 dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
435 dst2[1] = convert_uchar_sat(Y3 + guv);
436 dst2[bidx] = convert_uchar_sat(Y3 + buv);
441 Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
442 dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
443 dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);
444 dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);
// Video-range RGB -> YUV 4:2:0 coefficients:
// { Y:r,g,b, U:r,g,b, V:g,b } (V's r term appears as coeffs[5] reuse below).
454 __constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
455 0.438999176f, -0.3679990768f, -0.0709991455f };
// RGB[A] -> planar YV12/IYUV. Each work-item consumes a 2x2 pixel block,
// writes 4 luma samples and one U + one V sample (chroma subsampled from the
// top-left pixel). Two code paths: a vectorized PIX_PER_WI_X == 2 path and a
// scalar fallback. NOTE(review): braces, rows/cols parameters, the y1/y2
// declarations and several #if/#else/#endif lines are missing from this
// excerpt.
457 __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
458 __global uchar* dstptr, int dst_step, int dst_offset,
461 int x = get_global_id(0) * PIX_PER_WI_X;
462 int y = get_global_id(1) * PIX_PER_WI_Y;
466 int src_index = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));
467 int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
// Output is 1.5x the luma height: top 2/3 is Y, bottom third is chroma.
468 int y_rows = rows / 3 * 2;
469 int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
470 __constant float* coeffs = c_RGB2YUVCoeffs_420;
473 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
477 __global const uchar* src1 = srcptr + src_index;
478 __global const uchar* src2 = src1 + src_step;
479 __global uchar* ydst1 = dstptr + ydst_index;
480 __global uchar* ydst2 = ydst1 + dst_step;
// U/V destinations mirror the read-side addressing of YUV2RGB_YV12_IYUV.
482 __global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);
483 __global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);
485 #if PIX_PER_WI_X == 2
// Vector path: load two 2x2 blocks (4 pixels per row) as packed ints.
486 int s11 = *((__global const int*) src1);
487 int s12 = *((__global const int*) src1 + 1);
488 int s13 = *((__global const int*) src1 + 2);
490 int s14 = *((__global const int*) src1 + 3);
492 int s21 = *((__global const int*) src2);
493 int s22 = *((__global const int*) src2 + 1);
494 int s23 = *((__global const int*) src2 + 2);
496 int s24 = *((__global const int*) src2 + 3);
// Unpack to float working arrays.
498 float src_pix1[scn * 4], src_pix2[scn * 4];
500 *((float4*) src_pix1) = convert_float4(as_uchar4(s11));
501 *((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
502 *((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
504 *((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
506 *((float4*) src_pix2) = convert_float4(as_uchar4(s21));
507 *((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
508 *((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
510 *((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
// Luma: fma dot product plus 16.5 (video-range offset + rounding 0.5).
513 y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[ 2-bidx], fma(coeffs[1], src_pix1[ 1], fma(coeffs[2], src_pix1[ bidx], 16.5f))));
514 y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[ scn+2-bidx], fma(coeffs[1], src_pix1[ scn+1], fma(coeffs[2], src_pix1[ scn+bidx], 16.5f))));
515 y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
516 y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
517 y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[ 2-bidx], fma(coeffs[1], src_pix2[ 1], fma(coeffs[2], src_pix2[ bidx], 16.5f))));
518 y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[ scn+2-bidx], fma(coeffs[1], src_pix2[ scn+1], fma(coeffs[2], src_pix2[ scn+bidx], 16.5f))));
519 y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
520 y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
// Store 4 luma bytes per row at once.
522 *((__global int*) ydst1) = as_int(y1);
523 *((__global int*) ydst2) = as_int(y2);
// Chroma from the top-left pixel of each 2x2 block; 128.5 = 128 offset +
// rounding 0.5. uidx swaps U/V for YV12 vs IYUV.
525 float uv[4] = { fma(coeffs[3], src_pix1[ 2-bidx], fma(coeffs[4], src_pix1[ 1], fma(coeffs[5], src_pix1[ bidx], 128.5f))),
526 fma(coeffs[5], src_pix1[ 2-bidx], fma(coeffs[6], src_pix1[ 1], fma(coeffs[7], src_pix1[ bidx], 128.5f))),
527 fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
528 fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
530 udst[0] = convert_uchar_sat(uv[uidx] );
531 vdst[0] = convert_uchar_sat(uv[1 - uidx]);
532 udst[1] = convert_uchar_sat(uv[2 + uidx]);
533 vdst[1] = convert_uchar_sat(uv[3 - uidx]);
// Scalar path: one 2x2 block per work-item.
535 float4 src_pix1 = convert_float4(vload4(0, src1));
536 float4 src_pix2 = convert_float4(vload4(0, src1+scn));
537 float4 src_pix3 = convert_float4(vload4(0, src2));
538 float4 src_pix4 = convert_float4(vload4(0, src2+scn));
540 ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
541 ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
542 ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));
543 ydst2[1] = convert_uchar_sat(fma(coeffs[0], src_pix4.R_COMP, fma(coeffs[1], src_pix4.G_COMP, fma(coeffs[2], src_pix4.B_COMP, 16.5f))));
545 float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),
546 fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };
548 udst[0] = convert_uchar_sat(uv[uidx] );
549 vdst[0] = convert_uchar_sat(uv[1-uidx]);
// Advance two rows (each iteration handled a 2-row block).
552 src_index += 2*src_step;
553 ydst_index += 2*dst_step;
// Packed 4:2:2 (YUY2/UYVY family) -> RGB[A]. Each work-item decodes one
// 4-byte macropixel into two output pixels; yidx/uidx select the byte layout.
// NOTE(review): braces, rows/cols parameters and alpha handling are missing
// from this excerpt.
559 __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,
560 __global uchar* dstptr, int dst_step, int dst_offset,
563 int x = get_global_id(0);
564 int y = get_global_id(1) * PIX_PER_WI_Y;
// x << 2: four source bytes per macropixel; x << 1: two output pixels.
568 __global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);
569 __global uchar* dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));
572 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
576 float U = ((float) src[uidx]) - HALF_MAX;
577 float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX;
// Shared chroma terms for both pixels of the macropixel.
579 __constant float* coeffs = c_YUV2RGBCoeffs_420;
580 float ruv = fma(coeffs[4], V, 0.5f);
581 float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
582 float buv = fma(coeffs[1], U, 0.5f);
// First pixel's luma at src[yidx]; video-range clamp and scale.
584 float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
585 dst[2 - bidx] = convert_uchar_sat(y00 + ruv);
586 dst[1] = convert_uchar_sat(y00 + guv);
587 dst[bidx] = convert_uchar_sat(y00 + buv);
// Second pixel's luma is two bytes later.
591 float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
592 dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
593 dst[dcn + 1] = convert_uchar_sat(y01 + guv);
594 dst[dcn + bidx] = convert_uchar_sat(y01 + buv);
606 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
// RGB -> YCrCb coefficients (note R/G/B order here, opposite of the YUV
// tables above).
608 __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
609 __constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
// RGB[A] -> YCrCb: Y then Cr = 0.713(R-Y), Cb = 0.564(B-Y), both centered at
// HALF_MAX. NOTE(review): braces and the #if/#else between float and integer
// paths are missing from this excerpt.
611 __kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,
612 __global uchar* dstptr, int dst_step, int dt_offset,
615 int x = get_global_id(0);
616 int y = get_global_id(1) * PIX_PER_WI_Y;
620 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
621 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
624 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
628 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
629 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
630 DATA_TYPE_4 src_pix = vload4(0, src);
631 DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
// Float path.
634 __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
635 DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));
636 DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX);
637 DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX);
// Integer path: delta pre-scales HALF_MAX by 2^yuv_shift.
639 __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
640 int delta = HALF_MAX * (1 << yuv_shift);
641 int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);
642 int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);
643 int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);
646 dst[0] = SAT_CAST( Y );
647 dst[1] = SAT_CAST( Cr );
648 dst[2] = SAT_CAST( Cb );
651 dst_index += dst_step;
652 src_index += src_step;
// YCrCb -> RGB coefficients (float and fixed-point variants).
658 __constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };
659 __constant int c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };
// YCrCb -> RGB[A]; inverse of RGB2YCrCb above. bidx^2 places red opposite
// blue for BGR/RGB ordering. NOTE(review): braces, the #if/#else between
// float and integer paths and alpha handling are missing from this excerpt.
661 __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
662 __global uchar* dst, int dst_step, int dst_offset,
665 int x = get_global_id(0);
666 int y = get_global_id(1) * PIX_PER_WI_Y;
670 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
671 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
674 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
678 __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index);
679 __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index);
681 DATA_TYPE_4 src_pix = vload4(0, srcptr);
682 DATA_TYPE yp = src_pix.x, cr = src_pix.y, cb = src_pix.z;
// Float path: chroma recentered around HALF_MAX.
685 __constant float * coeff = c_YCrCb2RGBCoeffs_f;
686 float r = fma(coeff[0], cr - HALF_MAX, yp);
687 float g = fma(coeff[1], cr - HALF_MAX, fma(coeff[2], cb - HALF_MAX, yp));
688 float b = fma(coeff[3], cb - HALF_MAX, yp);
// Integer path: fixed point with rounded descale.
690 __constant int * coeff = c_YCrCb2RGBCoeffs_i;
691 int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
692 int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX, coeff[2] * (cb - HALF_MAX)), yuv_shift);
693 int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
696 dstptr[(bidx^2)] = SAT_CAST(r);
697 dstptr[1] = SAT_CAST(g);
698 dstptr[bidx] = SAT_CAST(b);
704 dst_index += dst_step;
705 src_index += src_step;
711 ///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
// RGB -> CIE XYZ via a 3x3 matrix supplied by the host in `coeffs` (row-major
// r,g,b per output channel). NOTE(review): braces and the #if/#else between
// float and integer (xyz_shift fixed-point) paths are missing from this
// excerpt.
713 __kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
714 __global uchar * dstptr, int dst_step, int dst_offset,
715 int rows, int cols, __constant COEFF_TYPE * coeffs)
717 int dx = get_global_id(0);
718 int dy = get_global_id(1) * PIX_PER_WI_Y;
722 int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
723 int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
726 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
730 __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
731 __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
733 DATA_TYPE_4 src_pix = vload4(0, src);
734 DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;
// Float path: straight matrix multiply.
737 float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
738 float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
739 float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
// Integer path: fixed-point matrix multiply with rounded descale.
741 int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
742 int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
743 int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
745 dst[0] = SAT_CAST(x);
746 dst[1] = SAT_CAST(y);
747 dst[2] = SAT_CAST(z);
750 dst_index += dst_step;
751 src_index += src_step;
// CIE XYZ -> RGB via a host-supplied inverse 3x3 matrix; note output order is
// b, g, r here (matrix rows are arranged for that). NOTE(review): braces, the
// #if/#else between float and integer paths, and the dcn == 3 scalar-store
// branch are missing from this excerpt.
757 __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
758 __global uchar * dstptr, int dst_step, int dst_offset,
759 int rows, int cols, __constant COEFF_TYPE * coeffs)
761 int dx = get_global_id(0);
762 int dy = get_global_id(1) * PIX_PER_WI_Y;
766 int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
767 int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
770 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
774 __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
775 __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
777 DATA_TYPE_4 src_pix = vload4(0, src);
778 DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;
// Float path.
781 float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
782 float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
783 float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
// Integer path: fixed-point with rounded descale.
785 int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
786 int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
787 int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
790 DATA_TYPE dst0 = SAT_CAST(b);
791 DATA_TYPE dst1 = SAT_CAST(g);
792 DATA_TYPE dst2 = SAT_CAST(r);
793 #if dcn == 3 || defined DEPTH_5
// 4-channel integer destination: single vector store with opaque alpha.
801 *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
805 dst_index += dst_step;
806 src_index += src_step;
812 ///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
// RGB[A] <-> BGR[A] channel reorder / channel-count change. The per-channel
// swizzle stores between the vload4 and the index advance are missing from
// this excerpt; only the addressing skeleton is visible.
814 __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
815 __global uchar* dstptr, int dst_step, int dst_offset,
818 int x = get_global_id(0);
819 int y = get_global_id(1) * PIX_PER_WI_Y;
823 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
824 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
827 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
831 __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
832 __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
833 DATA_TYPE_4 src_pix = vload4(0, src);
854 dst_index += dst_step;
855 src_index += src_step;
861 ///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
// Packed 16-bit RGB (565 or 555, selected by a missing #if on greenbits) ->
// 8-bit RGB[A]. The 555 variant's top bit can carry a 1-bit alpha.
// NOTE(review): braces and the conditional directives separating the 565/555
// and alpha branches are missing from this excerpt.
863 __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,
864 __global uchar* dst, int dst_step, int dst_offset,
867 int x = get_global_id(0);
868 int y = get_global_id(1) * PIX_PER_WI_Y;
872 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
873 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
876 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
880 ushort t = *((__global const ushort*)(src + src_index));
// 565 layout: 5 bits blue, 6 bits green, 5 bits red.
883 dst[dst_index + bidx] = (uchar)(t << 3);
884 dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
885 dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
// 555 layout: 5 bits per channel.
887 dst[dst_index + bidx] = (uchar)(t << 3);
888 dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
889 dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
// Alpha for dcn == 4: opaque for 565, 1-bit alpha from the top bit for 555.
894 dst[dst_index + 3] = 255;
896 dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
901 dst_index += dst_step;
902 src_index += src_step;
// 8-bit RGB[A] -> packed 16-bit RGB (565, or 555 with optional 1-bit alpha
// from src_pix.w). NOTE(review): braces and the conditional directives
// choosing among the three packing variants are missing from this excerpt.
908 __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,
909 __global uchar* dst, int dst_step, int dst_offset,
912 int x = get_global_id(0);
913 int y = get_global_id(1) * PIX_PER_WI_Y;
917 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
918 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
921 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
925 uchar4 src_pix = vload4(0, src + src_index);
// 565 packing.
928 *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
// 555 packing without alpha.
930 *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
// 555 packing with 1-bit alpha in the top bit.
932 *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
933 ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
937 dst_index += dst_step;
938 src_index += src_step;
944 ///////////////////////////////////// RGB5x5 <-> Gray //////////////////////////////////////
// Packed 16-bit BGR (565/555) -> 8-bit gray: unpack each channel to 8 bits
// then apply the fixed-point BT.601 luma dot product. NOTE(review): braces
// and the 565-vs-555 conditional are missing from this excerpt.
946 __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,
947 __global uchar* dst, int dst_step, int dst_offset,
950 int x = get_global_id(0);
951 int y = get_global_id(1) * PIX_PER_WI_Y;
955 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
956 int dst_index = mad24(y, dst_step, dst_offset + x);
959 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
963 int t = *((__global const ushort*)(src + src_index));
// 565 unpack: masks 0xf8/0xfc/0xf8 recover the 5/6/5-bit channels.
966 dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
// 555 unpack: 5 bits per channel.
968 dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
971 dst_index += dst_step;
972 src_index += src_step;
// 8-bit gray -> packed 16-bit BGR: replicate the gray value into each
// truncated channel. NOTE(review): braces and the 565-vs-555 conditional are
// missing from this excerpt; the second store appears to expect a
// pre-truncated t (t >> 3) — TODO confirm against the full source.
978 __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,
979 __global uchar* dst, int dst_step, int dst_offset,
982 int x = get_global_id(0);
983 int y = get_global_id(1) * PIX_PER_WI_Y;
987 int src_index = mad24(y, src_step, src_offset + x);
988 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
991 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
995 int t = src[src_index];
// 565 packing from the 8-bit gray value.
998 *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
// 555 packing (5-bit fields at offsets 0, 5, 10).
1001 *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
1004 dst_index += dst_step;
1005 src_index += src_step;
1011 //////////////////////////////////// RGB <-> HSV //////////////////////////////////////
// Hue-sector lookup table mapping sector -> which of tab[0..3] feeds b/g/r in
// HSV2RGB. NOTE(review): only the first of six rows is visible in this
// excerpt.
1013 __constant int sector_data[][3] = { { 1, 3, 0 },
// 8-bit RGB -> HSV using precomputed host-side division tables
// (sdiv_table/hdiv_table) to avoid per-pixel divides; branchless hue-sector
// selection via the vr/vg masks. NOTE(review): braces and the v/vmin/diff
// computation lines are missing from this excerpt.
1022 __kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
1023 __global uchar* dst, int dst_step, int dst_offset,
1025 __constant int * sdiv_table, __constant int * hdiv_table)
1027 int x = get_global_id(0);
1028 int y = get_global_id(1) * PIX_PER_WI_Y;
1032 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1033 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1036 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1040 uchar4 src_pix = vload4(0, src + src_index);
1042 int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
1049 vmin = min(vmin, g);
1050 vmin = min(vmin, r);
// All-ones masks (-1) when v equals r / g, used to pick the hue formula
// without branching.
1053 vr = v == r ? -1 : 0;
1054 vg = v == g ? -1 : 0;
// Saturation via table lookup: s = diff * (2^hsv_shift / v), rounded.
1056 s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;
1057 h = (vr & (g - b)) +
1058 (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
1059 h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
// Wrap negative hue into [0, hrange).
1060 h += h < 0 ? hrange : 0;
1062 dst[dst_index] = convert_uchar_sat_rte(h);
1063 dst[dst_index + 1] = (uchar)s;
1064 dst[dst_index + 2] = (uchar)v;
1067 dst_index += dst_step;
1068 src_index += src_step;
// 8-bit HSV -> RGB[A]: normalize s,v to [0,1], reduce hue into [0,6), pick
// the color sector and interpolate via tab[] + sector_data. NOTE(review):
// braces, the hue prescale, the sector clamp body and tab[0] assignment are
// missing from this excerpt.
1074 __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
1075 __global uchar* dst, int dst_step, int dst_offset,
1078 int x = get_global_id(0);
1079 int y = get_global_id(1) * PIX_PER_WI_Y;
1083 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1084 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1087 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1091 uchar4 src_pix = vload4(0, src + src_index);
1093 float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
// Bring hue into [0, 6) by repeated +/- 6.
1102 do h += 6; while( h < 0 );
1104 do h -= 6; while( h >= 6 );
// Integer sector = floor(h); fractional part h drives the interpolation.
1105 sector = convert_int_sat_rtn(h);
1107 if( (unsigned)sector >= 6u )
// Interpolation endpoints for the current sector.
1114 tab[1] = v*(1.f - s);
1115 tab[2] = v*(1.f - s*h);
1116 tab[3] = v*(1.f - s*(1.f - h));
// sector_data maps sector -> which tab entry feeds each channel.
1118 b = tab[sector_data[sector][0]];
1119 g = tab[sector_data[sector][1]];
1120 r = tab[sector_data[sector][2]];
1125 dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
1126 dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
1127 dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
// Opaque alpha for 4-channel destinations.
1129 dst[dst_index + 3] = MAX_NUM;
1133 dst_index += dst_step;
1134 src_index += src_step;
// Float (CV_32F) specialization of the HSV kernels begins here.
1140 #elif defined DEPTH_5
// float RGB -> HSV: direct arithmetic (no lookup tables); FLT_EPSILON guards
// the divisions for gray/black pixels. NOTE(review): braces, the v/vmax
// initialization, the h branch for v == r, and the h-wrap/store lines are
// missing from this excerpt.
1142 __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,
1143 __global uchar* dstptr, int dst_step, int dst_offset,
1146 int x = get_global_id(0);
1147 int y = get_global_id(1) * PIX_PER_WI_Y;
1151 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1152 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1155 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1159 __global const float * src = (__global const float *)(srcptr + src_index);
1160 __global float * dst = (__global float *)(dstptr + dst_index);
1161 float4 src_pix = vload4(0, src);
1163 float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
1171 if( vmin > g ) vmin = g;
1172 if( vmin > b ) vmin = b;
// Saturation = chroma / value; epsilon avoids divide-by-zero.
1175 s = diff/(float)(fabs(v) + FLT_EPSILON);
// Reuse diff as the 60-degrees-per-unit-chroma hue scale.
1176 diff = (float)(60.f/(diff + FLT_EPSILON));
// Hue branches for v == g and v == b (the v == r branch is not visible).
1180 h = fma(b - r, diff, 120.f);
1182 h = fma(r - g, diff, 240.f);
1192 dst_index += dst_step;
1193 src_index += src_step;
// HSV -> RGB/BGR conversion, 32-bit float depth path.
// Same sector-table algorithm as the 8-bit variant, but h/s/v are used
// directly as floats (no [0,255] rescale).
// NOTE(review): chunk is elided -- bounds checks, the hue rescale into
// sector units, tab[0], the sector_data table and the final stores are in
// lines not visible here.
1199 __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
1200 __global uchar* dstptr, int dst_step, int dst_offset,
1203 int x = get_global_id(0);
1204 int y = get_global_id(1) * PIX_PER_WI_Y;
1208 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1209 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1212 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1217 __global const float * src = (__global const float *)(srcptr + src_index);
1218 __global float * dst = (__global float *)(dstptr + dst_index);
1219 float4 src_pix = vload4(0, src);
1221 float h = src_pix.x, s = src_pix.y, v = src_pix.z;
// Wrap hue into [0, 6) sector units.
1230 do h += 6; while (h < 0);
1232 do h -= 6; while (h >= 6);
// Floor-rounded sector index, then clamp out-of-range edge cases.
1233 sector = convert_int_sat_rtn(h);
1235 if ((unsigned)sector >= 6u)
// Interpolation table (tab[0] initialization is elided).
1242 tab[1] = v*(1.f - s);
1243 tab[2] = v*(1.f - s*h);
1244 tab[3] = v*(1.f - s*(1.f - h));
1246 b = tab[sector_data[sector][0]];
1247 g = tab[sector_data[sector][1]];
1248 r = tab[sector_data[sector][2]];
1261 dst_index += dst_step;
1262 src_index += src_step;
1270 ///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
// RGB -> HLS conversion, 8-bit (uchar) depth path.
// l = (max+min)/2; s depends on which side of 0.5 l falls; h is computed in
// degrees from the dominant channel, then scaled by hscale for 8-bit output.
// NOTE(review): chunk is elided -- bounds checks, vmax/vmin seeding,
// diff = vmax - vmin, the hue scale and vmax==r branch, and the h<0 wrap
// are in lines not visible here.
1274 __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
1275 __global uchar* dst, int dst_step, int dst_offset,
1278 int x = get_global_id(0);
1279 int y = get_global_id(1) * PIX_PER_WI_Y;
1283 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1284 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1287 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1291 uchar4 src_pix = vload4(0, src + src_index);
// Normalize the 8-bit channels to [0,1].
1293 float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
1294 float h = 0.f, s = 0.f, l;
1295 float vmin, vmax, diff;
// Find the max and min channels (seed values come from elided lines).
1298 if (vmax < g) vmax = g;
1299 if (vmax < b) vmax = b;
1300 if (vmin > g) vmin = g;
1301 if (vmin > b) vmin = b;
// Lightness is the mid-point of the channel range.
1304 l = (vmax + vmin)*0.5f;
// Achromatic pixels (diff ~ 0) keep h = s = 0.
1306 if (diff > FLT_EPSILON)
// Saturation formula differs for dark vs light pixels.
1308 s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
// Hue when green is the max channel...
1313 else if( vmax == g )
1314 h = fma(b - r, diff, 120.f);
// ...and when blue is the max channel.
1316 h = fma(r - g, diff, 240.f);
// Pack H (scaled), L and S back into 8 bits with round-to-nearest-even.
1322 dst[dst_index] = convert_uchar_sat_rte(h*hscale);
1323 dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
1324 dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);
1327 dst_index += dst_step;
1328 src_index += src_step;
// HLS -> RGB/BGR conversion, 8-bit (uchar) depth path.
// Builds the classic p1/p2 intermediate values, wraps hue into [0,6) sector
// units, fills a 4-entry table and maps sector/channel to B, G, R via
// sector_data.  bidx selects BGR vs RGB output order.
// NOTE(review): chunk is elided -- bounds checks, the hue rescale,
// tab[0]/tab[1] and the sector_data table are in lines not visible here.
1334 __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
1335 __global uchar* dst, int dst_step, int dst_offset,
1338 int x = get_global_id(0);
1339 int y = get_global_id(1) * PIX_PER_WI_Y;
1343 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1344 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1347 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1351 uchar4 src_pix = vload4(0, src + src_index);
// h stays in its stored scale; l and s are normalized to [0,1].
1353 float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
// p2/p1: the chroma end-points used by HLS reconstruction.
1360 float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
1361 float p1 = 2*l - p2;
// Wrap hue into [0, 6) sector units.
1365 do h += 6; while( h < 0 );
1367 do h -= 6; while( h >= 6 );
1369 int sector = convert_int_sat_rtn(h);
// Interpolate between p1 and p2 for the rising/falling channel.
1374 tab[2] = fma(p2 - p1, 1-h, p1);
1375 tab[3] = fma(p2 - p1, h, p1);
1377 b = tab[sector_data[sector][0]];
1378 g = tab[sector_data[sector][1]];
1379 r = tab[sector_data[sector][2]];
// Rescale to [0,255]; bidx/bidx^2 swap B and R for the requested order.
1384 dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
1385 dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
1386 dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
// 4-channel destinations get an opaque alpha.
1388 dst[dst_index + 3] = MAX_NUM;
1392 dst_index += dst_step;
1393 src_index += src_step;
1399 #elif defined DEPTH_5
// RGB -> HLS conversion, 32-bit float depth path (DEPTH_5).
// Same algorithm as the 8-bit variant but without channel rescaling; the
// negative-hue wrap to [0,360) is explicit here (line 1447).
// NOTE(review): chunk is elided -- bounds checks, vmax/vmin seeding,
// diff = vmax - vmin, the hue scale, the vmax==r branch and the final
// stores are in lines not visible here.
1401 __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,
1402 __global uchar* dstptr, int dst_step, int dst_offset,
1405 int x = get_global_id(0);
1406 int y = get_global_id(1) * PIX_PER_WI_Y;
1410 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1411 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1414 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1418 __global const float * src = (__global const float *)(srcptr + src_index);
1419 __global float * dst = (__global float *)(dstptr + dst_index);
1420 float4 src_pix = vload4(0, src);
1422 float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
1423 float h = 0.f, s = 0.f, l;
1424 float vmin, vmax, diff;
// Find the max and min channels (seed values come from elided lines).
1427 if (vmax < g) vmax = g;
1428 if (vmax < b) vmax = b;
1429 if (vmin > g) vmin = g;
1430 if (vmin > b) vmin = b;
1433 l = (vmax + vmin)*0.5f;
// Achromatic pixels keep h = s = 0.
1435 if (diff > FLT_EPSILON)
1437 s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
// Hue branches for green-max and blue-max channels (r-max case elided).
1442 else if( vmax == g )
1443 h = fma(b - r, diff, 120.f);
1445 h = fma(r - g, diff, 240.f);
// Keep hue in [0, 360) degrees.
1447 if( h < 0.f ) h += 360.f;
1455 dst_index += dst_step;
1456 src_index += src_step;
// HLS -> RGB/BGR conversion, 32-bit float depth path.
// Same p1/p2 sector-table reconstruction as the 8-bit variant, with h/l/s
// used directly as floats.
// NOTE(review): chunk is elided -- bounds checks, the hue rescale,
// tab[0]/tab[1], the sector_data table and the final stores are in lines
// not visible here.
1462 __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,
1463 __global uchar* dstptr, int dst_step, int dst_offset,
1466 int x = get_global_id(0);
1467 int y = get_global_id(1) * PIX_PER_WI_Y;
1471 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1472 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1475 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1479 __global const float * src = (__global const float *)(srcptr + src_index);
1480 __global float * dst = (__global float *)(dstptr + dst_index);
1481 float4 src_pix = vload4(0, src);
1483 float h = src_pix.x, l = src_pix.y, s = src_pix.z;
// Chroma end-points for the reconstruction.
1491 float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
1492 float p1 = 2*l - p2;
// Wrap hue into [0, 6) sector units.
1496 do h += 6; while( h < 0 );
1498 do h -= 6; while( h >= 6 );
1500 sector = convert_int_sat_rtn(h);
// Interpolate between p1 and p2 for the rising/falling channel.
1505 tab[2] = fma(p2 - p1, 1-h, p1);
1506 tab[3] = fma(p2 - p1, h, p1);
1508 b = tab[sector_data[sector][0]];
1509 g = tab[sector_data[sector][1]];
1510 r = tab[sector_data[sector][2]];
1523 dst_index += dst_step;
1524 src_index += src_step;
1532 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
// RGBA -> mRGBA: premultiply each color channel by alpha.
// out = (c * a + HALF_MAX) / MAX_NUM, i.e. rounded integer scaling by
// a/255; alpha itself is passed through unchanged.
// NOTE(review): chunk is elided -- the kernel's opening brace, bounds
// checks and closing braces are in lines not visible here.
1536 __kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,
1537 __global uchar* dst, int dst_step, int dst_offset,
1540 int x = get_global_id(0);
1541 int y = get_global_id(1) * PIX_PER_WI_Y;
// x << 2: four bytes per RGBA pixel.
1545 int src_index = mad24(y, src_step, src_offset + (x << 2));
1546 int dst_index = mad24(y, dst_step, dst_offset + (x << 2));
1549 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1553 uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
// HALF_MAX implements round-to-nearest in the integer division.
1555 *(__global uchar4 *)(dst + dst_index) =
1556 (uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX) / MAX_NUM,
1557 mad24(src_pix.y, src_pix.w, HALF_MAX) / MAX_NUM,
1558 mad24(src_pix.z, src_pix.w, HALF_MAX) / MAX_NUM, src_pix.w);
1561 dst_index += dst_step;
1562 src_index += src_step;
// mRGBA -> RGBA: undo alpha premultiplication.
// out = (c * MAX_NUM + a/2) / a (rounded); a zero-alpha pixel is written as
// all zeros (the division would otherwise be undefined).
// NOTE(review): chunk is elided -- the v3 == 0 branch condition guarding
// line 1589 and the surrounding braces are in lines not visible here.
1568 __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,
1569 __global uchar* dst, int dst_step, int dst_offset,
1572 int x = get_global_id(0);
1573 int y = get_global_id(1) * PIX_PER_WI_Y;
1577 int src_index = mad24(y, src_step, mad24(x, 4, src_offset));
1578 int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));
1581 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1585 uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
// v3_half provides round-to-nearest in the integer division below.
1586 uchar v3 = src_pix.w, v3_half = v3 / 2;
// Zero-alpha pixel: output all zeros (avoids divide-by-zero).
1589 *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);
1591 *(__global uchar4 *)(dst + dst_index) =
1592 (uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,
1593 mad24(src_pix.y, MAX_NUM, v3_half) / v3,
1594 mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);
1597 dst_index += dst_step;
1598 src_index += src_step;
1606 /////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////
// Fixed-point shift parameters for the integer Lab conversion path.
1608 #define lab_shift xyz_shift
// Extra precision bits carried by the gamma lookup table.
1609 #define gamma_shift 3
// Combined shift used when descaling gamma-corrected fixed-point products.
1610 #define lab_shift2 (lab_shift + gamma_shift)
// Number of spline intervals in the sRGB gamma table.
1611 #define GAMMA_TAB_SIZE 1024
// Scale factor mapping a [0,1] value onto gamma-table index space.
1612 #define GammaTabScale (float)GAMMA_TAB_SIZE
// Evaluate a cubic spline stored as 4 coefficients per interval: returns
// tab[0] + x*(tab[1] + x*(tab[2] + x*tab[3])) in Horner form via fma.
// NOTE(review): chunk is elided -- as shown, ix is computed but unused; the
// elided lines presumably subtract ix from x and advance tab to the
// selected interval's coefficient group (verify against the full file).
1614 inline float splineInterpolate(float x, __global const float * tab, int n)
// Clamp the floor of x to a valid interval index [0, n-1].
1616 int ix = clamp(convert_int_sat_rtn(x), 0, n-1);
1619 return fma(fma(fma(tab[3], x, tab[2]), x, tab[1]), x, tab[0]);
// BGR -> CIE Lab conversion, 8-bit fixed-point path.
// Channels are gamma-corrected via the ushort gammaTab, transformed to XYZ
// with the 3x3 integer coeffs matrix, pushed through the cube-root table
// LabCbrtTab_b, then combined into L, a, b with CV_DESCALE fixed-point
// rounding.  Lscale/Lshift fold the 116*f(Y) - 16 lightness formula into
// one multiply-add.
// NOTE(review): chunk is elided -- the kernel's opening brace, bounds
// checks and closing braces are in lines not visible here.
1624 __kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
1625 __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
1626 __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
1627 __constant int * coeffs, int Lscale, int Lshift)
1629 int x = get_global_id(0);
1630 int y = get_global_id(1) * PIX_PER_WI_Y;
1634 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1635 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1638 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1642 __global const uchar* src_ptr = src + src_index;
1643 __global uchar* dst_ptr = dst + dst_index;
1644 uchar4 src_pix = vload4(0, src_ptr);
// Load the 3x3 XYZ conversion matrix into registers.
1646 int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1647 C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1648 C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
// Gamma-correct each channel through the fixed-point lookup table.
1650 int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
// f(X), f(Y), f(Z): cube-root-like transfer via table lookup.
1651 int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
1652 int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
1653 int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];
// L = Lscale*fY + Lshift (fixed-point); a/b are offset by 128 to fit uchar.
1655 int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
1656 int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
1657 int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );
1659 dst_ptr[0] = SAT_CAST(L);
1660 dst_ptr[1] = SAT_CAST(a);
1661 dst_ptr[2] = SAT_CAST(b);
1664 dst_index += dst_step;
1665 src_index += src_step;
1671 #elif defined DEPTH_5
// BGR -> CIE Lab conversion, 32-bit float path (DEPTH_5).
// Optionally gamma-corrects (the #ifdef guarding the splineInterpolate
// calls is elided), converts to XYZ via the float coeffs matrix, applies
// the piecewise f() transfer (cube root above the 0.008856 CIE threshold,
// linear 7.787*t + _a below it), and forms L, a, b.
// _1_3 is passed by the host as 1/3 (unused in the visible lines); _a is
// the linear-segment offset 16/116.  NOTE(review): chunk is elided --
// bounds checks and the final stores of L/a/b are in lines not visible.
1673 __kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
1674 __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
1676 __global const float * gammaTab,
1678 __constant float * coeffs, float _1_3, float _a)
1680 int x = get_global_id(0);
1681 int y = get_global_id(1) * PIX_PER_WI_Y;
1685 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1686 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1689 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1693 __global const float * src = (__global const float *)(srcptr + src_index);
1694 __global float * dst = (__global float *)(dstptr + dst_index);
1695 float4 src_pix = vload4(0, src);
1697 float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1698 C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1699 C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
// Clamp inputs to [0,1] before table lookup / matrix transform.
1701 float R = clamp(src_pix.x, 0.0f, 1.0f);
1702 float G = clamp(src_pix.y, 0.0f, 1.0f);
1703 float B = clamp(src_pix.z, 0.0f, 1.0f);
// sRGB linearization via the shared spline gamma table.
1706 R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1707 G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1708 B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
// Linear RGB -> XYZ.
1711 float X = fma(R, C0, fma(G, C1, B*C2));
1712 float Y = fma(R, C3, fma(G, C4, B*C5));
1713 float Z = fma(R, C6, fma(G, C7, B*C8));
// CIE piecewise transfer: cube root above threshold, linear below.
1715 float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
1716 float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
1717 float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);
// L uses the 903.3*Y approximation for very dark colors.
1719 float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
1720 float a = 500.f * (FX - FY);
1721 float b = 200.f * (FY - FZ);
1728 dst_index += dst_step;
1729 src_index += src_step;
// Shared Lab -> linear-BGR helper used by both depth paths.
// srcbuf = {L, a, b}; dstbuf receives {r, g, b} in [0,1] (optionally
// re-gamma'd through gammaTab).  lThresh/fThresh are the CIE piecewise
// thresholds for the inverse transfer function.
// NOTE(review): chunk is elided -- the li <= lThresh branch computing y
// from 903.3f and its else computing y = fy^3, plus the helper's braces,
// are in lines not visible here.
1737 inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
1739 __global const float * gammaTab,
1741 __constant float * coeffs, float lThresh, float fThresh)
1743 float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];
1745 float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
1746 C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
1747 C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
// Low-lightness branch: fy from the linear segment (inverse of 7.787*y + 16/116).
1753 fy = fma(7.787f, y, 16.0f / 116.0f);
// High-lightness branch: fy = (L + 16)/116.
1757 fy = (li + 16.0f) / 116.0f;
// fx and fz recovered from a and b around fy.
1761 float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
// Invert the piecewise transfer for X and Z.
1764 for (int j = 0; j < 2; j++)
1765 if (fxz[j] <= fThresh)
1766 fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
1768 fxz[j] = fxz[j] * fxz[j] * fxz[j];
1770 float x = fxz[0], z = fxz[1];
// XYZ -> linear RGB, clamped to [0,1].
1771 float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
1772 float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
1773 float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);
// Optional sRGB gamma encoding (guarding #ifdef is elided).
1776 ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1777 go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1778 bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1781 dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
// Lab -> BGR conversion, 8-bit path.
// Unpacks 8-bit Lab (L scaled to [0,100], a/b offset by 128), delegates to
// Lab2BGR_f, then repacks to 8-bit; the 3- vs 4-channel store variants are
// selected by an elided #if on dcn.
// NOTE(review): chunk is elided -- bounds checks, the gammaTab argument
// forwarding between lines 1815 and 1819, and braces are not visible here.
1786 __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
1787 __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
1789 __global const float * gammaTab,
1791 __constant float * coeffs, float lThresh, float fThresh)
1793 int x = get_global_id(0);
1794 int y = get_global_id(1) * PIX_PER_WI_Y;
1798 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1799 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1802 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1806 __global const uchar* src_ptr = src + src_index;
1807 __global uchar * dst_ptr = dst + dst_index;
1808 uchar4 src_pix = vload4(0, src_ptr);
// Unpack: L in [0,100]; a and b recentred around 0 by subtracting 128.
1810 float srcbuf[3], dstbuf[3];
1811 srcbuf[0] = src_pix.x*(100.f/255.f);
1812 srcbuf[1] = convert_float(src_pix.y - 128);
1813 srcbuf[2] = convert_float(src_pix.z - 128);
1815 Lab2BGR_f(&srcbuf[0], &dstbuf[0],
1819 coeffs, lThresh, fThresh);
// 3-channel store...
1822 dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
1823 dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
1824 dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
// ...or 4-channel store with opaque alpha (branch selector is elided).
1826 *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
1827 SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
1830 dst_index += dst_step;
1831 src_index += src_step;
1837 #elif defined DEPTH_5
// Lab -> BGR conversion, 32-bit float path (DEPTH_5).
// Copies the float Lab triple into a private buffer, delegates to
// Lab2BGR_f, and writes the result back as floats.
// NOTE(review): chunk is elided -- bounds checks, the gammaTab forwarding
// between lines 1866 and 1870, the optional 4th-channel store and braces
// are in lines not visible here.
1839 __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
1840 __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
1842 __global const float * gammaTab,
1844 __constant float * coeffs, float lThresh, float fThresh)
1846 int x = get_global_id(0);
1847 int y = get_global_id(1) * PIX_PER_WI_Y;
1851 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1852 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1855 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1859 __global const float * src = (__global const float *)(srcptr + src_index);
1860 __global float * dst = (__global float *)(dstptr + dst_index);
1861 float4 src_pix = vload4(0, src);
1863 float srcbuf[3], dstbuf[3];
1864 srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;
1866 Lab2BGR_f(&srcbuf[0], &dstbuf[0],
1870 coeffs, lThresh, fThresh);
1872 dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
1877 dst_index += dst_step;
1878 src_index += src_step;
1886 /////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////
// Number of spline intervals in the float cube-root table used by Luv.
1888 #define LAB_CBRT_TAB_SIZE 1024
// Size of the fixed-point (byte-indexed) cube-root table variant.
1889 #define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
// Maps a Y value in [0, 1.5] onto cube-root-table index space.
1891 __constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
// BGR -> CIE Luv conversion, 32-bit float path.
// Optionally linearizes via gammaTab, converts to XYZ, gets L from the
// cube-root spline table (L = 116*f(Y) - 16), then u and v from the
// chromaticity denominator d = 52 / max(X + 15Y + 3Z, eps) with the
// white-point offsets _un/_vn.
// NOTE(review): chunk is elided -- bounds checks, the #ifdef around the
// gamma calls, the final stores of L/u/v and braces are not visible here.
1895 __kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
1896 __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
1898 __global const float * gammaTab,
1900 __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
1902 int x = get_global_id(0);
1903 int y = get_global_id(1) * PIX_PER_WI_Y;
1907 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
1908 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1911 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
1914 __global const float * src = (__global const float *)(srcptr + src_index);
1915 __global float * dst = (__global float *)(dstptr + dst_index);
1917 float R = src[0], G = src[1], B = src[2];
// Optional sRGB linearization (guarding #ifdef is elided).
1920 R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1921 G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1922 B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
// Linear RGB -> XYZ.
1924 float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
1925 float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
1926 float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
// L = 116 * cbrt-like(Y) - 16, via the spline table.
1928 float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
1929 L = fma(116.f, L, -16.f);
// d = 52 / (X + 15Y + 3Z), guarded against a zero denominator.
1931 float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
// u = 13L(u' - un), v = 13L(v' - vn), with constants folded into d.
1932 float u = L*fma(X, d, -_un);
1933 float v = L*fma(2.25f, Y*d, -_vn);
1940 dst_index += dst_step;
1941 src_index += src_step;
1946 #elif defined DEPTH_0
// BGR -> CIE Luv conversion, 8-bit path (DEPTH_0).
// Same math as the float variant, but inputs are normalized from [0,255]
// and the L/u/v results are affinely packed back into uchar range:
// L*255/100, u and v shifted/scaled by the literal constants on lines
// 1987-1988 so their working ranges fit [0,255].
// NOTE(review): chunk is elided -- bounds checks, the #ifdef around the
// gamma calls, the loop's pointer advance and braces are not visible here.
1948 __kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
1949 __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
1951 __global const float * gammaTab,
1953 __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
1955 int x = get_global_id(0);
1956 int y = get_global_id(1) * PIX_PER_WI_Y;
// Advance the base pointers directly instead of keeping separate indices.
1960 src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
1961 dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
1964 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
// Normalize 8-bit channels to [0,1].
1967 float scale = 1.0f / 255.0f;
1968 float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;
// Optional sRGB linearization (guarding #ifdef is elided).
1971 R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1972 G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1973 B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
1975 float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
1976 float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
1977 float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
// Spline-table cube root for L (the 116x - 16 step is in elided lines).
1979 float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
// 4*13 = 52: same chromaticity denominator as the float path.
1982 float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
1983 float u = L*(X*d - _un);
1984 float v = L*fma(2.25f, Y*d, -_vn);
// Pack into uchar: L in [0,100] -> [0,255]; u/v affinely remapped.
1986 dst[0] = SAT_CAST(L * 2.55f);
1987 dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
1988 dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));
// Luv -> BGR conversion, 32-bit float path.
// Recovers Y from L (Y = ((L+16)/116)^3 -- the cubing is in elided lines),
// X and Z from u/v via iv = 1/v' style intermediates, converts XYZ to
// linear RGB, clamps, and optionally applies the gamma spline.
// NOTE(review): chunk is elided -- bounds checks, the computation of iv
// from u/v/_un/_vn between lines 2024 and 2030, the final stores and
// braces are in lines not visible here.
2001 __kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
2002 __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
2004 __global const float * gammaTab,
2006 __constant float * coeffs, float _un, float _vn)
2008 int x = get_global_id(0);
2009 int y = get_global_id(1) * PIX_PER_WI_Y;
2013 int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
2014 int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
2017 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
2020 __global const float * src = (__global const float *)(srcptr + src_index);
2021 __global float * dst = (__global float *)(dstptr + dst_index);
2023 float L = src[0], u = src[1], v = src[2], d, X, Y, Z;
// Inverse lightness transfer: (L + 16)/116 (cubed in elided lines).
2024 Y = (L + 16.f) * (1.f/116.f);
// X and Z from the u'/v' chromaticity relations (iv computed in elided lines).
2030 X = 2.25f * u * Y * iv;
2031 Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv;
// XYZ -> linear RGB.
2033 float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
2034 float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
2035 float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
2037 R = clamp(R, 0.f, 1.f);
2038 G = clamp(G, 0.f, 1.f);
2039 B = clamp(B, 0.f, 1.f);
// Optional sRGB gamma encoding (guarding #ifdef is elided).
2042 R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2043 G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2044 B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2054 dst_index += dst_step;
2055 src_index += src_step;
2060 #elif defined DEPTH_0
// Luv -> BGR conversion, 8-bit path (DEPTH_0).
// Unpacks 8-bit L/u/v with the inverse of the BGR2Luv packing constants
// (L*100/255; u,v affinely remapped back to their working ranges), runs
// the same XYZ reconstruction as the float path, and repacks to uchar.
// NOTE(review): this kernel continues past the end of the visible chunk
// (loop advance / 3-channel store variant not shown); also elided are the
// bounds checks, the iv computation and the L-to-Y cubing.
2062 __kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
2063 __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
2065 __global const float * gammaTab,
2067 __constant float * coeffs, float _un, float _vn)
2069 int x = get_global_id(0);
2070 int y = get_global_id(1) * PIX_PER_WI_Y;
// Advance the base pointers directly instead of keeping separate indices.
2074 src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
2075 dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
2078 for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
// Unpack: inverse of the BGR2Luv 8-bit packing constants.
2082 float L = src[0]*(100.f/255.f);
2083 float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
2084 float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);
// Inverse lightness transfer (cubing of Y is in elided lines).
2085 Y = (L + 16.f) * (1.f/116.f);
// X and Z from the chromaticity relations (iv computed in elided lines).
2091 X = 2.25f * u * Y * iv ;
2092 Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv;
// XYZ -> linear RGB, clamped to [0,1].
2094 float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
2095 float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
2096 float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
2098 R = clamp(R, 0.f, 1.f);
2099 G = clamp(G, 0.f, 1.f);
2100 B = clamp(B, 0.f, 1.f);
// Optional sRGB gamma encoding (guarding #ifdef is elided).
2103 R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2104 G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
2105 B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
// Repack to uchar with saturation.
2108 uchar dst0 = SAT_CAST(R * 255.0f);
2109 uchar dst1 = SAT_CAST(G * 255.0f);
2110 uchar dst2 = SAT_CAST(B * 255.0f);
// 4-channel store variant with opaque alpha (3-channel variant not shown).
2113 *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);