#define hrange 0
#endif
+#if bidx == 0
+#define R_COMP z
+#define G_COMP y
+#define B_COMP x
+#elif bidx == 2
+#define R_COMP x
+#define G_COMP y
+#define B_COMP z
+#elif bidx == 3
+// The only kernel that uses bidx == 3 doesn't use these macros.
+// But we still need to make the compiler happy.
+#define R_COMP w
+#define G_COMP w
+#define B_COMP w
+#endif
+
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+
+#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
+
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
__kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset,
__global uchar* dstptr, int dststep, int dstoffset,
int rows, int cols)
{
-#if 1
- const int x = get_global_id(0);
- const int y = get_global_id(1);
-
- if (y < rows && x < cols)
- {
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
-#ifdef DEPTH_5
- dst[0] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f;
-#else
- dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift);
-#endif
- }
-#else
- const int x_min = get_global_id(0)*STRIPE_SIZE;
- const int x_max = min(x_min + STRIPE_SIZE, cols);
- const int y = get_global_id(1);
+ int x = get_global_id(0);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if( y < rows )
+ if (x < cols)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr +
- mad24(y, srcstep, srcoffset)) + x_min*scn;
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset));
- int x;
- for( x = x_min; x < x_max; x++, src += scn )
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ DATA_TYPE_4 src_pix = vload4(0, src);
#ifdef DEPTH_5
- dst[x] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f;
+ dst[0] = src_pix.B_COMP * 0.114f + src_pix.G_COMP * 0.587f + src_pix.R_COMP * 0.299f;
#else
- dst[x] = (DATA_TYPE)(mad24(src[bidx], B2Y, mad24(src[1], G2Y,
- mad24(src[(bidx^2)], R2Y, 1 << (yuv_shift-1)))) >> yuv_shift);
+ dst[0] = (DATA_TYPE)CV_DESCALE((src_pix.B_COMP * B2Y + src_pix.G_COMP * G2Y + src_pix.R_COMP * R2Y), yuv_shift);
#endif
+ }
+ ++y;
+ }
}
-#endif
}
__kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
__global uchar* dstptr, int dststep, int dstoffset,
int rows, int cols)
{
- const int x = get_global_id(0);
- const int y = get_global_id(1);
+ int x = get_global_id(0);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
- DATA_TYPE val = src[0];
- dst[0] = dst[1] = dst[2] = val;
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ DATA_TYPE val = src[0];
+ dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
- dst[3] = MAX_NUM;
+ dst[3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
- DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2];
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ DATA_TYPE_4 src_pix = vload4(0, src);
+ DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP;
#ifdef DEPTH_5
- __constant float * coeffs = c_RGB2YUVCoeffs_f;
- const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2];
- const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX;
- const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX;
+ __constant float * coeffs = c_RGB2YUVCoeffs_f;
+ const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2];
+ const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX;
+ const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX;
#else
- __constant int * coeffs = c_RGB2YUVCoeffs_i;
- const int delta = HALF_MAX * (1 << yuv_shift);
- const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift);
- const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift);
- const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift);
+ __constant int * coeffs = c_RGB2YUVCoeffs_i;
+ const int delta = HALF_MAX * (1 << yuv_shift);
+ const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift);
+ const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift);
+ const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift);
#endif
- dst[0] = SAT_CAST( Y );
- dst[1] = SAT_CAST( U );
- dst[2] = SAT_CAST( V );
+ dst[0] = SAT_CAST( Y );
+ dst[1] = SAT_CAST( U );
+ dst[2] = SAT_CAST( V );
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
- DATA_TYPE Y = src[0], U = src[1], V = src[2];
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ DATA_TYPE_4 src_pix = vload4(0, src);
+ DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;
#ifdef DEPTH_5
- __constant float * coeffs = c_YUV2RGBCoeffs_f;
- const float r = Y + (V - HALF_MAX) * coeffs[3];
- const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
- const float b = Y + (U - HALF_MAX) * coeffs[0];
+ __constant float * coeffs = c_YUV2RGBCoeffs_f;
+ const float r = Y + (V - HALF_MAX) * coeffs[3];
+ const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
+ const float b = Y + (U - HALF_MAX) * coeffs[0];
#else
- __constant int * coeffs = c_YUV2RGBCoeffs_i;
- const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift);
- const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
- const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift);
+ __constant int * coeffs = c_YUV2RGBCoeffs_i;
+ const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift);
+ const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
+ const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift);
#endif
- dst[bidx] = SAT_CAST( b );
- dst[1] = SAT_CAST( g );
- dst[bidx^2] = SAT_CAST( r );
+ dst[bidx] = SAT_CAST( b );
+ dst[1] = SAT_CAST( g );
+ dst[bidx^2] = SAT_CAST( r );
#if dcn == 4
- dst[3] = MAX_NUM;
+ dst[3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows / 2 && x < cols / 2 )
+ if (x < cols / 2)
{
- __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset);
- __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset);
- __global uchar* dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset);
- __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset);
-
- int Y1 = ysrc[0];
- int Y2 = ysrc[1];
- int Y3 = ysrc[srcstep];
- int Y4 = ysrc[srcstep + 1];
-
- int U = usrc[0] - 128;
- int V = usrc[1] - 128;
-
- int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V;
- int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
- int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;
-
- Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
- dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT);
- dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT);
- dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows / 2 )
+ {
+ __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset);
+ __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset);
+ __global uchar* dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset);
+ __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset);
+
+ int Y1 = ysrc[0];
+ int Y2 = ysrc[1];
+ int Y3 = ysrc[srcstep];
+ int Y4 = ysrc[srcstep + 1];
+
+ int U = usrc[0] - 128;
+ int V = usrc[1] - 128;
+
+ int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V;
+ int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
+ int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;
+
+ Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
+ dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT);
+ dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT);
+ dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4
- dst1[3] = 255;
+ dst1[3] = 255;
#endif
- Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY;
- dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT);
- dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT);
- dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT);
+ Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY;
+ dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT);
+ dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT);
+ dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4
- dst1[7] = 255;
+ dst1[7] = 255;
#endif
- Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY;
- dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT);
- dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT);
- dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT);
+ Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY;
+ dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT);
+ dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT);
+ dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4
- dst2[3] = 255;
+ dst2[3] = 255;
#endif
- Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY;
- dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT);
- dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT);
- dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT);
+ Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY;
+ dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT);
+ dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT);
+ dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT);
#if dcn == 4
- dst2[7] = 255;
+ dst2[7] = 255;
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
- DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2];
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ DATA_TYPE_4 src_pix = vload4(0, src);
+ DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP;
#ifdef DEPTH_5
- __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
- DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0];
- DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX;
- DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX;
+ __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
+ DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0];
+ DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX;
+ DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX;
#else
- __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
- int delta = HALF_MAX * (1 << yuv_shift);
- int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift);
- int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift);
- int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift);
+ __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
+ int delta = HALF_MAX * (1 << yuv_shift);
+ int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift);
+ int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift);
+ int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift);
#endif
- dst[0] = SAT_CAST( Y );
- dst[1] = SAT_CAST( Cr );
- dst[2] = SAT_CAST( Cb );
+ dst[0] = SAT_CAST( Y );
+ dst[1] = SAT_CAST( Cr );
+ dst[2] = SAT_CAST( Cb );
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx);
- __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx);
+ __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx);
- DATA_TYPE y = srcptr[0], cr = srcptr[1], cb = srcptr[2];
+ DATA_TYPE_4 src_pix = vload4(0, srcptr);
+ DATA_TYPE y = src_pix.x, cr = src_pix.y, cb = src_pix.z;
#ifdef DEPTH_5
- __constant float * coeff = c_YCrCb2RGBCoeffs_f;
- float r = y + coeff[0] * (cr - HALF_MAX);
- float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX);
- float b = y + coeff[3] * (cb - HALF_MAX);
+ __constant float * coeff = c_YCrCb2RGBCoeffs_f;
+ float r = y + coeff[0] * (cr - HALF_MAX);
+ float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX);
+ float b = y + coeff[3] * (cb - HALF_MAX);
#else
- __constant int * coeff = c_YCrCb2RGBCoeffs_i;
- int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
- int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift);
- int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
+ __constant int * coeff = c_YCrCb2RGBCoeffs_i;
+ int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
+ int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift);
+ int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
#endif
- dstptr[(bidx^2)] = SAT_CAST(r);
- dstptr[1] = SAT_CAST(g);
- dstptr[bidx] = SAT_CAST(b);
+ dstptr[(bidx^2)] = SAT_CAST(r);
+ dstptr[1] = SAT_CAST(g);
+ dstptr[bidx] = SAT_CAST(b);
#if dcn == 4
- dstptr[3] = MAX_NUM;
+ dstptr[3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols, __constant COEFF_TYPE * coeffs)
{
int dx = get_global_id(0);
- int dy = get_global_id(1);
+ int dy = get_global_id(1) * PIX_PER_WI_Y;
- if (dy < rows && dx < cols)
+ if (dx < cols)
{
- int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
- int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (dy < rows)
+ {
+ int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
+ int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
- __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
- __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+ __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
+ __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
- DATA_TYPE r = src[0], g = src[1], b = src[2];
+ DATA_TYPE_4 src_pix = vload4(0, src);
+ DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;
#ifdef DEPTH_5
- float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2];
- float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
- float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
+ float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2];
+ float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
+ float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
#else
- int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
- int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
- int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
+ int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
+ int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
+ int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
#endif
- dst[0] = SAT_CAST(x);
- dst[1] = SAT_CAST(y);
- dst[2] = SAT_CAST(z);
+ dst[0] = SAT_CAST(x);
+ dst[1] = SAT_CAST(y);
+ dst[2] = SAT_CAST(z);
+ }
+ ++dy;
+ }
}
}
int rows, int cols, __constant COEFF_TYPE * coeffs)
{
int dx = get_global_id(0);
- int dy = get_global_id(1);
+ int dy = get_global_id(1) * PIX_PER_WI_Y;
- if (dy < rows && dx < cols)
+ if (dx < cols)
{
- int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
- int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (dy < rows)
+ {
+ int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
+ int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
- __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
- __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+ __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
+ __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
- DATA_TYPE x = src[0], y = src[1], z = src[2];
+ DATA_TYPE_4 src_pix = vload4(0, src);
+ DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;
#ifdef DEPTH_5
- float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2];
- float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
- float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
+ float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2];
+ float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
+ float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
#else
- int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
- int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
- int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
+ int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
+ int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
+ int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
#endif
- dst[0] = SAT_CAST(b);
- dst[1] = SAT_CAST(g);
- dst[2] = SAT_CAST(r);
+ dst[0] = SAT_CAST(b);
+ dst[1] = SAT_CAST(g);
+ dst[2] = SAT_CAST(r);
#if dcn == 4
- dst[3] = MAX_NUM;
+ dst[3] = MAX_NUM;
#endif
+ }
+ ++dy;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
- __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+ __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
+ __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+ DATA_TYPE_4 src_pix = vload4(0, src);
#ifdef REVERSE
- dst[0] = src[2];
- dst[1] = src[1];
- dst[2] = src[0];
+ dst[0] = src_pix.z;
+ dst[1] = src_pix.y;
+ dst[2] = src_pix.x;
#else
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
+ dst[0] = src_pix.x;
+ dst[1] = src_pix.y;
+ dst[2] = src_pix.z;
#endif
#if dcn == 4
#if scn == 3
- dst[3] = MAX_NUM;
+ dst[3] = MAX_NUM;
#else
- dst[3] = src[3];
+ dst[3] = src[3];
#endif
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- ushort t = *((__global const ushort*)(src + src_idx));
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ ushort t = *((__global const ushort*)(src + src_idx));
#if greenbits == 6
- dst[dst_idx + bidx] = (uchar)(t << 3);
- dst[dst_idx + 1] = (uchar)((t >> 3) & ~3);
- dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7);
+ dst[dst_idx + bidx] = (uchar)(t << 3);
+ dst[dst_idx + 1] = (uchar)((t >> 3) & ~3);
+ dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7);
#else
- dst[dst_idx + bidx] = (uchar)(t << 3);
- dst[dst_idx + 1] = (uchar)((t >> 2) & ~7);
- dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7);
+ dst[dst_idx + bidx] = (uchar)(t << 3);
+ dst[dst_idx + 1] = (uchar)((t >> 2) & ~7);
+ dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7);
#endif
#if dcn == 4
#if greenbits == 6
- dst[dst_idx + 3] = 255;
+ dst[dst_idx + 3] = 255;
#else
- dst[dst_idx + 3] = t & 0x8000 ? 255 : 0;
+ dst[dst_idx + 3] = t & 0x8000 ? 255 : 0;
#endif
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ uchar4 src_pix = vload4(0, src + src_idx);
#if greenbits == 6
- *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~3) << 3)|((src[src_idx + (bidx^2)]&~7) << 8));
+ *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3
- *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)|((src[src_idx + (bidx^2)]&~7) << 7));
+ *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else
- *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)|
- ((src[src_idx + (bidx^2)]&~7) << 7)|(src[src_idx + 3] ? 0x8000 : 0));
+ *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
+ ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x);
- int t = *((__global const ushort*)(src + src_idx));
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x);
+ int t = *((__global const ushort*)(src + src_idx));
#if greenbits == 6
- dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
- ((t >> 3) & 0xfc)*G2Y +
- ((t >> 8) & 0xf8)*R2Y, yuv_shift);
+ dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
+ ((t >> 3) & 0xfc)*G2Y +
+ ((t >> 8) & 0xf8)*R2Y, yuv_shift);
#else
- dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
- ((t >> 2) & 0xf8)*G2Y +
- ((t >> 7) & 0xf8)*R2Y, yuv_shift);
+ dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
+ ((t >> 2) & 0xf8)*G2Y +
+ ((t >> 7) & 0xf8)*R2Y, yuv_shift);
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- int t = src[src_idx];
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ int t = src[src_idx];
#if greenbits == 6
- *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
+ *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else
- t >>= 3;
- *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10));
+ t >>= 3;
+ *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10));
#endif
+ }
+ ++y;
+ }
}
}
__constant int * sdiv_table, __constant int * hdiv_table)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- int b = src[src_idx + bidx], g = src[src_idx + 1], r = src[src_idx + (bidx^2)];
- int h, s, v = b;
- int vmin = b, diff;
- int vr, vg;
-
- v = max( v, g );
- v = max( v, r );
- vmin = min( vmin, g );
- vmin = min( vmin, r );
-
- diff = v - vmin;
- vr = v == r ? -1 : 0;
- vg = v == g ? -1 : 0;
-
- s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
- h = (vr & (g - b)) +
- (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
- h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
- h += h < 0 ? hrange : 0;
-
- dst[dst_idx] = convert_uchar_sat_rte(h);
- dst[dst_idx + 1] = (uchar)s;
- dst[dst_idx + 2] = (uchar)v;
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ uchar4 src_pix = vload4(0, src + src_idx);
+
+ int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
+ int h, s, v = b;
+ int vmin = b, diff;
+ int vr, vg;
+
+ v = max( v, g );
+ v = max( v, r );
+ vmin = min( vmin, g );
+ vmin = min( vmin, r );
+
+ diff = v - vmin;
+ vr = v == r ? -1 : 0;
+ vg = v == g ? -1 : 0;
+
+ s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
+ h = (vr & (g - b)) +
+ (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
+ h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
+ h += h < 0 ? hrange : 0;
+
+ dst[dst_idx] = convert_uchar_sat_rte(h);
+ dst[dst_idx + 1] = (uchar)s;
+ dst[dst_idx + 2] = (uchar)v;
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- float h = src[src_idx], s = src[src_idx + 1]*(1/255.f), v = src[src_idx + 2]*(1/255.f);
- float b, g, r;
-
- if (s != 0)
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
- float tab[4];
- int sector;
- h *= hscale;
- if( h < 0 )
- do h += 6; while( h < 0 );
- else if( h >= 6 )
- do h -= 6; while( h >= 6 );
- sector = convert_int_sat_rtn(h);
- h -= sector;
- if( (unsigned)sector >= 6u )
+ if (y < rows)
{
- sector = 0;
- h = 0.f;
- }
-
- tab[0] = v;
- tab[1] = v*(1.f - s);
- tab[2] = v*(1.f - s*h);
- tab[3] = v*(1.f - s*(1.f - h));
-
- b = tab[sector_data[sector][0]];
- g = tab[sector_data[sector][1]];
- r = tab[sector_data[sector][2]];
- }
- else
- b = g = r = v;
-
- dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
- dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
- dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ uchar4 src_pix = vload4(0, src + src_idx);
+
+ float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
+ float b, g, r;
+
+ if (s != 0)
+ {
+ float tab[4];
+ int sector;
+ h *= hscale;
+ if( h < 0 )
+ do h += 6; while( h < 0 );
+ else if( h >= 6 )
+ do h -= 6; while( h >= 6 );
+ sector = convert_int_sat_rtn(h);
+ h -= sector;
+ if( (unsigned)sector >= 6u )
+ {
+ sector = 0;
+ h = 0.f;
+ }
+
+ tab[0] = v;
+ tab[1] = v*(1.f - s);
+ tab[2] = v*(1.f - s*h);
+ tab[3] = v*(1.f - s*(1.f - h));
+
+ b = tab[sector_data[sector][0]];
+ g = tab[sector_data[sector][1]];
+ r = tab[sector_data[sector][2]];
+ }
+ else
+ b = g = r = v;
+
+ dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
+ dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
+ dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
- dst[dst_idx + 3] = MAX_NUM;
+ dst[dst_idx + 3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
-
- float b = src[bidx], g = src[1], r = src[bidx^2];
- float h, s, v;
-
- float vmin, diff;
-
- v = vmin = r;
- if( v < g ) v = g;
- if( v < b ) v = b;
- if( vmin > g ) vmin = g;
- if( vmin > b ) vmin = b;
-
- diff = v - vmin;
- s = diff/(float)(fabs(v) + FLT_EPSILON);
- diff = (float)(60.f/(diff + FLT_EPSILON));
- if( v == r )
- h = (g - b)*diff;
- else if( v == g )
- h = (b - r)*diff + 120.f;
- else
- h = (r - g)*diff + 240.f;
-
- if( h < 0 ) h += 360.f;
-
- dst[0] = h*hscale;
- dst[1] = s;
- dst[2] = v;
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+ __global const float * src = (__global const float *)(srcptr + src_idx);
+ __global float * dst = (__global float *)(dstptr + dst_idx);
+ float4 src_pix = vload4(0, src);
+
+ float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
+ float h, s, v;
+
+ float vmin, diff;
+
+ v = vmin = r;
+ if( v < g ) v = g;
+ if( v < b ) v = b;
+ if( vmin > g ) vmin = g;
+ if( vmin > b ) vmin = b;
+
+ diff = v - vmin;
+ s = diff/(float)(fabs(v) + FLT_EPSILON);
+ diff = (float)(60.f/(diff + FLT_EPSILON));
+ if( v == r )
+ h = (g - b)*diff;
+ else if( v == g )
+ h = (b - r)*diff + 120.f;
+ else
+ h = (r - g)*diff + 240.f;
+
+ if( h < 0 ) h += 360.f;
+
+ dst[0] = h*hscale;
+ dst[1] = s;
+ dst[2] = v;
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
-
- float h = src[0], s = src[1], v = src[2];
- float b, g, r;
-
- if (s != 0)
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
- float tab[4];
- int sector;
- h *= hscale;
- if(h < 0)
- do h += 6; while (h < 0);
- else if (h >= 6)
- do h -= 6; while (h >= 6);
- sector = convert_int_sat_rtn(h);
- h -= sector;
- if ((unsigned)sector >= 6u)
+ if (y < rows)
{
- sector = 0;
- h = 0.f;
- }
-
- tab[0] = v;
- tab[1] = v*(1.f - s);
- tab[2] = v*(1.f - s*h);
- tab[3] = v*(1.f - s*(1.f - h));
-
- b = tab[sector_data[sector][0]];
- g = tab[sector_data[sector][1]];
- r = tab[sector_data[sector][2]];
- }
- else
- b = g = r = v;
-
- dst[bidx] = b;
- dst[1] = g;
- dst[bidx^2] = r;
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+ __global const float * src = (__global const float *)(srcptr + src_idx);
+ __global float * dst = (__global float *)(dstptr + dst_idx);
+ float4 src_pix = vload4(0, src);
+
+ float h = src_pix.x, s = src_pix.y, v = src_pix.z;
+ float b, g, r;
+
+ if (s != 0)
+ {
+ float tab[4];
+ int sector;
+ h *= hscale;
+ if(h < 0)
+ do h += 6; while (h < 0);
+ else if (h >= 6)
+ do h -= 6; while (h >= 6);
+ sector = convert_int_sat_rtn(h);
+ h -= sector;
+ if ((unsigned)sector >= 6u)
+ {
+ sector = 0;
+ h = 0.f;
+ }
+
+ tab[0] = v;
+ tab[1] = v*(1.f - s);
+ tab[2] = v*(1.f - s*h);
+ tab[3] = v*(1.f - s*(1.f - h));
+
+ b = tab[sector_data[sector][0]];
+ g = tab[sector_data[sector][1]];
+ r = tab[sector_data[sector][2]];
+ }
+ else
+ b = g = r = v;
+
+ dst[bidx] = b;
+ dst[1] = g;
+ dst[bidx^2] = r;
#if dcn == 4
- dst[3] = MAX_NUM;
+ dst[3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- float b = src[src_idx + bidx]*(1/255.f), g = src[src_idx + 1]*(1/255.f), r = src[src_idx + (bidx^2)]*(1/255.f);
- float h = 0.f, s = 0.f, l;
- float vmin, vmax, diff;
-
- vmax = vmin = r;
- if (vmax < g) vmax = g;
- if (vmax < b) vmax = b;
- if (vmin > g) vmin = g;
- if (vmin > b) vmin = b;
-
- diff = vmax - vmin;
- l = (vmax + vmin)*0.5f;
-
- if (diff > FLT_EPSILON)
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
- s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
- diff = 60.f/diff;
-
- if( vmax == r )
- h = (g - b)*diff;
- else if( vmax == g )
- h = (b - r)*diff + 120.f;
- else
- h = (r - g)*diff + 240.f;
-
- if( h < 0.f ) h += 360.f;
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ uchar4 src_pix = vload4(0, src + src_idx);
+
+ float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
+ float h = 0.f, s = 0.f, l;
+ float vmin, vmax, diff;
+
+ vmax = vmin = r;
+ if (vmax < g) vmax = g;
+ if (vmax < b) vmax = b;
+ if (vmin > g) vmin = g;
+ if (vmin > b) vmin = b;
+
+ diff = vmax - vmin;
+ l = (vmax + vmin)*0.5f;
+
+ if (diff > FLT_EPSILON)
+ {
+ s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
+ diff = 60.f/diff;
+
+ if( vmax == r )
+ h = (g - b)*diff;
+ else if( vmax == g )
+ h = (b - r)*diff + 120.f;
+ else
+ h = (r - g)*diff + 240.f;
+
+ if( h < 0.f ) h += 360.f;
+ }
+
+ dst[dst_idx] = convert_uchar_sat_rte(h*hscale);
+ dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f);
+ dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f);
+ }
+ ++y;
}
-
- dst[dst_idx] = convert_uchar_sat_rte(h*hscale);
- dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f);
- dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f);
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- float h = src[src_idx], l = src[src_idx + 1]*(1.f/255.f), s = src[src_idx + 2]*(1.f/255.f);
- float b, g, r;
-
- if (s != 0)
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
- float tab[4];
-
- float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
- float p1 = 2*l - p2;
-
- h *= hscale;
- if( h < 0 )
- do h += 6; while( h < 0 );
- else if( h >= 6 )
- do h -= 6; while( h >= 6 );
-
- int sector = convert_int_sat_rtn(h);
- h -= sector;
-
- tab[0] = p2;
- tab[1] = p1;
- tab[2] = p1 + (p2 - p1)*(1-h);
- tab[3] = p1 + (p2 - p1)*h;
-
- b = tab[sector_data[sector][0]];
- g = tab[sector_data[sector][1]];
- r = tab[sector_data[sector][2]];
- }
- else
- b = g = r = l;
-
- dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
- dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
- dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ uchar4 src_pix = vload4(0, src + src_idx);
+
+ float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
+ float b, g, r;
+
+ if (s != 0)
+ {
+ float tab[4];
+
+ float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
+ float p1 = 2*l - p2;
+
+ h *= hscale;
+ if( h < 0 )
+ do h += 6; while( h < 0 );
+ else if( h >= 6 )
+ do h -= 6; while( h >= 6 );
+
+ int sector = convert_int_sat_rtn(h);
+ h -= sector;
+
+ tab[0] = p2;
+ tab[1] = p1;
+ tab[2] = p1 + (p2 - p1)*(1-h);
+ tab[3] = p1 + (p2 - p1)*h;
+
+ b = tab[sector_data[sector][0]];
+ g = tab[sector_data[sector][1]];
+ r = tab[sector_data[sector][2]];
+ }
+ else
+ b = g = r = l;
+
+ dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
+ dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
+ dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
- dst[dst_idx + 3] = MAX_NUM;
+ dst[dst_idx + 3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
-
- float b = src[bidx], g = src[1], r = src[bidx^2];
- float h = 0.f, s = 0.f, l;
- float vmin, vmax, diff;
-
- vmax = vmin = r;
- if (vmax < g) vmax = g;
- if (vmax < b) vmax = b;
- if (vmin > g) vmin = g;
- if (vmin > b) vmin = b;
-
- diff = vmax - vmin;
- l = (vmax + vmin)*0.5f;
-
- if (diff > FLT_EPSILON)
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
- s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
- diff = 60.f/diff;
-
- if( vmax == r )
- h = (g - b)*diff;
- else if( vmax == g )
- h = (b - r)*diff + 120.f;
- else
- h = (r - g)*diff + 240.f;
-
- if( h < 0.f ) h += 360.f;
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+ __global const float * src = (__global const float *)(srcptr + src_idx);
+ __global float * dst = (__global float *)(dstptr + dst_idx);
+ float4 src_pix = vload4(0, src);
+
+ float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
+ float h = 0.f, s = 0.f, l;
+ float vmin, vmax, diff;
+
+ vmax = vmin = r;
+ if (vmax < g) vmax = g;
+ if (vmax < b) vmax = b;
+ if (vmin > g) vmin = g;
+ if (vmin > b) vmin = b;
+
+ diff = vmax - vmin;
+ l = (vmax + vmin)*0.5f;
+
+ if (diff > FLT_EPSILON)
+ {
+ s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
+ diff = 60.f/diff;
+
+ if( vmax == r )
+ h = (g - b)*diff;
+ else if( vmax == g )
+ h = (b - r)*diff + 120.f;
+ else
+ h = (r - g)*diff + 240.f;
+
+ if( h < 0.f ) h += 360.f;
+ }
+
+ dst[0] = h*hscale;
+ dst[1] = l;
+ dst[2] = s;
+ }
+ ++y;
}
-
- dst[0] = h*hscale;
- dst[1] = l;
- dst[2] = s;
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
-
- float h = src[0], l = src[1], s = src[2];
- float b, g, r;
-
- if (s != 0)
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
- float tab[4];
- int sector;
-
- float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
- float p1 = 2*l - p2;
-
- h *= hscale;
- if( h < 0 )
- do h += 6; while( h < 0 );
- else if( h >= 6 )
- do h -= 6; while( h >= 6 );
-
- sector = convert_int_sat_rtn(h);
- h -= sector;
-
- tab[0] = p2;
- tab[1] = p1;
- tab[2] = p1 + (p2 - p1)*(1-h);
- tab[3] = p1 + (p2 - p1)*h;
-
- b = tab[sector_data[sector][0]];
- g = tab[sector_data[sector][1]];
- r = tab[sector_data[sector][2]];
- }
- else
- b = g = r = l;
-
- dst[bidx] = b;
- dst[1] = g;
- dst[bidx^2] = r;
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+ __global const float * src = (__global const float *)(srcptr + src_idx);
+ __global float * dst = (__global float *)(dstptr + dst_idx);
+ float4 src_pix = vload4(0, src);
+
+ float h = src_pix.x, l = src_pix.y, s = src_pix.z;
+ float b, g, r;
+
+ if (s != 0)
+ {
+ float tab[4];
+ int sector;
+
+ float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
+ float p1 = 2*l - p2;
+
+ h *= hscale;
+ if( h < 0 )
+ do h += 6; while( h < 0 );
+ else if( h >= 6 )
+ do h -= 6; while( h >= 6 );
+
+ sector = convert_int_sat_rtn(h);
+ h -= sector;
+
+ tab[0] = p2;
+ tab[1] = p1;
+ tab[2] = p1 + (p2 - p1)*(1-h);
+ tab[3] = p1 + (p2 - p1)*h;
+
+ b = tab[sector_data[sector][0]];
+ g = tab[sector_data[sector][1]];
+ r = tab[sector_data[sector][2]];
+ }
+ else
+ b = g = r = l;
+
+ dst[bidx] = b;
+ dst[1] = g;
+ dst[bidx^2] = r;
#if dcn == 4
- dst[3] = MAX_NUM;
+ dst[3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- x <<= 2;
- int src_idx = mad24(y, src_step, src_offset + x);
- int dst_idx = mad24(y, dst_step, dst_offset + x);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + (x << 2));
+ int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
+ uchar4 src_pix = vload4(0, src + src_idx);
- uchar v0 = src[src_idx], v1 = src[src_idx + 1];
- uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
+ uchar v0 = src_pix.x, v1 = src_pix.y;
+ uchar v2 = src_pix.z, v3 = src_pix.w;
- dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;
- dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
- dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
- dst[dst_idx + 3] = v3;
+ dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;
+ dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
+ dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
+ dst[dst_idx + 3] = v3;
+ }
+ ++y;
+ }
}
}
int rows, int cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- x <<= 2;
- int src_idx = mad24(y, src_step, src_offset + x);
- int dst_idx = mad24(y, dst_step, dst_offset + x);
-
- uchar v0 = src[src_idx], v1 = src[src_idx + 1];
- uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
- uchar v3_half = v3 / 2;
-
- dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
- dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
- dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
- dst[dst_idx + 3] = v3;
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + (x << 2));
+ int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
+ uchar4 src_pix = vload4(0, src + src_idx);
+
+ uchar v0 = src_pix.x, v1 = src_pix.y;
+ uchar v2 = src_pix.z, v3 = src_pix.w;
+ uchar v3_half = v3 / 2;
+
+ dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
+ dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
+ dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
+ dst[dst_idx + 3] = v3;
+ }
+ ++y;
+ }
}
}
__constant int * coeffs, int Lscale, int Lshift)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- src += src_idx;
- dst += dst_idx;
+ __global const uchar* src_ptr = src + src_idx;
+ __global uchar* dst_ptr = dst + dst_idx;
+ uchar4 src_pix = vload4(0, src_ptr);
- int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
- C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
- C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+ int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+ C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+ C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
- int R = gammaTab[src[0]], G = gammaTab[src[1]], B = gammaTab[src[2]];
- int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
- int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
- int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
+ int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
+ int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
+ int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
+ int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
- int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
- int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
- int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
+ int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
+ int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
+ int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
- dst[0] = SAT_CAST(L);
- dst[1] = SAT_CAST(a);
- dst[2] = SAT_CAST(b);
+ dst_ptr[0] = SAT_CAST(L);
+ dst_ptr[1] = SAT_CAST(a);
+ dst_ptr[2] = SAT_CAST(b);
+ }
+ ++y;
+ }
}
}
__constant float * coeffs, float _1_3, float _a)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ __global const float * src = (__global const float *)(srcptr + src_idx);
+ __global float * dst = (__global float *)(dstptr + dst_idx);
+ float4 src_pix = vload4(0, src);
- float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
- C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
- C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+ float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+ C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+ C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
- float R = clamp(src[0], 0.0f, 1.0f);
- float G = clamp(src[1], 0.0f, 1.0f);
- float B = clamp(src[2], 0.0f, 1.0f);
+ float R = clamp(src_pix.x, 0.0f, 1.0f);
+ float G = clamp(src_pix.y, 0.0f, 1.0f);
+ float B = clamp(src_pix.z, 0.0f, 1.0f);
#ifdef SRGB
- R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
- float X = R*C0 + G*C1 + B*C2;
- float Y = R*C3 + G*C4 + B*C5;
- float Z = R*C6 + G*C7 + B*C8;
+ float X = R*C0 + G*C1 + B*C2;
+ float Y = R*C3 + G*C4 + B*C5;
+ float Z = R*C6 + G*C7 + B*C8;
- float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a);
- float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a);
- float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a);
+ float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a);
+ float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a);
+ float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a);
- float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
- float a = 500.f * (FX - FY);
- float b = 200.f * (FY - FZ);
+ float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
+ float a = 500.f * (FX - FY);
+ float b = 200.f * (FY - FZ);
- dst[0] = L;
- dst[1] = a;
- dst[2] = b;
+ dst[0] = L;
+ dst[1] = a;
+ dst[2] = b;
+ }
+ ++y;
+ }
}
}
__constant float * coeffs, float lThresh, float fThresh)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- src += src_idx;
- dst += dst_idx;
+ __global const uchar* src_ptr = src + src_idx;
+ __global uchar* dst_ptr = dst + dst_idx;
+ uchar4 src_pix = vload4(0, src_ptr);
- float srcbuf[3], dstbuf[3];
- srcbuf[0] = src[0]*(100.f/255.f);
- srcbuf[1] = convert_float(src[1] - 128);
- srcbuf[2] = convert_float(src[2] - 128);
+ float srcbuf[3], dstbuf[3];
+ srcbuf[0] = src_pix.x*(100.f/255.f);
+ srcbuf[1] = convert_float(src_pix.y - 128);
+ srcbuf[2] = convert_float(src_pix.z - 128);
- Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+ Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
- gammaTab,
+ gammaTab,
#endif
- coeffs, lThresh, fThresh);
+ coeffs, lThresh, fThresh);
- dst[0] = SAT_CAST(dstbuf[0] * 255.0f);
- dst[1] = SAT_CAST(dstbuf[1] * 255.0f);
- dst[2] = SAT_CAST(dstbuf[2] * 255.0f);
+ dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
+ dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
+ dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
#if dcn == 4
- dst[3] = MAX_NUM;
+ dst_ptr[3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}
__constant float * coeffs, float lThresh, float fThresh)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (y < rows && x < cols)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ {
+ if (y < rows)
+ {
+ int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+ int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ __global const float * src = (__global const float *)(srcptr + src_idx);
+ __global float * dst = (__global float *)(dstptr + dst_idx);
+ float4 src_pix = vload4(0, src);
- float srcbuf[3], dstbuf[3];
- srcbuf[0] = src[0], srcbuf[1] = src[1], srcbuf[2] = src[2];
+ float srcbuf[3], dstbuf[3];
+ srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;
- Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+ Lab2BGR_f(&srcbuf[0], &dstbuf[0],
#ifdef SRGB
- gammaTab,
+ gammaTab,
#endif
- coeffs, lThresh, fThresh);
+ coeffs, lThresh, fThresh);
- dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
+ dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
#if dcn == 4
- dst[3] = MAX_NUM;
+ dst[3] = MAX_NUM;
#endif
+ }
+ ++y;
+ }
}
}