#error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
#endif
-#ifndef STRIPE_SIZE
-#define STRIPE_SIZE 1
-#endif
-
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
enum
///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
-__kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset,
- __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,
+ __global uchar * dstptr, int dst_step, int dst_offset,
int rows, int cols)
{
int x = get_global_id(0);
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
DATA_TYPE_4 src_pix = vload4(0, src);
#ifdef DEPTH_5
- dst[0] = src_pix.B_COMP * 0.114f + src_pix.G_COMP * 0.587f + src_pix.R_COMP * 0.299f;
+ dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f));
#else
- dst[0] = (DATA_TYPE)CV_DESCALE((src_pix.B_COMP * B2Y + src_pix.G_COMP * G2Y + src_pix.R_COMP * R2Y), yuv_shift);
+ dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, src_pix.R_COMP * R2Y)), yuv_shift);
#endif
+ ++y;
+ src_index += src_step;
+ dst_index += dst_step;
}
- ++y;
}
}
}
-__kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
- __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,
+ __global uchar * dstptr, int dst_step, int dst_offset,
int rows, int cols)
{
int x = get_global_id(0);
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
DATA_TYPE val = src[0];
+#if dcn == 3 || defined DEPTH_5
dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
dst[3] = MAX_NUM;
#endif
+#else
+ *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);
+#endif
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
__constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
__constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };
-__kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset,
- __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset,
+ __global uchar* dstptr, int dst_step, int dt_offset,
int rows, int cols)
{
int x = get_global_id(0);
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
DATA_TYPE_4 src_pix = vload4(0, src);
- DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP;
+ DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
#ifdef DEPTH_5
__constant float * coeffs = c_RGB2YUVCoeffs_f;
- const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2];
- const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX;
- const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX;
+ const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2]));
+ const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX);
+ const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX);
#else
__constant int * coeffs = c_RGB2YUVCoeffs_i;
const int delta = HALF_MAX * (1 << yuv_shift);
- const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift);
- const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift);
- const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift);
+ const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], r * coeffs[2])), yuv_shift);
+ const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);
+ const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);
#endif
dst[0] = SAT_CAST( Y );
dst[1] = SAT_CAST( U );
dst[2] = SAT_CAST( V );
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
__constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };
__constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 };
-__kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
- __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
+ __global uchar* dstptr, int dst_step, int dt_offset,
int rows, int cols)
{
int x = get_global_id(0);
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
DATA_TYPE_4 src_pix = vload4(0, src);
DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;
#ifdef DEPTH_5
__constant float * coeffs = c_YUV2RGBCoeffs_f;
- const float r = Y + (V - HALF_MAX) * coeffs[3];
- const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
- const float b = Y + (U - HALF_MAX) * coeffs[0];
+ float r = fma(V - HALF_MAX, coeffs[3], Y);
+ float g = fma(V - HALF_MAX, coeffs[2], fma(U - HALF_MAX, coeffs[1], Y));
+ float b = fma(U - HALF_MAX, coeffs[0], Y);
#else
__constant int * coeffs = c_YUV2RGBCoeffs_i;
const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift);
- const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
+ const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], (U - HALF_MAX) * coeffs[1]), yuv_shift);
const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift);
#endif
#if dcn == 4
dst[3] = MAX_NUM;
#endif
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
__constant int ITUR_BT_601_CVR = 1673527;
__constant int ITUR_BT_601_SHIFT = 20;
-__kernel void YUV2RGB_NV12(__global const uchar* srcptr, int srcstep, int srcoffset,
- __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void YUV2RGB_NV12(__global const uchar* srcptr, int src_step, int src_offset,
+ __global uchar* dstptr, int dst_step, int dt_offset,
int rows, int cols)
{
int x = get_global_id(0);
{
if (y < rows / 2 )
{
- __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset);
- __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset);
- __global uchar* dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset);
- __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset);
+ __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
+ __global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
+ __global uchar* dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
+ __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dst_step, x * (dcn<<1) + dt_offset);
int Y1 = ysrc[0];
int Y2 = ysrc[1];
- int Y3 = ysrc[srcstep];
- int Y4 = ysrc[srcstep + 1];
+ int Y3 = ysrc[src_step];
+ int Y4 = ysrc[src_step + 1];
int U = usrc[0] - 128;
int V = usrc[1] - 128;
__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
__constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
-__kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset,
- __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,
+ __global uchar* dstptr, int dst_step, int dt_offset,
int rows, int cols)
{
int x = get_global_id(0);
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
- __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+ __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
+ __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
DATA_TYPE_4 src_pix = vload4(0, src);
- DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP;
+ DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
#ifdef DEPTH_5
__constant float * coeffs = c_RGB2YCrCbCoeffs_f;
- DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0];
- DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX;
- DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX;
+ DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));
+ DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX);
+ DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX);
#else
__constant int * coeffs = c_RGB2YCrCbCoeffs_i;
int delta = HALF_MAX * (1 << yuv_shift);
- int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift);
- int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift);
- int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift);
+ int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], r * coeffs[0])), yuv_shift);
+ int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);
+ int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);
#endif
dst[0] = SAT_CAST( Y );
dst[1] = SAT_CAST( Cr );
dst[2] = SAT_CAST( Cb );
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx);
- __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx);
+ __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index);
+ __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index);
DATA_TYPE_4 src_pix = vload4(0, srcptr);
DATA_TYPE y = src_pix.x, cr = src_pix.y, cb = src_pix.z;
#ifdef DEPTH_5
__constant float * coeff = c_YCrCb2RGBCoeffs_f;
- float r = y + coeff[0] * (cr - HALF_MAX);
- float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX);
- float b = y + coeff[3] * (cb - HALF_MAX);
+ float r = fma(coeff[0], cr - HALF_MAX, y);
+ float g = fma(coeff[1], cr - HALF_MAX, fma(coeff[2], cb - HALF_MAX, y));
+ float b = fma(coeff[3], cb - HALF_MAX, y);
#else
__constant int * coeff = c_YCrCb2RGBCoeffs_i;
int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);
- int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift);
+ int g = y + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX, coeff[2] * (cb - HALF_MAX)), yuv_shift);
int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);
#endif
#if dcn == 4
dstptr[3] = MAX_NUM;
#endif
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (dx < cols)
{
+ int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
+ int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (dy < rows)
{
- int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
- int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
-
- __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
- __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+ __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
+ __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
DATA_TYPE_4 src_pix = vload4(0, src);
DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;
#ifdef DEPTH_5
- float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2];
- float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
- float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
+ float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
+ float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
+ float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
#else
- int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
- int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
- int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
+ int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
+ int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
+ int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
#endif
dst[0] = SAT_CAST(x);
dst[1] = SAT_CAST(y);
dst[2] = SAT_CAST(z);
+
+ ++dy;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++dy;
}
}
}
if (dx < cols)
{
+ int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
+ int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (dy < rows)
{
- int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes);
- int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes);
-
- __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
- __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+ __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
+ __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
DATA_TYPE_4 src_pix = vload4(0, src);
DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;
#ifdef DEPTH_5
- float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2];
- float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
- float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
+ float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
+ float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
+ float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
#else
- int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
- int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
- int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
+ int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
+ int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
+ int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
#endif
- dst[0] = SAT_CAST(b);
- dst[1] = SAT_CAST(g);
- dst[2] = SAT_CAST(r);
+
+ DATA_TYPE dst0 = SAT_CAST(b);
+ DATA_TYPE dst1 = SAT_CAST(g);
+ DATA_TYPE dst2 = SAT_CAST(r);
+#if dcn == 3 || defined DEPTH_5
+ dst[0] = dst0;
+ dst[1] = dst1;
+ dst[2] = dst2;
#if dcn == 4
dst[3] = MAX_NUM;
#endif
+#else
+ *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
+#endif
+
+ ++dy;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++dy;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx);
- __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx);
+ __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
+ __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
DATA_TYPE_4 src_pix = vload4(0, src);
#ifdef REVERSE
dst[3] = src[3];
#endif
#endif
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- ushort t = *((__global const ushort*)(src + src_idx));
+ ushort t = *((__global const ushort*)(src + src_index));
#if greenbits == 6
- dst[dst_idx + bidx] = (uchar)(t << 3);
- dst[dst_idx + 1] = (uchar)((t >> 3) & ~3);
- dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7);
+ dst[dst_index + bidx] = (uchar)(t << 3);
+ dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
+ dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
#else
- dst[dst_idx + bidx] = (uchar)(t << 3);
- dst[dst_idx + 1] = (uchar)((t >> 2) & ~7);
- dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7);
+ dst[dst_index + bidx] = (uchar)(t << 3);
+ dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
+ dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
#endif
#if dcn == 4
#if greenbits == 6
- dst[dst_idx + 3] = 255;
+ dst[dst_index + 3] = 255;
#else
- dst[dst_idx + 3] = t & 0x8000 ? 255 : 0;
+ dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
#endif
#endif
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- uchar4 src_pix = vload4(0, src + src_idx);
+ uchar4 src_pix = vload4(0, src + src_index);
#if greenbits == 6
- *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
+ *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3
- *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
+ *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else
- *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
+ *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));
#endif
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, dst_offset + x);
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x);
- int t = *((__global const ushort*)(src + src_idx));
+ int t = *((__global const ushort*)(src + src_index));
#if greenbits == 6
- dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
- ((t >> 3) & 0xfc)*G2Y +
- ((t >> 8) & 0xf8)*R2Y, yuv_shift);
+ dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);
#else
- dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
- ((t >> 2) & 0xf8)*G2Y +
- ((t >> 7) & 0xf8)*R2Y, yuv_shift);
+ dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);
#endif
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, src_offset + x);
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- int t = src[src_idx];
+ int t = src[src_index];
#if greenbits == 6
- *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
+ *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else
t >>= 3;
- *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10));
+ *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));
#endif
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- uchar4 src_pix = vload4(0, src + src_idx);
+ uchar4 src_pix = vload4(0, src + src_index);
int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
int h, s, v = b;
int vmin = b, diff;
int vr, vg;
- v = max( v, g );
- v = max( v, r );
- vmin = min( vmin, g );
- vmin = min( vmin, r );
+ v = max(v, g);
+ v = max(v, r);
+ vmin = min(vmin, g);
+ vmin = min(vmin, r);
diff = v - vmin;
vr = v == r ? -1 : 0;
vg = v == g ? -1 : 0;
- s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
+ s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;
h = (vr & (g - b)) +
- (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
- h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
+ (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
+ h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
h += h < 0 ? hrange : 0;
- dst[dst_idx] = convert_uchar_sat_rte(h);
- dst[dst_idx + 1] = (uchar)s;
- dst[dst_idx + 2] = (uchar)v;
+ dst[dst_index] = convert_uchar_sat_rte(h);
+ dst[dst_index + 1] = (uchar)s;
+ dst[dst_index + 2] = (uchar)v;
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- uchar4 src_pix = vload4(0, src + src_idx);
+ uchar4 src_pix = vload4(0, src + src_index);
float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);
float b, g, r;
else
b = g = r = v;
- dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
- dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
- dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
+ dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
+ dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
+ dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
- dst[dst_idx + 3] = MAX_NUM;
+ dst[dst_index + 3] = MAX_NUM;
#endif
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ __global const float * src = (__global const float *)(srcptr + src_index);
+ __global float * dst = (__global float *)(dstptr + dst_index);
float4 src_pix = vload4(0, src);
float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
if( v == r )
h = (g - b)*diff;
else if( v == g )
- h = (b - r)*diff + 120.f;
+ h = fma(b - r, diff, 120.f);
else
- h = (r - g)*diff + 240.f;
+ h = fma(r - g, diff, 240.f);
- if( h < 0 ) h += 360.f;
+ if( h < 0 )
+ h += 360.f;
dst[0] = h*hscale;
dst[1] = s;
dst[2] = v;
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ __global const float * src = (__global const float *)(srcptr + src_index);
+ __global float * dst = (__global float *)(dstptr + dst_index);
float4 src_pix = vload4(0, src);
float h = src_pix.x, s = src_pix.y, v = src_pix.z;
#if dcn == 4
dst[3] = MAX_NUM;
#endif
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- uchar4 src_pix = vload4(0, src + src_idx);
+ uchar4 src_pix = vload4(0, src + src_index);
float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);
float h = 0.f, s = 0.f, l;
if( vmax == r )
h = (g - b)*diff;
else if( vmax == g )
- h = (b - r)*diff + 120.f;
+ h = fma(b - r, diff, 120.f);
else
- h = (r - g)*diff + 240.f;
+ h = fma(r - g, diff, 240.f);
- if( h < 0.f ) h += 360.f;
+ if( h < 0.f )
+ h += 360.f;
}
- dst[dst_idx] = convert_uchar_sat_rte(h*hscale);
- dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f);
- dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f);
+ dst[dst_index] = convert_uchar_sat_rte(h*hscale);
+ dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
+ dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
- uchar4 src_pix = vload4(0, src + src_idx);
+ uchar4 src_pix = vload4(0, src + src_index);
float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);
float b, g, r;
tab[0] = p2;
tab[1] = p1;
- tab[2] = p1 + (p2 - p1)*(1-h);
- tab[3] = p1 + (p2 - p1)*h;
+ tab[2] = fma(p2 - p1, 1-h, p1);
+ tab[3] = fma(p2 - p1, h, p1);
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]];
else
b = g = r = l;
- dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
- dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
- dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
+ dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
+ dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
+ dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
- dst[dst_idx + 3] = MAX_NUM;
+ dst[dst_index + 3] = MAX_NUM;
#endif
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ __global const float * src = (__global const float *)(srcptr + src_index);
+ __global float * dst = (__global float *)(dstptr + dst_index);
float4 src_pix = vload4(0, src);
float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;
if( vmax == r )
h = (g - b)*diff;
else if( vmax == g )
- h = (b - r)*diff + 120.f;
+ h = fma(b - r, diff, 120.f);
else
- h = (r - g)*diff + 240.f;
+ h = fma(r - g, diff, 240.f);
if( h < 0.f ) h += 360.f;
}
dst[0] = h*hscale;
dst[1] = l;
dst[2] = s;
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ __global const float * src = (__global const float *)(srcptr + src_index);
+ __global float * dst = (__global float *)(dstptr + dst_index);
float4 src_pix = vload4(0, src);
float h = src_pix.x, l = src_pix.y, s = src_pix.z;
tab[0] = p2;
tab[1] = p1;
- tab[2] = p1 + (p2 - p1)*(1-h);
- tab[3] = p1 + (p2 - p1)*h;
+ tab[2] = fma(p2 - p1, 1-h, p1);
+ tab[3] = fma(p2 - p1, h, p1);
b = tab[sector_data[sector][0]];
g = tab[sector_data[sector][1]];
#if dcn == 4
dst[3] = MAX_NUM;
#endif
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, src_offset + (x << 2));
+ int dst_index = mad24(y, dst_step, dst_offset + (x << 2));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
- int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
- uchar4 src_pix = vload4(0, src + src_idx);
+ uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
- uchar v0 = src_pix.x, v1 = src_pix.y;
- uchar v2 = src_pix.z, v3 = src_pix.w;
+ *(__global uchar4 *)(dst + dst_index) =
+ (uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX) / MAX_NUM,
+ mad24(src_pix.y, src_pix.w, HALF_MAX) / MAX_NUM,
+ mad24(src_pix.z, src_pix.w, HALF_MAX) / MAX_NUM, src_pix.w);
- dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;
- dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
- dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
- dst[dst_idx + 3] = v3;
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, 4, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
- int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
- uchar4 src_pix = vload4(0, src + src_idx);
-
- uchar v0 = src_pix.x, v1 = src_pix.y;
- uchar v2 = src_pix.z, v3 = src_pix.w;
- uchar v3_half = v3 / 2;
-
- dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
- dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
- dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
- dst[dst_idx + 3] = v3;
+ uchar4 src_pix = *(__global const uchar4 *)(src + src_index);
+ uchar v3 = src_pix.w, v3_half = v3 / 2;
+
+ if (v3 == 0)
+ *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);
+ else
+ *(__global uchar4 *)(dst + dst_index) =
+ (uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,
+ mad24(src_pix.y, MAX_NUM, v3_half) / v3,
+ mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
{
int ix = clamp(convert_int_sat_rtn(x), 0, n-1);
x -= ix;
- tab += ix*4;
- return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
+ tab += ix << 2;
+ return fma(fma(fma(tab[3], x, tab[2]), x, tab[1]), x, tab[0]);
}
#ifdef DEPTH_0
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const uchar* src_ptr = src + src_idx;
- __global uchar* dst_ptr = dst + dst_idx;
+ __global const uchar* src_ptr = src + src_index;
+ __global uchar* dst_ptr = dst + dst_index;
uchar4 src_pix = vload4(0, src_ptr);
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
- int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
- int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
- int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
+ int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
+ int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
+ int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];
int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
- int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
- int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
+ int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
+ int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );
dst_ptr[0] = SAT_CAST(L);
dst_ptr[1] = SAT_CAST(a);
dst_ptr[2] = SAT_CAST(b);
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ __global const float * src = (__global const float *)(srcptr + src_index);
+ __global float * dst = (__global float *)(dstptr + dst_index);
float4 src_pix = vload4(0, src);
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
- float X = R*C0 + G*C1 + B*C2;
- float Y = R*C3 + G*C4 + B*C5;
- float Z = R*C6 + G*C7 + B*C8;
+ float X = fma(R, C0, fma(G, C1, B*C2));
+ float Y = fma(R, C3, fma(G, C4, B*C5));
+ float Z = fma(R, C6, fma(G, C7, B*C8));
- float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a);
- float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a);
- float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a);
+ float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
+ float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
+ float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);
- float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
+ float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
float a = 500.f * (FX - FY);
float b = 200.f * (FY - FZ);
dst[0] = L;
dst[1] = a;
dst[2] = b;
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (li <= lThresh)
{
y = li / 903.3f;
- fy = 7.787f * y + 16.0f / 116.0f;
+ fy = fma(7.787f, y, 16.0f / 116.0f);
}
else
{
float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
+ #pragma unroll
for (int j = 0; j < 2; j++)
if (fxz[j] <= fThresh)
fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
fxz[j] = fxz[j] * fxz[j] * fxz[j];
float x = fxz[0], z = fxz[1];
- float ro = clamp(C0 * x + C1 * y + C2 * z, 0.0f, 1.0f);
- float go = clamp(C3 * x + C4 * y + C5 * z, 0.0f, 1.0f);
- float bo = clamp(C6 * x + C7 * y + C8 * z, 0.0f, 1.0f);
+ float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
+ float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
+ float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);
#ifdef SRGB
ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const uchar* src_ptr = src + src_idx;
- __global uchar* dst_ptr = dst + dst_idx;
+ __global const uchar* src_ptr = src + src_index;
+ __global uchar * dst_ptr = dst + dst_index;
uchar4 src_pix = vload4(0, src_ptr);
float srcbuf[3], dstbuf[3];
#endif
coeffs, lThresh, fThresh);
+#if dcn == 3
dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
-#if dcn == 4
- dst_ptr[3] = MAX_NUM;
+#else
+ *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
+ SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
#endif
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
if (x < cols)
{
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows)
{
- int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
- int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ __global const float * src = (__global const float *)(srcptr + src_index);
+ __global float * dst = (__global float *)(dstptr + dst_index);
float4 src_pix = vload4(0, src);
float srcbuf[3], dstbuf[3];
#if dcn == 4
dst[3] = MAX_NUM;
#endif
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
}
- ++y;
}
}
}
__global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (x < cols && y < rows)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, mad24(x, scnbytes, src_offset));
- int dst_idx = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
+ #pragma unroll
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ if (y < rows)
+ {
+ __global const float * src = (__global const float *)(srcptr + src_index);
+ __global float * dst = (__global float *)(dstptr + dst_index);
- float R = src[0], G = src[1], B = src[2];
+ float R = src[0], G = src[1], B = src[2];
#ifdef SRGB
- R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
- float X = R*coeffs[0] + G*coeffs[1] + B*coeffs[2];
- float Y = R*coeffs[3] + G*coeffs[4] + B*coeffs[5];
- float Z = R*coeffs[6] + G*coeffs[7] + B*coeffs[8];
+ float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
+ float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
+ float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
- float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
- L = 116.f*L - 16.f;
+ float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+ L = fma(116.f, L, -16.f);
- float d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON);
- float u = L*(X*d - _un);
- float v = L*((9*0.25f)*Y*d - _vn);
+ float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
+ float u = L*fma(X, d, -_un);
+ float v = L*fma(2.25f, Y*d, -_vn);
- dst[0] = L;
- dst[1] = u;
- dst[2] = v;
+ dst[0] = L;
+ dst[1] = u;
+ dst[2] = v;
+
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
+ }
}
}
__global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (x < cols && y < rows)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, mad24(x, scnbytes, src_offset));
- int dst_idx = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+ src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
- src += src_idx;
- dst += dst_idx;
-
- float scale = 1.0f / 255.0f;
- float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;
+ #pragma unroll
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ if (y < rows)
+ {
+ float scale = 1.0f / 255.0f;
+ float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;
#ifdef SRGB
- R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
- float X = R*coeffs[0] + G*coeffs[1] + B*coeffs[2];
- float Y = R*coeffs[3] + G*coeffs[4] + B*coeffs[5];
- float Z = R*coeffs[6] + G*coeffs[7] + B*coeffs[8];
+ float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
+ float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
+ float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
+
+ float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+ L = 116.f*L - 16.f;
- float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
- L = 116.f*L - 16.f;
+ float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
+ float u = L*(X*d - _un);
+ float v = L*fma(2.25f, Y*d, -_vn);
- float d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON);
- float u = L*(X*d - _un);
- float v = L*((9*0.25f)*Y*d - _vn);
+ dst[0] = SAT_CAST(L * 2.55f);
+ dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
+ dst[2] = SAT_CAST(fma(v, 0.99609375f, 139.453125f));
- dst[0] = SAT_CAST(L * 2.55f);
- dst[1] = SAT_CAST(mad(u, 0.72033898305084743f, 96.525423728813564f));
- dst[2] = SAT_CAST(mad(v, 0.99609375f, 139.453125f));
+ ++y;
+ dst += dst_step;
+ src += src_step;
+ }
}
}
__constant float * coeffs, float _un, float _vn)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (x < cols && y < rows)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, mad24(x, scnbytes, src_offset));
- int dst_idx = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
-
- __global const float * src = (__global const float *)(srcptr + src_idx);
- __global float * dst = (__global float *)(dstptr + dst_idx);
-
- float L = src[0], u = src[1], v = src[2], d, X, Y, Z;
- Y = (L + 16.f) * (1.f/116.f);
- Y = Y*Y*Y;
- d = (1.f/13.f)/L;
- u = u*d + _un;
- v = v*d + _vn;
- float iv = 1.f/v;
- X = 2.25f * u * Y * iv ;
- Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
-
- float R = X*coeffs[0] + Y*coeffs[1] + Z*coeffs[2];
- float G = X*coeffs[3] + Y*coeffs[4] + Z*coeffs[5];
- float B = X*coeffs[6] + Y*coeffs[7] + Z*coeffs[8];
+ int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+ #pragma unroll
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ if (y < rows)
+ {
+ __global const float * src = (__global const float *)(srcptr + src_index);
+ __global float * dst = (__global float *)(dstptr + dst_index);
+
+ float L = src[0], u = src[1], v = src[2], d, X, Y, Z;
+ Y = (L + 16.f) * (1.f/116.f);
+ Y = Y*Y*Y;
+ d = (1.f/13.f)/L;
+ u = fma(u, d, _un);
+ v = fma(v, d, _vn);
+ float iv = 1.f/v;
+ X = 2.25f * u * Y * iv;
+ Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv;
+
+ float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
+ float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
+ float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
#ifdef SRGB
- R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
- dst[0] = R;
- dst[1] = G;
- dst[2] = B;
+ dst[0] = R;
+ dst[1] = G;
+ dst[2] = B;
#if dcn == 4
- dst[3] = MAX_NUM;
+ dst[3] = MAX_NUM;
#endif
+ ++y;
+ dst_index += dst_step;
+ src_index += src_step;
+ }
}
}
__constant float * coeffs, float _un, float _vn)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1) * PIX_PER_WI_Y;
- if (x < cols && y < rows)
+ if (x < cols)
{
- int src_idx = mad24(y, src_step, mad24(x, scnbytes, src_offset));
- int dst_idx = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
-
- src += src_idx;
- dst += dst_idx;
-
- float d, X, Y, Z;
- float L = src[0]*(100.f/255.f);
- float u = (float)(src[1]*1.388235294117647f - 134.f);
- float v = (float)(src[2]*1.003921568627451f - 140.f);
- Y = (L + 16.f) * (1.f/116.f);
- Y = Y*Y*Y;
- d = (1.f/13.f)/L;
- u = u*d + _un;
- v = v*d + _vn;
- float iv = 1.f/v;
- X = 2.25f * u * Y * iv ;
- Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
-
- float R = X*coeffs[0] + Y*coeffs[1] + Z*coeffs[2];
- float G = X*coeffs[3] + Y*coeffs[4] + Z*coeffs[5];
- float B = X*coeffs[6] + Y*coeffs[7] + Z*coeffs[8];
+ src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
+ dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+ #pragma unroll
+ for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+ if (y < rows)
+ {
+ float d, X, Y, Z;
+ float L = src[0]*(100.f/255.f);
+ float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
+ float v = fma(convert_float(src[2]), 1.003921568627451f, - 140.f);
+ Y = (L + 16.f) * (1.f/116.f);
+ Y = Y*Y*Y;
+ d = (1.f/13.f)/L;
+ u = fma(u, d, _un);
+ v = fma(v, d, _vn);
+ float iv = 1.f/v;
+ X = 2.25f * u * Y * iv ;
+ Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv;
+
+ float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
+ float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
+ float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
#ifdef SRGB
- R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
- B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+ B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
#endif
- dst[0] = SAT_CAST(R * 255.0f);
- dst[1] = SAT_CAST(G * 255.0f);
- dst[2] = SAT_CAST(B * 255.0f);
+ uchar dst0 = SAT_CAST(R * 255.0f);
+ uchar dst1 = SAT_CAST(G * 255.0f);
+ uchar dst2 = SAT_CAST(B * 255.0f);
#if dcn == 4
- dst[3] = MAX_NUM;
+ *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
+#else
+ dst[0] = dst0;
+ dst[1] = dst1;
+ dst[2] = dst2;
#endif
+
+ ++y;
+ dst += dst_step;
+ src += src_step;
+ }
}
}