From 007593cab796a878cac5f24242b25f17daada932 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 4 Jun 2014 15:24:29 +0400 Subject: [PATCH] cvtColor - optimized index calculations; usage of build-in functions --- modules/imgproc/src/color.cpp | 18 +- modules/imgproc/src/opencl/cvtcolor.cl | 892 ++++++++++++++++++-------------- modules/imgproc/test/ocl/test_color.cpp | 6 +- 3 files changed, 524 insertions(+), 392 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 9440178..fe460ee 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -2730,8 +2730,6 @@ struct mRGBA2RGBA #ifdef HAVE_OPENCL -#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) - static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) { bool ok = false; @@ -2739,23 +2737,17 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) Size sz = src.size(), dstSz = sz; int scn = src.channels(), depth = src.depth(), bidx; int dims = 2, stripeSize = 1; - size_t globalsize[] = { src.cols, src.rows }; ocl::Kernel k; if (depth != CV_8U && depth != CV_16U && depth != CV_32F) return false; - cv::String opts = format("-D depth=%d -D scn=%d ", depth, scn); - ocl::Device dev = ocl::Device::getDefault(); - int pxPerWIy = 1; - if (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) && - !(code == CV_BGR2Luv || code == CV_RGB2Luv || code == CV_LBGR2Luv || code == CV_LRGB2Luv || - code == CV_Luv2BGR || code == CV_Luv2RGB || code == CV_Luv2LBGR || code == CV_Luv2LRGB)) - pxPerWIy = 4; - - globalsize[1] = DIVUP(globalsize[1], pxPerWIy); - opts += format("-D PIX_PER_WI_Y=%d ", pxPerWIy); + int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1; + + size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy }; + cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ", + depth, scn, pxPerWIy); switch (code) { diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl index 5bad3ee..da835e0 100644 --- a/modules/imgproc/src/opencl/cvtcolor.cl +++ b/modules/imgproc/src/opencl/cvtcolor.cl @@ -71,10 +71,6 @@ #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)" #endif -#ifndef STRIPE_SIZE -#define STRIPE_SIZE 1 -#endif - #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) enum @@ -122,8 +118,8 @@ enum ///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// -__kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset, + __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols) { int x = get_global_id(0); @@ -131,27 +127,32 @@ __kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index); DATA_TYPE_4 src_pix = vload4(0, src); #ifdef DEPTH_5 - dst[0] = src_pix.B_COMP * 0.114f + src_pix.G_COMP * 0.587f + src_pix.R_COMP * 0.299f; + dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f)); #else - dst[0] = (DATA_TYPE)CV_DESCALE((src_pix.B_COMP * B2Y + src_pix.G_COMP * G2Y + src_pix.R_COMP * R2Y), yuv_shift); + dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, src_pix.R_COMP * R2Y)), yuv_shift); #endif + ++y; + src_index += src_step; + dst_index += dst_step; } - ++y; } } } -__kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset, + __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols) { int x = get_global_id(0); @@ -159,20 +160,29 @@ __kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index); DATA_TYPE val = src[0]; +#if dcn == 3 || defined DEPTH_5 dst[0] = dst[1] = dst[2] = val; #if dcn == 4 dst[3] = MAX_NUM; #endif +#else + *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM); +#endif + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -182,8 +192,8 @@ __kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, __constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f }; __constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; -__kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset, + __global uchar* dstptr, int dst_step, int dt_offset, int rows, int cols) { int x = get_global_id(0); @@ -191,34 +201,40 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index); DATA_TYPE_4 src_pix = vload4(0, src); - DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP; + DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YUVCoeffs_f; - const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2]; - const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX; - const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX; + const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2])); + const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX); + const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX); #else __constant int * coeffs = c_RGB2YUVCoeffs_i; const int delta = HALF_MAX * (1 << yuv_shift); - const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift); - const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift); - const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift); + const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], r * coeffs[2])), yuv_shift); + const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift); + const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift); #endif dst[0] = SAT_CAST( Y ); dst[1] = SAT_CAST( U ); dst[2] = SAT_CAST( V ); + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -226,8 +242,8 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset, __constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; __constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; -__kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset, + __global uchar* dstptr, int dst_step, int dt_offset, int rows, int cols) { int x = get_global_id(0); @@ -235,25 +251,28 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index); DATA_TYPE_4 src_pix = vload4(0, src); DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z; #ifdef DEPTH_5 __constant float * coeffs = c_YUV2RGBCoeffs_f; - const float r = Y + (V - HALF_MAX) * coeffs[3]; - const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; - const float b = Y + (U - HALF_MAX) * coeffs[0]; + float r = fma(V - HALF_MAX, coeffs[3], Y); + float g = fma(V - HALF_MAX, coeffs[2], fma(U - HALF_MAX, coeffs[1], Y)); + float b = fma(U - HALF_MAX, coeffs[0], Y); #else __constant int * coeffs = c_YUV2RGBCoeffs_i; const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift); - const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], (U - HALF_MAX) * coeffs[1]), yuv_shift); const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift); #endif @@ -263,8 +282,10 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, #if dcn == 4 dst[3] = MAX_NUM; #endif + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -276,8 +297,8 @@ __constant int ITUR_BT_601_CVG = 852492; __constant int ITUR_BT_601_CVR = 1673527; __constant int ITUR_BT_601_SHIFT = 20; -__kernel void YUV2RGB_NV12(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void YUV2RGB_NV12(__global const uchar* srcptr, int src_step, int src_offset, + __global uchar* dstptr, int dst_step, int dt_offset, int rows, int cols) { int x = get_global_id(0); @@ -290,15 +311,15 @@ __kernel void YUV2RGB_NV12(__global const uchar* srcptr, int srcstep, int srcoff { if (y < rows / 2 ) { - __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset); - __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset); - __global uchar* dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset); - __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset); + __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset); + __global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset); + __global uchar* dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset); + __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dst_step, x * (dcn<<1) + dt_offset); int Y1 = ysrc[0]; int Y2 = ysrc[1]; - int Y3 = ysrc[srcstep]; - int Y4 = ysrc[srcstep + 1]; + int Y3 = ysrc[src_step]; + int Y4 = ysrc[src_step + 1]; int U = usrc[0] - 128; int V = usrc[1] - 128; @@ -349,8 +370,8 @@ __kernel void YUV2RGB_NV12(__global const uchar* srcptr, int srcstep, int srcoff __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; __constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; -__kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset, - __global uchar* dstptr, int dststep, int dstoffset, +__kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset, + __global uchar* dstptr, int dst_step, int dt_offset, int rows, int cols) { int x = get_global_id(0); @@ -358,34 +379,40 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index); DATA_TYPE_4 src_pix = vload4(0, src); - DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP; + DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YCrCbCoeffs_f; - DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0]; - DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX; + DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0])); + DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX); + DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX); #else __constant int * coeffs = c_RGB2YCrCbCoeffs_i; int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift); - int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift); + int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], r * coeffs[0])), yuv_shift); + int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift); + int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift); #endif dst[0] = SAT_CAST( Y ); dst[1] = SAT_CAST( Cr ); dst[2] = SAT_CAST( Cb ); + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -402,28 +429,29 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx); - __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx); + __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index); + __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index); DATA_TYPE_4 src_pix = vload4(0, srcptr); DATA_TYPE y = src_pix.x, cr = src_pix.y, cb = src_pix.z; #ifdef DEPTH_5 __constant float * coeff = c_YCrCb2RGBCoeffs_f; - float r = y + coeff[0] * (cr - HALF_MAX); - float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX); - float b = y + coeff[3] * (cb - HALF_MAX); + float r = fma(coeff[0], cr - HALF_MAX, y); + float g = fma(coeff[1], cr - HALF_MAX, fma(coeff[2], cb - HALF_MAX, y)); + float b = fma(coeff[3], cb - HALF_MAX, y); #else __constant int * coeff = c_YCrCb2RGBCoeffs_i; int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift); - int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift); + int g = y + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX, coeff[2] * (cb - HALF_MAX)), yuv_shift); int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift); #endif @@ -433,8 +461,11 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset, #if dcn == 4 dstptr[3] = MAX_NUM; #endif + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -450,34 +481,37 @@ __kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offse if (dx < cols) { + int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset)); + int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (dy < rows) { - int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes); - int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes); - - __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); - __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); + __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index); + __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index); DATA_TYPE_4 src_pix = vload4(0, src); DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z; #ifdef DEPTH_5 - float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; - float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5]; - float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8]; + float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2])); + float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5])); + float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8])); #else - int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift); - int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift); - int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift); + int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift); + int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift); + int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift); #endif dst[0] = SAT_CAST(x); dst[1] = SAT_CAST(y); dst[2] = SAT_CAST(z); + + ++dy; + dst_index += dst_step; + src_index += src_step; } - ++dy; } } } @@ -491,37 +525,48 @@ __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offse if (dx < cols) { + int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset)); + int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (dy < rows) { - int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes); - int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes); - - __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); - __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); + __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index); + __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index); DATA_TYPE_4 src_pix = vload4(0, src); DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z; #ifdef DEPTH_5 - float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; - float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5]; - float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8]; + float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2])); + float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5])); + float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8])); #else - int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift); - int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift); - int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift); + int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift); + int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift); + int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift); #endif - dst[0] = SAT_CAST(b); - dst[1] = SAT_CAST(g); - dst[2] = SAT_CAST(r); + + DATA_TYPE dst0 = SAT_CAST(b); + DATA_TYPE dst1 = SAT_CAST(g); + DATA_TYPE dst2 = SAT_CAST(r); +#if dcn == 3 || defined DEPTH_5 + dst[0] = dst0; + dst[1] = dst1; + dst[2] = dst2; #if dcn == 4 dst[3] = MAX_NUM; #endif +#else + *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM); +#endif + + ++dy; + dst_index += dst_step; + src_index += src_step; } - ++dy; } } } @@ -537,16 +582,16 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); - __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); + __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index); + __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index); DATA_TYPE_4 src_pix = vload4(0, src); #ifdef REVERSE @@ -566,8 +611,11 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset, dst[3] = src[3]; #endif #endif + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -583,34 +631,38 @@ __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - ushort t = *((__global const ushort*)(src + src_idx)); + ushort t = *((__global const ushort*)(src + src_index)); #if greenbits == 6 - dst[dst_idx + bidx] = (uchar)(t << 3); - dst[dst_idx + 1] = (uchar)((t >> 3) & ~3); - dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7); + dst[dst_index + bidx] = (uchar)(t << 3); + dst[dst_index + 1] = (uchar)((t >> 3) & ~3); + dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7); #else - dst[dst_idx + bidx] = (uchar)(t << 3); - dst[dst_idx + 1] = (uchar)((t >> 2) & ~7); - dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7); + dst[dst_index + bidx] = (uchar)(t << 3); + dst[dst_index + 1] = (uchar)((t >> 2) & ~7); + dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7); #endif #if dcn == 4 #if greenbits == 6 - dst[dst_idx + 3] = 255; + dst[dst_index + 3] = 255; #else - dst[dst_idx + 3] = t & 0x8000 ? 255 : 0; + dst[dst_index + 3] = t & 0x8000 ? 255 : 0; #endif #endif + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -624,25 +676,29 @@ __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - uchar4 src_pix = vload4(0, src + src_idx); + uchar4 src_pix = vload4(0, src + src_index); #if greenbits == 6 - *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8)); + *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8)); #elif scn == 3 - *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7)); + *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7)); #else - *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)| + *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)| ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0)); #endif + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -658,26 +714,25 @@ __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offse if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, dst_offset + x); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x); - int t = *((__global const ushort*)(src + src_idx)); + int t = *((__global const ushort*)(src + src_index)); #if greenbits == 6 - dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + - ((t >> 3) & 0xfc)*G2Y + - ((t >> 8) & 0xf8)*R2Y, yuv_shift); + dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift); #else - dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + - ((t >> 2) & 0xf8)*G2Y + - ((t >> 7) & 0xf8)*R2Y, yuv_shift); + dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift); #endif + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -691,23 +746,26 @@ __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offse if (x < cols) { + int src_index = mad24(y, src_step, src_offset + x); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - int t = src[src_idx]; + int t = src[src_index]; #if greenbits == 6 - *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); + *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); #else t >>= 3; - *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10)); + *((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10)); #endif + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -733,40 +791,44 @@ __kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - uchar4 src_pix = vload4(0, src + src_idx); + uchar4 src_pix = vload4(0, src + src_index); int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; int h, s, v = b; int vmin = b, diff; int vr, vg; - v = max( v, g ); - v = max( v, r ); - vmin = min( vmin, g ); - vmin = min( vmin, r ); + v = max(v, g); + v = max(v, r); + vmin = min(vmin, g); + vmin = min(vmin, r); diff = v - vmin; vr = v == r ? -1 : 0; vg = v == g ? -1 : 0; - s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; + s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift; h = (vr & (g - b)) + - (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); - h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; + (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g)))); + h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift; h += h < 0 ? hrange : 0; - dst[dst_idx] = convert_uchar_sat_rte(h); - dst[dst_idx + 1] = (uchar)s; - dst[dst_idx + 2] = (uchar)v; + dst[dst_index] = convert_uchar_sat_rte(h); + dst[dst_index + 1] = (uchar)s; + dst[dst_index + 2] = (uchar)v; + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -780,14 +842,15 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - uchar4 src_pix = vload4(0, src + src_idx); + uchar4 src_pix = vload4(0, src + src_index); float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f); float b, g, r; @@ -821,14 +884,17 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset, else b = g = r = v; - dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); - dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f); - dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f); + dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f); + dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f); + dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst[dst_index + 3] = MAX_NUM; #endif + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -844,16 +910,16 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + __global const float * src = (__global const float *)(srcptr + src_index); + __global float * dst = (__global float *)(dstptr + dst_index); float4 src_pix = vload4(0, src); float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; @@ -873,17 +939,21 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset if( v == r ) h = (g - b)*diff; else if( v == g ) - h = (b - r)*diff + 120.f; + h = fma(b - r, diff, 120.f); else - h = (r - g)*diff + 240.f; + h = fma(r - g, diff, 240.f); - if( h < 0 ) h += 360.f; + if( h < 0 ) + h += 360.f; dst[0] = h*hscale; dst[1] = s; dst[2] = v; + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -897,16 +967,17 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + __global const float * src = (__global const float *)(srcptr + src_index); + __global float * dst = (__global float *)(dstptr + dst_index); float4 src_pix = vload4(0, src); float h = src_pix.x, s = src_pix.y, v = src_pix.z; @@ -947,8 +1018,11 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset #if dcn == 4 dst[3] = MAX_NUM; #endif + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -968,14 +1042,15 @@ __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - uchar4 src_pix = vload4(0, src + src_idx); + uchar4 src_pix = vload4(0, src + src_index); float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f); float h = 0.f, s = 0.f, l; @@ -998,18 +1073,22 @@ __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset, if( vmax == r ) h = (g - b)*diff; else if( vmax == g ) - h = (b - r)*diff + 120.f; + h = fma(b - r, diff, 120.f); else - h = (r - g)*diff + 240.f; + h = fma(r - g, diff, 240.f); - if( h < 0.f ) h += 360.f; + if( h < 0.f ) + h += 360.f; } - dst[dst_idx] = convert_uchar_sat_rte(h*hscale); - dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f); - dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f); + dst[dst_index] = convert_uchar_sat_rte(h*hscale); + dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f); + dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f); + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1023,14 +1102,15 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - uchar4 src_pix = vload4(0, src + src_idx); + uchar4 src_pix = vload4(0, src + src_index); float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f); float b, g, r; @@ -1053,8 +1133,8 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset, tab[0] = p2; tab[1] = p1; - tab[2] = p1 + (p2 - p1)*(1-h); - tab[3] = p1 + (p2 - p1)*h; + tab[2] = fma(p2 - p1, 1-h, p1); + tab[3] = fma(p2 - p1, h, p1); b = tab[sector_data[sector][0]]; g = tab[sector_data[sector][1]]; @@ -1063,14 +1143,17 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset, else b = g = r = l; - dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); - dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f); - dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f); + dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f); + dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f); + dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst[dst_index + 3] = MAX_NUM; #endif + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1086,16 +1169,16 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + __global const float * src = (__global const float *)(srcptr + src_index); + __global float * dst = (__global float *)(dstptr + dst_index); float4 src_pix = vload4(0, src); float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; @@ -1119,9 +1202,9 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset if( vmax == r ) h = (g - b)*diff; else if( vmax == g ) - h = (b - r)*diff + 120.f; + h = fma(b - r, diff, 120.f); else - h = (r - g)*diff + 240.f; + h = fma(r - g, diff, 240.f); if( h < 0.f ) h += 360.f; } @@ -1129,8 +1212,11 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset dst[0] = h*hscale; dst[1] = l; dst[2] = s; + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1144,16 +1230,16 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + __global const float * src = (__global const float *)(srcptr + src_index); + __global float * dst = (__global float *)(dstptr + dst_index); float4 src_pix = vload4(0, src); float h = src_pix.x, l = src_pix.y, s = src_pix.z; @@ -1178,8 +1264,8 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset tab[0] = p2; tab[1] = p1; - tab[2] = p1 + (p2 - p1)*(1-h); - tab[3] = p1 + (p2 - p1)*h; + tab[2] = fma(p2 - p1, 1-h, p1); + tab[3] = fma(p2 - p1, h, p1); b = tab[sector_data[sector][0]]; g = tab[sector_data[sector][1]]; @@ -1194,8 +1280,11 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset #if dcn == 4 dst[3] = MAX_NUM; #endif + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1215,24 +1304,25 @@ __kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset if (x < cols) { + int src_index = mad24(y, src_step, src_offset + (x << 2)); + int dst_index = mad24(y, dst_step, dst_offset + (x << 2)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + (x << 2)); - int dst_idx = mad24(y, dst_step, dst_offset + (x << 2)); - uchar4 src_pix = vload4(0, src + src_idx); + uchar4 src_pix = *(__global const uchar4 *)(src + src_index); - uchar v0 = src_pix.x, v1 = src_pix.y; - uchar v2 = src_pix.z, v3 = src_pix.w; + *(__global uchar4 *)(dst + dst_index) = + (uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX) / MAX_NUM, + mad24(src_pix.y, src_pix.w, HALF_MAX) / MAX_NUM, + mad24(src_pix.z, src_pix.w, HALF_MAX) / MAX_NUM, src_pix.w); - dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 3] = v3; + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1246,25 +1336,29 @@ __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, 4, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + (x << 2)); - int dst_idx = mad24(y, dst_step, dst_offset + (x << 2)); - uchar4 src_pix = vload4(0, src + src_idx); - - uchar v0 = src_pix.x, v1 = src_pix.y; - uchar v2 = src_pix.z, v3 = src_pix.w; - uchar v3_half = v3 / 2; - - dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 3] = v3; + uchar4 src_pix = *(__global const uchar4 *)(src + src_index); + uchar v3 = src_pix.w, v3_half = v3 / 2; + + if (v3 == 0) + *(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0); + else + *(__global uchar4 *)(dst + dst_index) = + (uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3, + mad24(src_pix.y, MAX_NUM, v3_half) / v3, + mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3); + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1283,8 +1377,8 @@ inline float splineInterpolate(float x, __global const float * tab, int n) { int ix = clamp(convert_int_sat_rtn(x), 0, n-1); x -= ix; - tab += ix*4; - return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0]; + tab += ix << 2; + return fma(fma(fma(tab[3], x, tab[2]), x, tab[1]), x, tab[0]); } #ifdef DEPTH_0 @@ -1299,16 +1393,16 @@ __kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const uchar* src_ptr = src + src_idx; - __global uchar* dst_ptr = dst + dst_idx; + __global const uchar* src_ptr = src + src_index; + __global uchar* dst_ptr = dst + dst_index; uchar4 src_pix = vload4(0, src_ptr); int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], @@ -1316,19 +1410,22 @@ __kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset, C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z]; - int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)]; - int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)]; - int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)]; + int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)]; + int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)]; + int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)]; int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 ); - int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 ); - int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 ); + int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 ); + int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 ); dst_ptr[0] = SAT_CAST(L); dst_ptr[1] = SAT_CAST(a); dst_ptr[2] = SAT_CAST(b); + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1347,16 +1444,16 @@ __kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offse if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + __global const float * src = (__global const float *)(srcptr + src_index); + __global float * dst = (__global float *)(dstptr + dst_index); float4 src_pix = vload4(0, src); float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], @@ -1373,23 +1470,26 @@ __kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offse B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); #endif - float X = R*C0 + G*C1 + B*C2; - float Y = R*C3 + G*C4 + B*C5; - float Z = R*C6 + G*C7 + B*C8; + float X = fma(R, C0, fma(G, C1, B*C2)); + float Y = fma(R, C3, fma(G, C4, B*C5)); + float Z = fma(R, C6, fma(G, C7, B*C8)); - float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a); - float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a); - float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a); + float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a); + float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a); + float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a); - float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y); + float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y); float a = 500.f * (FX - FY); float b = 200.f * (FY - FZ); dst[0] = L; dst[1] = a; dst[2] = b; + + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1412,7 +1512,7 @@ inline void Lab2BGR_f(const float * srcbuf, float * dstbuf, if (li <= lThresh) { y = li / 903.3f; - fy = 7.787f * y + 16.0f / 116.0f; + fy = fma(7.787f, y, 16.0f / 116.0f); } else { @@ -1422,6 +1522,7 @@ inline void Lab2BGR_f(const float * srcbuf, float * dstbuf, float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f }; + #pragma unroll for (int j = 0; j < 2; j++) if (fxz[j] <= fThresh) fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f; @@ -1429,9 +1530,9 @@ inline void Lab2BGR_f(const float * srcbuf, float * dstbuf, fxz[j] = fxz[j] * fxz[j] * fxz[j]; float x = fxz[0], z = fxz[1]; - float ro = clamp(C0 * x + C1 * y + C2 * z, 0.0f, 1.0f); - float go = clamp(C3 * x + C4 * y + C5 * z, 0.0f, 1.0f); - float bo = clamp(C6 * x + C7 * y + C8 * z, 0.0f, 1.0f); + float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f); + float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f); + float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f); #ifdef SRGB ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); @@ -1456,16 +1557,16 @@ __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset, if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const uchar* src_ptr = src + src_idx; - __global uchar* dst_ptr = dst + dst_idx; + __global const uchar* src_ptr = src + src_index; + __global uchar * dst_ptr = dst + dst_index; uchar4 src_pix = vload4(0, src_ptr); float srcbuf[3], dstbuf[3]; @@ -1479,14 +1580,18 @@ __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset, #endif coeffs, lThresh, fThresh); +#if dcn == 3 dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f); dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f); dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f); -#if dcn == 4 - dst_ptr[3] = MAX_NUM; +#else + *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f), + SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM); #endif + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1505,16 +1610,16 @@ __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offse if (x < cols) { + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { if (y < rows) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + __global const float * src = (__global const float *)(srcptr + src_index); + __global float * dst = (__global float *)(dstptr + dst_index); float4 src_pix = vload4(0, src); float srcbuf[3], dstbuf[3]; @@ -1530,8 +1635,10 @@ __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offse #if dcn == 4 dst[3] = MAX_NUM; #endif + ++y; + dst_index += dst_step; + src_index += src_step; } - ++y; } } } @@ -1555,37 +1662,46 @@ __kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offse __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (x < cols && y < rows) + if (x < cols) { - int src_idx = mad24(y, src_step, mad24(x, scnbytes, src_offset)); - int dst_idx = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + if (y < rows) + { + __global const float * src = (__global const float *)(srcptr + src_index); + __global float * dst = (__global float *)(dstptr + dst_index); - float R = src[0], G = src[1], B = src[2]; + float R = src[0], G = src[1], B = src[2]; #ifdef SRGB - R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); #endif - float X = R*coeffs[0] + G*coeffs[1] + B*coeffs[2]; - float Y = R*coeffs[3] + G*coeffs[4] + B*coeffs[5]; - float Z = R*coeffs[6] + G*coeffs[7] + B*coeffs[8]; + float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2])); + float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5])); + float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8])); - float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); - L = 116.f*L - 16.f; + float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); + L = fma(116.f, L, -16.f); - float d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON); - float u = L*(X*d - _un); - float v = L*((9*0.25f)*Y*d - _vn); + float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON); + float u = L*fma(X, d, -_un); + float v = L*fma(2.25f, Y*d, -_vn); - dst[0] = L; - dst[1] = u; - dst[2] = v; + dst[0] = L; + dst[1] = u; + dst[2] = v; + + ++y; + dst_index += dst_step; + src_index += src_step; + } } } @@ -1599,38 +1715,44 @@ __kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset, __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (x < cols && y < rows) + if (x < cols) { - int src_idx = mad24(y, src_step, mad24(x, scnbytes, src_offset)); - int dst_idx = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + src += mad24(y, src_step, mad24(x, scnbytes, src_offset)); + dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); - src += src_idx; - dst += dst_idx; - - float scale = 1.0f / 255.0f; - float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + if (y < rows) + { + float scale = 1.0f / 255.0f; + float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale; #ifdef SRGB - R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); #endif - float X = R*coeffs[0] + G*coeffs[1] + B*coeffs[2]; - float Y = R*coeffs[3] + G*coeffs[4] + B*coeffs[5]; - float Z = R*coeffs[6] + G*coeffs[7] + B*coeffs[8]; + float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2])); + float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5])); + float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8])); + + float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); + L = 116.f*L - 16.f; - float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); - L = 116.f*L - 16.f; + float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON); + float u = L*(X*d - _un); + float v = L*fma(2.25f, Y*d, -_vn); - float d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON); - float u = L*(X*d - _un); - float v = L*((9*0.25f)*Y*d - _vn); + dst[0] = SAT_CAST(L * 2.55f); + dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f)); + dst[2] = SAT_CAST(fma(v, 0.99609375f, 139.453125f)); - dst[0] = SAT_CAST(L * 2.55f); - dst[1] = SAT_CAST(mad(u, 0.72033898305084743f, 96.525423728813564f)); - dst[2] = SAT_CAST(mad(v, 0.99609375f, 139.453125f)); + ++y; + dst += dst_step; + src += src_step; + } } } @@ -1646,42 +1768,50 @@ __kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offse __constant float * coeffs, float _un, float _vn) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (x < cols && y < rows) + if (x < cols) { - int src_idx = mad24(y, src_step, mad24(x, scnbytes, src_offset)); - int dst_idx = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); - - float L = src[0], u = src[1], v = src[2], d, X, Y, Z; - Y = (L + 16.f) * (1.f/116.f); - Y = Y*Y*Y; - d = (1.f/13.f)/L; - u = u*d + _un; - v = v*d + _vn; - float iv = 1.f/v; - X = 2.25f * u * Y * iv ; - Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv; - - float R = X*coeffs[0] + Y*coeffs[1] + Z*coeffs[2]; - float G = X*coeffs[3] + Y*coeffs[4] + Z*coeffs[5]; - float B = X*coeffs[6] + Y*coeffs[7] + Z*coeffs[8]; + int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + if (y < rows) + { + __global const float * src = (__global const float *)(srcptr + src_index); + __global float * dst = (__global float *)(dstptr + dst_index); + + float L = src[0], u = src[1], v = src[2], d, X, Y, Z; + Y = (L + 16.f) * (1.f/116.f); + Y = Y*Y*Y; + d = (1.f/13.f)/L; + u = fma(u, d, _un); + v = fma(v, d, _vn); + float iv = 1.f/v; + X = 2.25f * u * Y * iv; + Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv; + + float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2])); + float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5])); + float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8])); #ifdef SRGB - R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); #endif - dst[0] = R; - dst[1] = G; - dst[2] = B; + dst[0] = R; + dst[1] = G; + dst[2] = B; #if dcn == 4 - dst[3] = MAX_NUM; + dst[3] = MAX_NUM; #endif + ++y; + dst_index += dst_step; + src_index += src_step; + } } } @@ -1695,46 +1825,56 @@ __kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset, __constant float * coeffs, float _un, float _vn) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (x < cols && y < rows) + if (x < cols) { - int src_idx = mad24(y, src_step, mad24(x, scnbytes, src_offset)); - int dst_idx = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); - - src += src_idx; - dst += dst_idx; - - float d, X, Y, Z; - float L = src[0]*(100.f/255.f); - float u = (float)(src[1]*1.388235294117647f - 134.f); - float v = (float)(src[2]*1.003921568627451f - 140.f); - Y = (L + 16.f) * (1.f/116.f); - Y = Y*Y*Y; - d = (1.f/13.f)/L; - u = u*d + _un; - v = v*d + _vn; - float iv = 1.f/v; - X = 2.25f * u * Y * iv ; - Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv; - - float R = X*coeffs[0] + Y*coeffs[1] + Z*coeffs[2]; - float G = X*coeffs[3] + Y*coeffs[4] + Z*coeffs[5]; - float B = X*coeffs[6] + Y*coeffs[7] + Z*coeffs[8]; + src += mad24(y, src_step, mad24(x, scnbytes, src_offset)); + dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset)); + + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + if (y < rows) + { + float d, X, Y, Z; + float L = src[0]*(100.f/255.f); + float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f); + float v = fma(convert_float(src[2]), 1.003921568627451f, - 140.f); + Y = (L + 16.f) * (1.f/116.f); + Y = Y*Y*Y; + d = (1.f/13.f)/L; + u = fma(u, d, _un); + v = fma(v, d, _vn); + float iv = 1.f/v; + X = 2.25f * u * Y * iv ; + Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv; + + float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2])); + float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5])); + float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8])); #ifdef SRGB - R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE); #endif - dst[0] = SAT_CAST(R * 255.0f); - dst[1] = SAT_CAST(G * 255.0f); - dst[2] = SAT_CAST(B * 255.0f); + uchar dst0 = SAT_CAST(R * 255.0f); + uchar dst1 = SAT_CAST(G * 255.0f); + uchar dst2 = SAT_CAST(B * 255.0f); #if dcn == 4 - dst[3] = MAX_NUM; + *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM); +#else + dst[0] = dst0; + dst[1] = dst1; + dst[2] = dst2; #endif + + ++y; + dst += dst_step; + src += src_step; + } } } diff --git a/modules/imgproc/test/ocl/test_color.cpp b/modules/imgproc/test/ocl/test_color.cpp index e9fb0d3..82bf2c0 100644 --- a/modules/imgproc/test/ocl/test_color.cpp +++ b/modules/imgproc/test/ocl/test_color.cpp @@ -305,11 +305,11 @@ OCL_TEST_P(CvtColor8u32f, Lab2LRGBA) { performTest(3, 4, CVTCODE(Lab2LRGB), dept OCL_TEST_P(CvtColor8u32f, BGR2Luv) { performTest(3, 3, CVTCODE(BGR2Luv), depth == CV_8U ? 1 : 1e-2); } OCL_TEST_P(CvtColor8u32f, RGB2Luv) { performTest(3, 3, CVTCODE(RGB2Luv), depth == CV_8U ? 1 : 1e-2); } OCL_TEST_P(CvtColor8u32f, LBGR2Luv) { performTest(3, 3, CVTCODE(LBGR2Luv), depth == CV_8U ? 1 : 4e-3); } -OCL_TEST_P(CvtColor8u32f, LRGB2Luv) { performTest(3, 3, CVTCODE(LRGB2Luv), depth == CV_8U ? 1 : 4e-3); } +OCL_TEST_P(CvtColor8u32f, LRGB2Luv) { performTest(3, 3, CVTCODE(LRGB2Luv), depth == CV_8U ? 1 : 5e-3); } OCL_TEST_P(CvtColor8u32f, BGRA2Luv) { performTest(4, 3, CVTCODE(BGR2Luv), depth == CV_8U ? 1 : 8e-3); } OCL_TEST_P(CvtColor8u32f, RGBA2Luv) { performTest(4, 3, CVTCODE(RGB2Luv), depth == CV_8U ? 1 : 9e-3); } -OCL_TEST_P(CvtColor8u32f, LBGRA2Luv) { performTest(4, 3, CVTCODE(LBGR2Luv), depth == CV_8U ? 1 : 4e-3); } -OCL_TEST_P(CvtColor8u32f, LRGBA2Luv) { performTest(4, 3, CVTCODE(LRGB2Luv), depth == CV_8U ? 1 : 4e-3); } +OCL_TEST_P(CvtColor8u32f, LBGRA2Luv) { performTest(4, 3, CVTCODE(LBGR2Luv), depth == CV_8U ? 1 : 5e-3); } +OCL_TEST_P(CvtColor8u32f, LRGBA2Luv) { performTest(4, 3, CVTCODE(LRGB2Luv), depth == CV_8U ? 1 : 5e-3); } OCL_TEST_P(CvtColor8u32f, Luv2BGR) { performTest(3, 3, CVTCODE(Luv2BGR), depth == CV_8U ? 1 : 7e-5); } OCL_TEST_P(CvtColor8u32f, Luv2RGB) { performTest(3, 3, CVTCODE(Luv2RGB), depth == CV_8U ? 1 : 7e-5); } -- 2.7.4