From 1466621f99f0418fb575c68caabb8c200e43904f Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Mon, 27 Oct 2014 14:52:17 +0300 Subject: [PATCH] Added loading 4 pixels in line instead of 2 to RGB[A] -> YUV(420) kernel --- modules/imgproc/src/color.cpp | 17 ++++++++-- modules/imgproc/src/opencl/cvtcolor.cl | 62 +++++++++++++++++++++++++++++++--- 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index f363189..dcbfb8f 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -4857,6 +4857,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) ocl::Device dev = ocl::Device::getDefault(); int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1; + int pxPerWIx = 1; size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy }; cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ", @@ -5025,10 +5026,20 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 ); dstSz = Size(sz.width, sz.height / 2 * 3); - globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy; + _dst.create(dstSz, CV_MAKETYPE(depth, dcn)); + dst = _dst.getUMat(); + + if (dev.isIntel() && src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 && + dst.step % 4 == 0 && dst.offset % 4 == 0) + { + pxPerWIx = 2; + } + globalsize[0] = dstSz.width / (2 * pxPerWIx); globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy; + k.create("RGB2YUV_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc, - opts + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx)); - break; + opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D PIX_PER_WI_X=%d", dcn, bidx, uidx, pxPerWIx)); + k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)); + return k.run(2, globalsize, NULL, false); } case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl index c3cfd0d..e660a52 100644 --- a/modules/imgproc/src/opencl/cvtcolor.cl +++ b/modules/imgproc/src/opencl/cvtcolor.cl @@ -119,6 +119,10 @@ enum #define yidx 0 #endif +#ifndef PIX_PER_WI_X +#define PIX_PER_WI_X 1 +#endif + #define __CAT(x, y) x##y #define CAT(x, y) __CAT(x, y) @@ -454,7 +458,7 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int __global uchar* dstptr, int dst_step, int dst_offset, int rows, int cols) { - int x = get_global_id(0); + int x = get_global_id(0) * PIX_PER_WI_X; int y = get_global_id(1) * PIX_PER_WI_Y; if (x < cols/2) @@ -463,6 +467,7 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset); int y_rows = rows / 3 * 2; int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)}; + __constant float* coeffs = c_RGB2YUVCoeffs_420; #pragma unroll for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) @@ -477,12 +482,61 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int __global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x); __global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0); +#if PIX_PER_WI_X == 2 + int s11 = *((__global const int*) src1); + int s12 = *((__global const int*) src1 + 1); + int s13 = *((__global const int*) src1 + 2); +#if scn == 4 + int s14 = *((__global const int*) src1 + 3); +#endif + int s21 = *((__global const int*) src2); + int s22 = *((__global const int*) src2 + 1); + int s23 = *((__global const int*) src2 + 2); +#if scn == 4 + int s24 = *((__global const int*) src2 + 3); +#endif + float src_pix1[scn * 4], src_pix2[scn * 4]; + + *((float4*) src_pix1) = convert_float4(as_uchar4(s11)); + *((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12)); + *((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13)); +#if scn == 4 + *((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14)); +#endif + *((float4*) src_pix2) = convert_float4(as_uchar4(s21)); + *((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22)); + *((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23)); +#if scn == 4 + *((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24)); +#endif + uchar4 y1, y2; + y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[ 2-bidx], fma(coeffs[1], src_pix1[ 1], fma(coeffs[2], src_pix1[ bidx], 16.5f)))); + y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[ scn+2-bidx], fma(coeffs[1], src_pix1[ scn+1], fma(coeffs[2], src_pix1[ scn+bidx], 16.5f)))); + y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f)))); + y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f)))); + y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[ 2-bidx], fma(coeffs[1], src_pix2[ 1], fma(coeffs[2], src_pix2[ bidx], 16.5f)))); + y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[ scn+2-bidx], fma(coeffs[1], src_pix2[ scn+1], fma(coeffs[2], src_pix2[ scn+bidx], 16.5f)))); + y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f)))); + y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f)))); + + *((__global int*) ydst1) = as_int(y1); + *((__global int*) ydst2) = as_int(y2); + + float uv[4] = { fma(coeffs[3], src_pix1[ 2-bidx], fma(coeffs[4], src_pix1[ 1], fma(coeffs[5], src_pix1[ bidx], 128.5f))), + fma(coeffs[5], src_pix1[ 2-bidx], fma(coeffs[6], src_pix1[ 1], fma(coeffs[7], src_pix1[ bidx], 128.5f))), + fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))), + fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) }; + + udst[0] = convert_uchar_sat(uv[uidx] ); + vdst[0] = convert_uchar_sat(uv[1 - uidx]); + udst[1] = convert_uchar_sat(uv[2 + uidx]); + vdst[1] = convert_uchar_sat(uv[3 - uidx]); +#else float4 src_pix1 = convert_float4(vload4(0, src1)); float4 src_pix2 = convert_float4(vload4(0, src1+scn)); float4 src_pix3 = convert_float4(vload4(0, src2)); float4 src_pix4 = convert_float4(vload4(0, src2+scn)); - __constant float* coeffs = c_RGB2YUVCoeffs_420; ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f)))); ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f)))); ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f)))); @@ -493,7 +547,7 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int udst[0] = convert_uchar_sat(uv[uidx] ); vdst[0] = convert_uchar_sat(uv[1-uidx]); - +#endif ++y; src_index += 2*src_step; ydst_index += 2*dst_step; @@ -522,7 +576,6 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of float U = ((float) src[uidx]) - HALF_MAX; float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX; - __constant float* coeffs = c_YUV2RGBCoeffs_420; float ruv = fma(coeffs[4], V, 0.5f); float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f)); @@ -535,7 +588,6 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of #if dcn == 4 dst[3] = 255; #endif - float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0]; dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv); dst[dcn + 1] = convert_uchar_sat(y01 + guv); -- 2.7.4