ocl::Device dev = ocl::Device::getDefault();
int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
+ int pxPerWIx = 1;
size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy };
cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
dstSz = Size(sz.width, sz.height / 2 * 3);
- globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy;
+ _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
+ dst = _dst.getUMat();
+
+ if (dev.isIntel() && src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
+ dst.step % 4 == 0 && dst.offset % 4 == 0)
+ {
+ pxPerWIx = 2;
+ }
+ globalsize[0] = dstSz.width / (2 * pxPerWIx); globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy;
+
k.create("RGB2YUV_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
- opts + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx));
- break;
+ opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D PIX_PER_WI_X=%d", dcn, bidx, uidx, pxPerWIx));
+ k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
+ return k.run(2, globalsize, NULL, false);
}
case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
#define yidx 0
#endif
+#ifndef PIX_PER_WI_X
+#define PIX_PER_WI_X 1
+#endif
+
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
__global uchar* dstptr, int dst_step, int dst_offset,
int rows, int cols)
{
- int x = get_global_id(0);
+ int x = get_global_id(0) * PIX_PER_WI_X;
int y = get_global_id(1) * PIX_PER_WI_Y;
if (x < cols/2)
int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
int y_rows = rows / 3 * 2;
int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
+ __constant float* coeffs = c_RGB2YUVCoeffs_420;
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
__global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);
__global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);
+#if PIX_PER_WI_X == 2
+ int s11 = *((__global const int*) src1);
+ int s12 = *((__global const int*) src1 + 1);
+ int s13 = *((__global const int*) src1 + 2);
+#if scn == 4
+ int s14 = *((__global const int*) src1 + 3);
+#endif
+ int s21 = *((__global const int*) src2);
+ int s22 = *((__global const int*) src2 + 1);
+ int s23 = *((__global const int*) src2 + 2);
+#if scn == 4
+ int s24 = *((__global const int*) src2 + 3);
+#endif
+ float src_pix1[scn * 4], src_pix2[scn * 4];
+
+ *((float4*) src_pix1) = convert_float4(as_uchar4(s11));
+ *((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
+ *((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
+#if scn == 4
+ *((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
+#endif
+ *((float4*) src_pix2) = convert_float4(as_uchar4(s21));
+ *((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
+ *((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
+#if scn == 4
+ *((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
+#endif
+ uchar4 y1, y2;
+ y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[ 2-bidx], fma(coeffs[1], src_pix1[ 1], fma(coeffs[2], src_pix1[ bidx], 16.5f))));
+ y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[ scn+2-bidx], fma(coeffs[1], src_pix1[ scn+1], fma(coeffs[2], src_pix1[ scn+bidx], 16.5f))));
+ y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
+ y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
+ y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[ 2-bidx], fma(coeffs[1], src_pix2[ 1], fma(coeffs[2], src_pix2[ bidx], 16.5f))));
+ y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[ scn+2-bidx], fma(coeffs[1], src_pix2[ scn+1], fma(coeffs[2], src_pix2[ scn+bidx], 16.5f))));
+ y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
+ y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
+
+ *((__global int*) ydst1) = as_int(y1);
+ *((__global int*) ydst2) = as_int(y2);
+
+ float uv[4] = { fma(coeffs[3], src_pix1[ 2-bidx], fma(coeffs[4], src_pix1[ 1], fma(coeffs[5], src_pix1[ bidx], 128.5f))),
+ fma(coeffs[5], src_pix1[ 2-bidx], fma(coeffs[6], src_pix1[ 1], fma(coeffs[7], src_pix1[ bidx], 128.5f))),
+ fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
+ fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
+
+ udst[0] = convert_uchar_sat(uv[uidx] );
+ vdst[0] = convert_uchar_sat(uv[1 - uidx]);
+ udst[1] = convert_uchar_sat(uv[2 + uidx]);
+ vdst[1] = convert_uchar_sat(uv[3 - uidx]);
+#else
float4 src_pix1 = convert_float4(vload4(0, src1));
float4 src_pix2 = convert_float4(vload4(0, src1+scn));
float4 src_pix3 = convert_float4(vload4(0, src2));
float4 src_pix4 = convert_float4(vload4(0, src2+scn));
- __constant float* coeffs = c_RGB2YUVCoeffs_420;
ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));
udst[0] = convert_uchar_sat(uv[uidx] );
vdst[0] = convert_uchar_sat(uv[1-uidx]);
-
+#endif
++y;
src_index += 2*src_step;
ydst_index += 2*dst_step;
float U = ((float) src[uidx]) - HALF_MAX;
float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX;
-
__constant float* coeffs = c_YUV2RGBCoeffs_420;
float ruv = fma(coeffs[4], V, 0.5f);
float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
#if dcn == 4
dst[3] = 255;
#endif
-
float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
dst[dcn + 1] = convert_uchar_sat(y01 + guv);