Added loading 4 pixels in line instead of 2 to RGB[A] -> YUV(420) kernel
[profile/ivi/opencv.git] / modules / imgproc / src / opencl / cvtcolor.cl
index c3cfd0d..e660a52 100644 (file)
@@ -119,6 +119,10 @@ enum
 #define yidx 0
 #endif
 
+#ifndef PIX_PER_WI_X
+#define PIX_PER_WI_X 1
+#endif
+
 #define __CAT(x, y) x##y
 #define CAT(x, y) __CAT(x, y)
 
@@ -454,7 +458,7 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
                                 __global uchar* dstptr, int dst_step, int dst_offset,
                                 int rows, int cols)
 {
-    int x = get_global_id(0);
+    int x = get_global_id(0) * PIX_PER_WI_X;
     int y = get_global_id(1) * PIX_PER_WI_Y;
 
     if (x < cols/2)
@@ -463,6 +467,7 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
         int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
         int y_rows = rows / 3 * 2;
         int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
+        __constant float* coeffs = c_RGB2YUVCoeffs_420;
 
         #pragma unroll
         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
@@ -477,12 +482,61 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
                 __global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);
                 __global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);
 
+#if PIX_PER_WI_X == 2
+                int s11 = *((__global const int*) src1);
+                int s12 = *((__global const int*) src1 + 1);
+                int s13 = *((__global const int*) src1 + 2);
+#if scn == 4
+                int s14 = *((__global const int*) src1 + 3);
+#endif
+                int s21 = *((__global const int*) src2);
+                int s22 = *((__global const int*) src2 + 1);
+                int s23 = *((__global const int*) src2 + 2);
+#if scn == 4
+                int s24 = *((__global const int*) src2 + 3);
+#endif
+                float src_pix1[scn * 4], src_pix2[scn * 4];
+
+                *((float4*) src_pix1)     = convert_float4(as_uchar4(s11));
+                *((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
+                *((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
+#if scn == 4
+                *((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
+#endif
+                *((float4*) src_pix2)     = convert_float4(as_uchar4(s21));
+                *((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
+                *((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
+#if scn == 4
+                *((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
+#endif
+                uchar4 y1, y2;
+                y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[      2-bidx], fma(coeffs[1], src_pix1[      1], fma(coeffs[2], src_pix1[      bidx], 16.5f))));
+                y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[  scn+2-bidx], fma(coeffs[1], src_pix1[  scn+1], fma(coeffs[2], src_pix1[  scn+bidx], 16.5f))));
+                y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
+                y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
+                y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[      2-bidx], fma(coeffs[1], src_pix2[      1], fma(coeffs[2], src_pix2[      bidx], 16.5f))));
+                y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[  scn+2-bidx], fma(coeffs[1], src_pix2[  scn+1], fma(coeffs[2], src_pix2[  scn+bidx], 16.5f))));
+                y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
+                y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
+
+                *((__global int*) ydst1) = as_int(y1);
+                *((__global int*) ydst2) = as_int(y2);
+
+                float uv[4] = { fma(coeffs[3], src_pix1[      2-bidx], fma(coeffs[4], src_pix1[      1], fma(coeffs[5], src_pix1[      bidx], 128.5f))),
+                                fma(coeffs[5], src_pix1[      2-bidx], fma(coeffs[6], src_pix1[      1], fma(coeffs[7], src_pix1[      bidx], 128.5f))),
+                                fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
+                                fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
+
+                udst[0] = convert_uchar_sat(uv[uidx]    );
+                vdst[0] = convert_uchar_sat(uv[1 - uidx]);
+                udst[1] = convert_uchar_sat(uv[2 + uidx]);
+                vdst[1] = convert_uchar_sat(uv[3 - uidx]);
+#else
                 float4 src_pix1 = convert_float4(vload4(0, src1));
                 float4 src_pix2 = convert_float4(vload4(0, src1+scn));
                 float4 src_pix3 = convert_float4(vload4(0, src2));
                 float4 src_pix4 = convert_float4(vload4(0, src2+scn));
 
-                __constant float* coeffs = c_RGB2YUVCoeffs_420;
                 ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
                 ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
                 ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));
@@ -493,7 +547,7 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
 
                 udst[0] = convert_uchar_sat(uv[uidx]  );
                 vdst[0] = convert_uchar_sat(uv[1-uidx]);
-
+#endif
                 ++y;
                 src_index += 2*src_step;
                 ydst_index += 2*dst_step;
@@ -522,7 +576,6 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of
                 float U = ((float) src[uidx]) - HALF_MAX;
                 float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX;
 
-
                 __constant float* coeffs = c_YUV2RGBCoeffs_420;
                 float ruv = fma(coeffs[4], V, 0.5f);
                 float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
@@ -535,7 +588,6 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of
 #if dcn == 4
                 dst[3]        = 255;
 #endif
-
                 float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
                 dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
                 dst[dcn + 1]        = convert_uchar_sat(y01 + guv);