modules/imgproc/src/opencl/warp_transform.cl

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 __constant short4 vec_offset = (short4)(0, 1, 2, 3);
   6
   7 #define GET_VAL(x, y) ((x) < 0 || (x) >= src_cols || (y) < 0 || (y) >= src_rows) ? scalar : src[src_offset + y * src_step + x]
   8
   9 __kernel void warpAffine_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
  10                                     __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
  11                                     __constant float * M, ST scalar_)
  12 {
  13     int x = get_global_id(0) * 4;
  14     int y = get_global_id(1);
  15     uchar scalar = convert_uchar_sat_rte(scalar_);
  16
  17     if (x >= dst_cols || y >= dst_rows) return;
  18
  19     /* { M0, M1, M2 }
  20      * { M3, M4, M5 }
  21      */
  22
  23     short4 new_x, new_y;
  24     new_x = convert_short4_sat_rte(M[0] * convert_float4(vec_offset + (short4)(x)) +
  25                                    M[1] * convert_float4((short4)y) + M[2]);
  26
  27     new_y = convert_short4_sat_rte(M[3] * convert_float4(vec_offset + (short4)(x)) +
  28                                    M[4] * convert_float4((short4)y) + M[5]);
  29
  30     uchar4 pix = (uchar4)scalar;
  31
  32     pix.s0 = GET_VAL(new_x.s0, new_y.s0);
  33     pix.s1 = GET_VAL(new_x.s1, new_y.s1);
  34     pix.s2 = GET_VAL(new_x.s2, new_y.s2);
  35     pix.s3 = GET_VAL(new_x.s3, new_y.s3);
  36
  37     int dst_index = x + y * dst_step + dst_offset;
  38
  39     vstore4(pix, 0,  dst + dst_index);
  40 }
  41
  42 uchar4 read_pixels(__global const uchar * src, short tx, short ty,
  43                    int src_offset, int src_step, int src_cols, int
  44                    src_rows, uchar scalar)
  45 {
  46     uchar2 pt, pb;
  47     short bx, by;
  48
  49     bx = tx + 1;
  50     by = ty + 1;
  51
  52     if (tx >= 0 && (tx + 1) < src_cols && ty >= 0 && ty < src_rows)
  53     {
  54         pt = vload2(0, src + src_offset + ty * src_step + tx);
  55     }
  56     else
  57     {
  58         pt.s0 = GET_VAL(tx, ty);
  59         pt.s1 = GET_VAL(bx, ty);
  60     }
  61
  62     if (tx >= 0 && (tx + 1) < src_cols && by >= 0 && by < src_rows)
  63     {
  64         pb = vload2(0, src + src_offset + by * src_step + tx);
  65     }
  66     else
  67     {
  68         pb.s0 = GET_VAL(tx, by);
  69         pb.s1 = GET_VAL(bx, by);
  70     }
  71
  72     return (uchar4)(pt, pb);
  73 }
  74
  75 __kernel void warpAffine_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
  76                                    __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
  77                                    __constant float * M, ST scalar_)
  78 {
  79     int x = get_global_id(0) * 4;
  80     int y = get_global_id(1);
  81     uchar scalar = convert_uchar_sat_rte(scalar_);
  82
  83     if (x >= dst_cols || y >= dst_rows) return;
  84
  85     /* { M0, M1, M2 }
  86      * { M3, M4, M5 }
  87      */
  88
  89     float4 nx, ny;
  90     nx = M[0] * convert_float4((vec_offset + (short4)x)) + M[1] * convert_float4((short4)y) + M[2];
  91     ny = M[3] * convert_float4((vec_offset + (short4)x)) + M[4] * convert_float4((short4)y) + M[5];
  92
  93     float4 s, t;
  94     s = round((nx - floor(nx)) * 32.0f) / 32.0f;
  95     t = round((ny - floor(ny)) * 32.0f) / 32.0f;
  96
  97     short4 tx, ty;
  98     tx = convert_short4_sat_rtn(nx);
  99     ty = convert_short4_sat_rtn(ny);
 100
 101     uchar4 pix[4];
 102     pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar);
 103     pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar);
 104     pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar);
 105     pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar);
 106
 107     float4 tl, tr, bl, br;
 108     tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0));
 109     tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1));
 110     bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2));
 111     br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3));
 112
 113     float4 pixel;
 114     pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t;
 115
 116     int dst_index = x + y * dst_step + dst_offset;
 117     vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index);
 118 }
 119
 120 __constant float coeffs[128] =
 121     { 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
 122     -0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
 123     -0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
 124     -0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
 125     -0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
 126     -0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
 127     -0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
 128     -0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
 129     -0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
 130     -0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
 131     -0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };
 132
 133 uchar4 read_pixels_cubic(__global const uchar * src, int tx, int ty,
 134                          int src_offset, int src_step, int src_cols, int src_rows, uchar scalar)
 135 {
 136     uchar4 pix;
 137
 138     if (tx >= 0 && (tx + 3) < src_cols && ty >= 0 && ty < src_rows)
 139     {
 140         pix = vload4(0, src + src_offset + ty * src_step + tx);
 141     }
 142     else
 143     {
 144         pix.s0 = GET_VAL((tx + 0), ty);
 145         pix.s1 = GET_VAL((tx + 1), ty);
 146         pix.s2 = GET_VAL((tx + 2), ty);
 147         pix.s3 = GET_VAL((tx + 3), ty);
 148     }
 149
 150     return pix;
 151 }
 152
 153 __kernel void warpAffine_cubic_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
 154                                   __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 155                                   __constant float * M, ST scalar_)
 156 {
 157     int x = get_global_id(0) * 4;
 158     int y = get_global_id(1);
 159     uchar scalar = convert_uchar_sat_rte(scalar_);
 160
 161     if (x >= dst_cols || y >= dst_rows) return;
 162
 163     /* { M0, M1, M2 }
 164      * { M3, M4, M5 }
 165      */
 166
 167     float4 nx, ny;
 168     nx = M[0] * convert_float4((vec_offset + (short4)x)) + M[1] * convert_float4((short4)y) + M[2];
 169     ny = M[3] * convert_float4((vec_offset + (short4)x)) + M[4] * convert_float4((short4)y) + M[5];
 170
 171     int4 ax, ay;
 172     ax = convert_int4_sat_rte((nx - floor(nx)) * 32.0f) & 31;
 173     ay = convert_int4_sat_rte((ny - floor(ny)) * 32.0f) & 31;
 174
 175     int4 tx, ty;
 176     int4 delta_x, delta_y;
 177
 178     delta_x = select((int4)1, (int4)0, ((nx - floor(nx))) * 64 > 63);
 179     delta_y = select((int4)1, (int4)0, ((ny - floor(ny))) * 64 > 63);
 180
 181     tx = convert_int4_sat_rtn(nx) - delta_x;
 182     ty = convert_int4_sat_rtn(ny) - delta_y;
 183
 184     __constant float * coeffs_x, * coeffs_y;
 185     float4 sum = (float4)0.0f;
 186     uchar4 pix;
 187     float xsum;
 188
 189     coeffs_x = coeffs + (ax.s0 << 2);
 190     coeffs_y = coeffs + (ay.s0 << 2);
 191     for (int i = 0; i < 4; i++)
 192     {
 193         pix = read_pixels_cubic(src, tx.s0, ty.s0 + i, src_offset, src_step, src_cols, src_rows, scalar);
 194         xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 195         sum.s0 = fma(xsum, coeffs_y[i], sum.s0);
 196     }
 197
 198     coeffs_x = coeffs + (ax.s1 << 2);
 199     coeffs_y = coeffs + (ay.s1 << 2);
 200     for (int i = 0; i < 4; i++)
 201     {
 202         pix = read_pixels_cubic(src, tx.s1, ty.s1 + i, src_offset, src_step, src_cols, src_rows, scalar);
 203         xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 204         sum.s1 = fma(xsum, coeffs_y[i], sum.s1);
 205     }
 206
 207     coeffs_x = coeffs + (ax.s2 << 2);
 208     coeffs_y = coeffs + (ay.s2 << 2);
 209     for (int i = 0; i < 4; i++)
 210     {
 211         pix = read_pixels_cubic(src, tx.s2, ty.s2 + i, src_offset, src_step, src_cols, src_rows, scalar);
 212         xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 213         sum.s2 = fma(xsum, coeffs_y[i], sum.s2);
 214     }
 215
 216     coeffs_x = coeffs + (ax.s3 << 2);
 217     coeffs_y = coeffs + (ay.s3 << 2);
 218     for (int i = 0; i < 4; i++)
 219     {
 220         pix = read_pixels_cubic(src, tx.s3, ty.s3 + i, src_offset, src_step, src_cols, src_rows, scalar);
 221         xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 222         sum.s3 = fma(xsum, coeffs_y[i], sum.s3);
 223     }
 224
 225     int dst_index = x + y * dst_step + dst_offset;
 226     vstore4(convert_uchar4_sat_rte(sum), 0, dst + dst_index);
 227 }
 228
 229 __kernel void warpPerspective_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
 230                                          __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 231                                          __constant float * M, ST scalar_)
 232 {
 233     int x = get_global_id(0) * 4;
 234     int y = get_global_id(1);
 235     uchar scalar = convert_uchar_sat_rte(scalar_);
 236
 237     if (x >= dst_cols || y >= dst_rows) return;
 238
 239     /* { M0, M1, M2 }
 240      * { M3, M4, M5 }
 241      * { M6, M7, M8 }
 242      */
 243
 244     float4 nx, ny, nz;
 245     nx = M[0] * convert_float4(vec_offset + (short4)(x)) +
 246          M[1] * convert_float4((short4)y) + M[2];
 247
 248     ny = M[3] * convert_float4(vec_offset + (short4)(x)) +
 249          M[4] * convert_float4((short4)y) + M[5];
 250
 251     nz = M[6] * convert_float4(vec_offset + (short4)(x)) +
 252          M[7] * convert_float4((short4)y) + M[8];
 253
 254     short4 new_x, new_y;
 255     float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);
 256     new_x = convert_short4_sat_rte(nx * fz);
 257     new_y = convert_short4_sat_rte(ny * fz);
 258
 259     uchar4 pix = (uchar4)scalar;
 260
 261     pix.s0 = GET_VAL(new_x.s0, new_y.s0);
 262     pix.s1 = GET_VAL(new_x.s1, new_y.s1);
 263     pix.s2 = GET_VAL(new_x.s2, new_y.s2);
 264     pix.s3 = GET_VAL(new_x.s3, new_y.s3);
 265
 266     int dst_index = x + y * dst_step + dst_offset;
 267
 268     vstore4(pix, 0,  dst + dst_index);
 269 }
 270
 271 __kernel void warpPerspective_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
 272                                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 273                                         __constant float * M, ST scalar_)
 274 {
 275     int x = get_global_id(0) * 4;
 276     int y = get_global_id(1);
 277     uchar scalar = convert_uchar_sat_rte(scalar_);
 278
 279     if (x >= dst_cols || y >= dst_rows) return;
 280
 281     /* { M0, M1, M2 }
 282      * { M3, M4, M5 }
 283      * { M6, M7, M8 }
 284      */
 285
 286     float4 nx, ny, nz;
 287     nx = M[0] * convert_float4(vec_offset + (short4)(x)) + M[1] * convert_float4((short4)y) + M[2];
 288
 289     ny = M[3] * convert_float4(vec_offset + (short4)(x)) + M[4] * convert_float4((short4)y) + M[5];
 290
 291     nz = M[6] * convert_float4(vec_offset + (short4)(x)) + M[7] * convert_float4((short4)y) + M[8];
 292
 293     float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);
 294
 295     nx = nx * fz;
 296     ny = ny * fz;
 297
 298     float4 s, t;
 299     s = round((nx - floor(nx)) * 32.0f) / (float4)32.0f;
 300     t = round((ny - floor(ny)) * 32.0f) / (float4)32.0f;
 301
 302     short4 tx, ty;
 303     tx = convert_short4_sat_rtn(nx);
 304     ty = convert_short4_sat_rtn(ny);
 305
 306     uchar4 pix[4];
 307     pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar);
 308     pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar);
 309     pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar);
 310     pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar);
 311
 312     float4 tl, tr, bl, br;
 313     tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0));
 314     tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1));
 315     bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2));
 316     br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3));
 317
 318     float4 pixel;
 319     pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t;
 320
 321     int dst_index = x + y * dst_step + dst_offset;
 322     vstore4(convert_uchar4_sat_rte(pixel), 0,  dst + dst_index);
 323 }
 324
 325 __kernel void warpPerspective_cubic_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
 326                                        __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 327                                        __constant float * M, ST scalar_)
 328 {
 329     int x = get_global_id(0) * 4;
 330     int y = get_global_id(1);
 331     uchar scalar = convert_uchar_sat_rte(scalar_);
 332
 333     if (x >= dst_cols || y >= dst_rows) return;
 334
 335     /* { M0, M1, M2 }
 336      * { M3, M4, M5 }
 337      * { M6, M7, M8 }
 338      */
 339
 340     float4 nx, ny, nz;
 341     nx = M[0] * convert_float4(vec_offset + (short4)(x)) + M[1] * convert_float4((short4)y) + M[2];
 342
 343     ny = M[3] * convert_float4(vec_offset + (short4)(x)) + M[4] * convert_float4((short4)y) + M[5];
 344
 345     nz = M[6] * convert_float4(vec_offset + (short4)(x)) + M[7] * convert_float4((short4)y) + M[8];
 346
 347     float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);
 348
 349     nx = nx * fz;
 350     ny = ny * fz;
 351
 352     int4 ax, ay;
 353     ax = convert_int4_sat_rte((nx - floor(nx)) * 32.0f) & 31;
 354     ay = convert_int4_sat_rte((ny - floor(ny)) * 32.0f) & 31;
 355
 356     int4 tx, ty;
 357     int4 delta_x, delta_y;
 358
 359     delta_x = select((int4)1, (int4)0, ((nx - floor(nx))) * 64 > 63);
 360     delta_y = select((int4)1, (int4)0, ((ny - floor(ny))) * 64 > 63);
 361
 362     tx = convert_int4_sat_rtn(nx) - delta_x;
 363     ty = convert_int4_sat_rtn(ny) - delta_y;
 364
 365     __constant float * coeffs_x, * coeffs_y;
 366     float4 sum = (float4)0.0f;
 367     uchar4 pix;
 368     float xsum;
 369
 370     coeffs_x = coeffs + (ax.s0 << 2);
 371     coeffs_y = coeffs + (ay.s0 << 2);
 372     for (int i = 0; i < 4; i++)
 373     {
 374         pix = read_pixels_cubic(src, tx.s0, ty.s0 + i, src_offset, src_step, src_cols, src_rows, scalar);
 375         xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 376         sum.s0 = fma(xsum, coeffs_y[i], sum.s0);
 377     }
 378
 379     coeffs_x = coeffs + (ax.s1 << 2);
 380     coeffs_y = coeffs + (ay.s1 << 2);
 381     for (int i = 0; i < 4; i++)
 382     {
 383         pix = read_pixels_cubic(src, tx.s1, ty.s1 + i, src_offset, src_step, src_cols, src_rows, scalar);
 384         xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 385         sum.s1 = fma(xsum, coeffs_y[i], sum.s1);
 386     }
 387
 388     coeffs_x = coeffs + (ax.s2 << 2);
 389     coeffs_y = coeffs + (ay.s2 << 2);
 390     for (int i = 0; i < 4; i++)
 391     {
 392         pix = read_pixels_cubic(src, tx.s2, ty.s2 + i, src_offset, src_step, src_cols, src_rows, scalar);
 393         xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 394         sum.s2 = fma(xsum, coeffs_y[i], sum.s2);
 395     }
 396
 397     coeffs_x = coeffs + (ax.s3 << 2);
 398     coeffs_y = coeffs + (ay.s3 << 2);
 399     for (int i = 0; i < 4; i++)
 400     {
 401         pix = read_pixels_cubic(src, tx.s3, ty.s3 + i, src_offset, src_step, src_cols, src_rows, scalar);
 402         xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 403         sum.s3 = fma(xsum, coeffs_y[i], sum.s3);
 404     }
 405
 406     int dst_index = x + y * dst_step + dst_offset;
 407     vstore4(convert_uchar4_sat_rte(sum), 0, dst + dst_index);
 408 }