1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
/* Per-lane column offsets. The kernels in this file set x = get_global_id(0) * 4
 * and add this vector to x, so the four lanes address columns x, x+1, x+2, x+3. */
__constant short4 vec_offset = (short4)(0, 1, 2, 3);
/* Border-aware read of one source pixel at column x, row y.
 * Yields `scalar` when (x, y) lies outside [0, src_cols) x [0, src_rows),
 * otherwise src[src_offset + y * src_step + x].
 * The macro is not self-contained: `src`, `src_offset`, `src_step`, `src_cols`,
 * `src_rows` and `scalar` must be in scope at the point of use.
 * Arguments may be evaluated more than once, so pass side-effect-free expressions.
 * Every argument and the whole expansion are parenthesized so that compound
 * arguments (e.g. `row + 1`) and surrounding operators bind as expected. */
#define GET_VAL(x, y) \
    (((x) < 0 || (x) >= src_cols || (y) < 0 || (y) >= src_rows) ? scalar : src[src_offset + (y) * src_step + (x)])
/* Nearest-neighbour affine warp of a single-channel 8-bit image.
 *
 * Each work-item handles destination row y = get_global_id(1) and the four
 * columns x .. x+3 with x = get_global_id(0) * 4. For every destination pixel
 * the source position is
 *     sx = M[0] * x + M[1] * y + M[2]
 *     sy = M[3] * x + M[4] * y + M[5]
 * rounded to the nearest integer (ties to even, saturated to short). Positions
 * outside the source image produce the border value.
 *
 * src/dst steps and offsets are used directly as uchar element indices.
 * scalar_ is the border value; its type ST is not defined in this file
 * (presumably supplied at build time) and is converted to uchar with saturation.
 *
 * NOTE(review): vstore4 always writes four pixels once x < dst_cols, so this
 * appears to rely on the host guaranteeing dst_cols is a multiple of 4 (or a
 * padded row) - confirm against the caller.
 * NOTE(review): x and y are narrowed to short before conversion to float, so
 * destination sizes above 32767 are presumably excluded by the host - confirm. */
__kernel void warpAffine_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                                    __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                                    __constant float * M, ST scalar_)
{
    int x = get_global_id(0) * 4;
    int y = get_global_id(1);
    uchar scalar = convert_uchar_sat_rte(scalar_);

    if (x >= dst_cols || y >= dst_rows) return;

    /* { M0, M1, M2 }
     * { M3, M4, M5 }
     */
    const float4 fx = convert_float4(vec_offset + (short4)(x));
    const float4 fy = convert_float4((short4)y);

    short4 new_x = convert_short4_sat_rte(M[0] * fx + M[1] * fy + M[2]);
    short4 new_y = convert_short4_sat_rte(M[3] * fx + M[4] * fy + M[5]);

    uchar4 pix = (uchar4)scalar;

    pix.s0 = GET_VAL(new_x.s0, new_y.s0);
    pix.s1 = GET_VAL(new_x.s1, new_y.s1);
    pix.s2 = GET_VAL(new_x.s2, new_y.s2);
    pix.s3 = GET_VAL(new_x.s3, new_y.s3);

    int dst_index = x + y * dst_step + dst_offset;

    vstore4(pix, 0, dst + dst_index);
}
/* Fetch the 2x2 source neighbourhood whose top-left pixel is (tx, ty).
 *
 * Returns (top-left, top-right, bottom-left, bottom-right). A row is read with
 * a single vload2 when both of its pixels are inside the image; otherwise each
 * pixel is fetched through GET_VAL, which substitutes `scalar` for positions
 * outside [0, src_cols) x [0, src_rows). */
uchar4 read_pixels(__global const uchar * src, short tx, short ty,
                   int src_offset, int src_step, int src_cols, int src_rows, uchar scalar)
{
    uchar2 pt, pb;

    /* Right column / bottom row, kept in int so tx + 1 and ty + 1 cannot wrap. */
    const int bx = tx + 1;
    const int by = ty + 1;

    if (tx >= 0 && (tx + 1) < src_cols && ty >= 0 && ty < src_rows)
    {
        pt = vload2(0, src + src_offset + ty * src_step + tx);
    }
    else
    {
        pt.s0 = GET_VAL(tx, ty);
        pt.s1 = GET_VAL(bx, ty);
    }

    if (tx >= 0 && (tx + 1) < src_cols && by >= 0 && by < src_rows)
    {
        pb = vload2(0, src + src_offset + by * src_step + tx);
    }
    else
    {
        pb.s0 = GET_VAL(tx, by);
        pb.s1 = GET_VAL(bx, by);
    }

    return (uchar4)(pt, pb);
}
/* Bilinear affine warp of a single-channel 8-bit image.
 *
 * Each work-item handles destination row y = get_global_id(1) and the four
 * columns x .. x+3 with x = get_global_id(0) * 4. The source position is
 *     sx = M[0] * x + M[1] * y + M[2]
 *     sy = M[3] * x + M[4] * y + M[5]
 * Its floor selects the top-left pixel of a 2x2 neighbourhood and its
 * fractional part, quantized to steps of 1/32, gives the blend weights.
 * Neighbours outside the source image take the border value (see read_pixels).
 *
 * scalar_ is the border value; its type ST is not defined in this file
 * (presumably supplied at build time).
 *
 * NOTE(review): vstore4 always writes four pixels once x < dst_cols, so this
 * appears to rely on dst_cols being a multiple of 4 - confirm against the host. */
__kernel void warpAffine_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                                   __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                                   __constant float * M, ST scalar_)
{
    int x = get_global_id(0) * 4;
    int y = get_global_id(1);
    uchar scalar = convert_uchar_sat_rte(scalar_);

    if (x >= dst_cols || y >= dst_rows) return;

    /* { M0, M1, M2 }
     * { M3, M4, M5 }
     */
    const float4 fx = convert_float4((vec_offset + (short4)x));
    const float4 fy = convert_float4((short4)y);

    float4 nx = M[0] * fx + M[1] * fy + M[2];
    float4 ny = M[3] * fx + M[4] * fy + M[5];

    /* Fractional offsets inside the 2x2 cell, snapped to multiples of 1/32. */
    float4 s = round((nx - floor(nx)) * 32.0f) / 32.0f;
    float4 t = round((ny - floor(ny)) * 32.0f) / 32.0f;

    /* Round toward -inf: top-left corner of the cell. */
    short4 tx = convert_short4_sat_rtn(nx);
    short4 ty = convert_short4_sat_rtn(ny);

    uchar4 pix[4];
    pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar);
    pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar);
    pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar);
    pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar);

    /* Transpose: gather the same corner of all four lanes into one vector. */
    float4 tl, tr, bl, br;
    tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0));
    tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1));
    bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2));
    br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3));

    float4 pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t;

    int dst_index = x + y * dst_step + dst_offset;
    vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index);
}
/* Interpolation weight table for the cubic kernels: 32 rows of 4 taps.
 * The cubic kernels index it as coeffs + (a << 2) with a in [0, 31], where a is
 * the fractional source offset quantized to 1/32; row 0 is {0, 1, 0, 0}, i.e. a
 * zero fraction selects the second tap unchanged.
 * NOTE(review): the values appear to be the cubic convolution kernel with
 * a = -0.75 sampled at k/32 (row 16 is {-3/32, 19/32, 19/32, -3/32}) - confirm. */
__constant float coeffs[128] =
    { 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
    -0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
    -0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
    -0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
    -0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
    -0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
    -0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
    -0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
    -0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
    -0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
    -0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };
/* Fetch four horizontally adjacent source pixels (tx .. tx+3) from row ty.
 *
 * Uses a single vload4 when the whole span lies inside the image; otherwise
 * each pixel is fetched through GET_VAL, which substitutes `scalar` for
 * positions outside [0, src_cols) x [0, src_rows). */
uchar4 read_pixels_cubic(__global const uchar * src, int tx, int ty,
                         int src_offset, int src_step, int src_cols, int src_rows, uchar scalar)
{
    uchar4 pix;

    if (tx >= 0 && (tx + 3) < src_cols && ty >= 0 && ty < src_rows)
    {
        pix = vload4(0, src + src_offset + ty * src_step + tx);
    }
    else
    {
        pix.s0 = GET_VAL((tx + 0), ty);
        pix.s1 = GET_VAL((tx + 1), ty);
        pix.s2 = GET_VAL((tx + 2), ty);
        pix.s3 = GET_VAL((tx + 3), ty);
    }

    return pix;
}
/* Separable 4x4 cubic filter for one destination pixel of the affine kernel.
 * (tx, ty) is the top-left source pixel of the window; ax / ay select the
 * weight rows in `coeffs` for the horizontal and vertical direction. */
float warpAffine_cubic_lane_8u(__global const uchar * src, int tx, int ty, int ax, int ay,
                               int src_offset, int src_step, int src_cols, int src_rows, uchar scalar)
{
    __constant float * coeffs_x = coeffs + (ax << 2);
    __constant float * coeffs_y = coeffs + (ay << 2);
    const float4 wx = (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]);

    float acc = 0.0f;
    for (int i = 0; i < 4; i++)
    {
        uchar4 pix = read_pixels_cubic(src, tx, ty + i, src_offset, src_step, src_cols, src_rows, scalar);
        float xsum = dot(convert_float4(pix), wx);
        acc = fma(xsum, coeffs_y[i], acc);
    }
    return acc;
}

/* Bicubic affine warp of a single-channel 8-bit image.
 *
 * Each work-item handles destination row y = get_global_id(1) and the four
 * columns x .. x+3 with x = get_global_id(0) * 4. The source position is
 *     sx = M[0] * x + M[1] * y + M[2]
 *     sy = M[3] * x + M[4] * y + M[5]
 * The fractional part, quantized to 1/32, selects a row of `coeffs`; a 4x4
 * source window is then filtered separably. Pixels outside the source image
 * take the border value (see read_pixels_cubic).
 *
 * scalar_ is the border value; its type ST is not defined in this file
 * (presumably supplied at build time).
 *
 * NOTE(review): vstore4 always writes four pixels once x < dst_cols, so this
 * appears to rely on dst_cols being a multiple of 4 - confirm against the host. */
__kernel void warpAffine_cubic_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                                  __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                                  __constant float * M, ST scalar_)
{
    int x = get_global_id(0) * 4;
    int y = get_global_id(1);
    uchar scalar = convert_uchar_sat_rte(scalar_);

    if (x >= dst_cols || y >= dst_rows) return;

    /* { M0, M1, M2 }
     * { M3, M4, M5 }
     */
    const float4 fx = convert_float4((vec_offset + (short4)x));
    const float4 fy = convert_float4((short4)y);

    float4 nx = M[0] * fx + M[1] * fy + M[2];
    float4 ny = M[3] * fx + M[4] * fy + M[5];

    const float4 frac_x = nx - floor(nx);
    const float4 frac_y = ny - floor(ny);

    /* Weight-table row: fraction in 1/32 steps; a fraction that rounds up to 32
     * wraps to row 0. */
    int4 ax = convert_int4_sat_rte(frac_x * 32.0f) & 31;
    int4 ay = convert_int4_sat_rte(frac_y * 32.0f) & 31;

    /* The window normally starts one pixel before floor(n). When the fraction
     * is large enough to wrap to row 0 (fraction * 64 > 63) the window is not
     * shifted back, which is equivalent to moving on to the next integer pixel. */
    int4 delta_x = select((int4)1, (int4)0, frac_x * 64.0f > 63.0f);
    int4 delta_y = select((int4)1, (int4)0, frac_y * 64.0f > 63.0f);

    int4 tx = convert_int4_sat_rtn(nx) - delta_x;
    int4 ty = convert_int4_sat_rtn(ny) - delta_y;

    float4 sum;
    sum.s0 = warpAffine_cubic_lane_8u(src, tx.s0, ty.s0, ax.s0, ay.s0, src_offset, src_step, src_cols, src_rows, scalar);
    sum.s1 = warpAffine_cubic_lane_8u(src, tx.s1, ty.s1, ax.s1, ay.s1, src_offset, src_step, src_cols, src_rows, scalar);
    sum.s2 = warpAffine_cubic_lane_8u(src, tx.s2, ty.s2, ax.s2, ay.s2, src_offset, src_step, src_cols, src_rows, scalar);
    sum.s3 = warpAffine_cubic_lane_8u(src, tx.s3, ty.s3, ax.s3, ay.s3, src_offset, src_step, src_cols, src_rows, scalar);

    int dst_index = x + y * dst_step + dst_offset;
    vstore4(convert_uchar4_sat_rte(sum), 0, dst + dst_index);
}
/* Nearest-neighbour perspective warp of a single-channel 8-bit image.
 *
 * Each work-item handles destination row y = get_global_id(1) and the four
 * columns x .. x+3 with x = get_global_id(0) * 4. Homogeneous source
 * coordinates are
 *     nx = M[0] * x + M[1] * y + M[2]
 *     ny = M[3] * x + M[4] * y + M[5]
 *     nz = M[6] * x + M[7] * y + M[8]
 * and the source pixel is (nx / nz, ny / nz) rounded to nearest (ties to even,
 * saturated to short). Where nz == 0 the reciprocal is replaced by 0.
 * Positions outside the source image produce the border value.
 *
 * scalar_ is the border value; its type ST is not defined in this file
 * (presumably supplied at build time).
 *
 * NOTE(review): vstore4 always writes four pixels once x < dst_cols, so this
 * appears to rely on dst_cols being a multiple of 4 - confirm against the host. */
__kernel void warpPerspective_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                                         __constant float * M, ST scalar_)
{
    int x = get_global_id(0) * 4;
    int y = get_global_id(1);
    uchar scalar = convert_uchar_sat_rte(scalar_);

    if (x >= dst_cols || y >= dst_rows) return;

    /* { M0, M1, M2 }
     * { M3, M4, M5 }
     * { M6, M7, M8 }
     */
    const float4 fx = convert_float4(vec_offset + (short4)(x));
    const float4 fy = convert_float4((short4)y);

    float4 nx = M[0] * fx + M[1] * fy + M[2];
    float4 ny = M[3] * fx + M[4] * fy + M[5];
    float4 nz = M[6] * fx + M[7] * fy + M[8];

    /* Reciprocal of the homogeneous divisor, forced to 0 where nz == 0. */
    float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);
    short4 new_x = convert_short4_sat_rte(nx * fz);
    short4 new_y = convert_short4_sat_rte(ny * fz);

    uchar4 pix = (uchar4)scalar;

    pix.s0 = GET_VAL(new_x.s0, new_y.s0);
    pix.s1 = GET_VAL(new_x.s1, new_y.s1);
    pix.s2 = GET_VAL(new_x.s2, new_y.s2);
    pix.s3 = GET_VAL(new_x.s3, new_y.s3);

    int dst_index = x + y * dst_step + dst_offset;

    vstore4(pix, 0, dst + dst_index);
}
/* Bilinear perspective warp of a single-channel 8-bit image.
 *
 * Each work-item handles destination row y = get_global_id(1) and the four
 * columns x .. x+3 with x = get_global_id(0) * 4. Homogeneous source
 * coordinates are
 *     nx = M[0] * x + M[1] * y + M[2]
 *     ny = M[3] * x + M[4] * y + M[5]
 *     nz = M[6] * x + M[7] * y + M[8]
 * and the sampling position is (nx / nz, ny / nz); where nz == 0 the
 * reciprocal is replaced by 0. The floor of the position selects the top-left
 * pixel of a 2x2 neighbourhood and the fractional part, quantized to 1/32,
 * gives the blend weights. Neighbours outside the source image take the
 * border value (see read_pixels).
 *
 * scalar_ is the border value; its type ST is not defined in this file
 * (presumably supplied at build time).
 *
 * NOTE(review): vstore4 always writes four pixels once x < dst_cols, so this
 * appears to rely on dst_cols being a multiple of 4 - confirm against the host. */
__kernel void warpPerspective_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                                        __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                                        __constant float * M, ST scalar_)
{
    int x = get_global_id(0) * 4;
    int y = get_global_id(1);
    uchar scalar = convert_uchar_sat_rte(scalar_);

    if (x >= dst_cols || y >= dst_rows) return;

    /* { M0, M1, M2 }
     * { M3, M4, M5 }
     * { M6, M7, M8 }
     */
    const float4 fx = convert_float4(vec_offset + (short4)(x));
    const float4 fy = convert_float4((short4)y);

    float4 nx = M[0] * fx + M[1] * fy + M[2];
    float4 ny = M[3] * fx + M[4] * fy + M[5];
    float4 nz = M[6] * fx + M[7] * fy + M[8];

    /* Reciprocal of the homogeneous divisor, forced to 0 where nz == 0. */
    float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);

    /* Perspective divide: without it fz is unused and the warp is affine only. */
    nx = nx * fz;
    ny = ny * fz;

    /* Fractional offsets inside the 2x2 cell, snapped to multiples of 1/32. */
    float4 s = round((nx - floor(nx)) * 32.0f) / (float4)32.0f;
    float4 t = round((ny - floor(ny)) * 32.0f) / (float4)32.0f;

    /* Round toward -inf: top-left corner of the cell. */
    short4 tx = convert_short4_sat_rtn(nx);
    short4 ty = convert_short4_sat_rtn(ny);

    uchar4 pix[4];
    pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar);
    pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar);
    pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar);
    pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar);

    /* Transpose: gather the same corner of all four lanes into one vector. */
    float4 tl, tr, bl, br;
    tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0));
    tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1));
    bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2));
    br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3));

    float4 pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t;

    int dst_index = x + y * dst_step + dst_offset;
    vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index);
}
/* Separable 4x4 cubic filter for one destination pixel of the perspective kernel.
 * (tx, ty) is the top-left source pixel of the window; ax / ay select the
 * weight rows in `coeffs` for the horizontal and vertical direction. */
float warpPerspective_cubic_lane_8u(__global const uchar * src, int tx, int ty, int ax, int ay,
                                    int src_offset, int src_step, int src_cols, int src_rows, uchar scalar)
{
    __constant float * coeffs_x = coeffs + (ax << 2);
    __constant float * coeffs_y = coeffs + (ay << 2);
    const float4 wx = (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]);

    float acc = 0.0f;
    for (int i = 0; i < 4; i++)
    {
        uchar4 pix = read_pixels_cubic(src, tx, ty + i, src_offset, src_step, src_cols, src_rows, scalar);
        float xsum = dot(convert_float4(pix), wx);
        acc = fma(xsum, coeffs_y[i], acc);
    }
    return acc;
}

/* Bicubic perspective warp of a single-channel 8-bit image.
 *
 * Each work-item handles destination row y = get_global_id(1) and the four
 * columns x .. x+3 with x = get_global_id(0) * 4. Homogeneous source
 * coordinates are
 *     nx = M[0] * x + M[1] * y + M[2]
 *     ny = M[3] * x + M[4] * y + M[5]
 *     nz = M[6] * x + M[7] * y + M[8]
 * and the sampling position is (nx / nz, ny / nz); where nz == 0 the
 * reciprocal is replaced by 0. The fractional part, quantized to 1/32,
 * selects a row of `coeffs`; a 4x4 source window is then filtered separably.
 * Pixels outside the source image take the border value (see read_pixels_cubic).
 *
 * scalar_ is the border value; its type ST is not defined in this file
 * (presumably supplied at build time).
 *
 * NOTE(review): vstore4 always writes four pixels once x < dst_cols, so this
 * appears to rely on dst_cols being a multiple of 4 - confirm against the host. */
__kernel void warpPerspective_cubic_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                                       __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                                       __constant float * M, ST scalar_)
{
    int x = get_global_id(0) * 4;
    int y = get_global_id(1);
    uchar scalar = convert_uchar_sat_rte(scalar_);

    if (x >= dst_cols || y >= dst_rows) return;

    /* { M0, M1, M2 }
     * { M3, M4, M5 }
     * { M6, M7, M8 }
     */
    const float4 fx = convert_float4(vec_offset + (short4)(x));
    const float4 fy = convert_float4((short4)y);

    float4 nx = M[0] * fx + M[1] * fy + M[2];
    float4 ny = M[3] * fx + M[4] * fy + M[5];
    float4 nz = M[6] * fx + M[7] * fy + M[8];

    /* Reciprocal of the homogeneous divisor, forced to 0 where nz == 0. */
    float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);

    /* Perspective divide: without it fz is unused and the warp is affine only. */
    nx = nx * fz;
    ny = ny * fz;

    const float4 frac_x = nx - floor(nx);
    const float4 frac_y = ny - floor(ny);

    /* Weight-table row: fraction in 1/32 steps; a fraction that rounds up to 32
     * wraps to row 0. */
    int4 ax = convert_int4_sat_rte(frac_x * 32.0f) & 31;
    int4 ay = convert_int4_sat_rte(frac_y * 32.0f) & 31;

    /* The window normally starts one pixel before floor(n). When the fraction
     * is large enough to wrap to row 0 (fraction * 64 > 63) the window is not
     * shifted back, which is equivalent to moving on to the next integer pixel. */
    int4 delta_x = select((int4)1, (int4)0, frac_x * 64.0f > 63.0f);
    int4 delta_y = select((int4)1, (int4)0, frac_y * 64.0f > 63.0f);

    int4 tx = convert_int4_sat_rtn(nx) - delta_x;
    int4 ty = convert_int4_sat_rtn(ny) - delta_y;

    float4 sum;
    sum.s0 = warpPerspective_cubic_lane_8u(src, tx.s0, ty.s0, ax.s0, ay.s0, src_offset, src_step, src_cols, src_rows, scalar);
    sum.s1 = warpPerspective_cubic_lane_8u(src, tx.s1, ty.s1, ax.s1, ay.s1, src_offset, src_step, src_cols, src_rows, scalar);
    sum.s2 = warpPerspective_cubic_lane_8u(src, tx.s2, ty.s2, ax.s2, ay.s2, src_offset, src_step, src_cols, src_rows, scalar);
    sum.s3 = warpPerspective_cubic_lane_8u(src, tx.s3, ty.s3, ax.s3, ay.s3, src_offset, src_step, src_cols, src_rows, scalar);

    int dst_index = x + y * dst_step + dst_offset;
    vstore4(convert_uchar4_sat_rte(sum), 0, dst + dst_index);
}