1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Sen Liu, swjtuls1987@126.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other oclMaterials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
47 #define tx get_local_id(0)
48 #define ty get_local_id(1)
49 #define bx get_group_id(0)
50 #define bdx get_local_size(0)
53 #define MAX_KSIZE_HALF 100
59 __kernel void polynomialExpansion(__global float * dst,
60 __global __const float * src,
61 __global __const float * c_g,
62 __global __const float * c_xg,
63 __global __const float * c_xxg,
66 const int height, const int width,
67 int dstStep, int srcStep)
69 const int y = get_global_id(1);
70 const int x = bx * (bdx - 2*polyN) + tx - polyN;
72 dstStep /= sizeof(*dst);
73 srcStep /= sizeof(*src);
76 __local float *row = smem + tx;
78 if (y < height && y >= 0)
80 xWarped = min(max(x, 0), width - 1);
82 row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];
87 for (int k = 1; k <= polyN; ++k)
89 float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];
90 float t1 = src[mad24(min(y + k, height - 1), srcStep, xWarped)];
92 row[0] += c_g[k] * (t0 + t1);
93 row[bdx] += c_xg[k] * (t1 - t0);
94 row[2*bdx] += c_xxg[k] * (t0 + t1);
98 barrier(CLK_LOCAL_MEM_FENCE);
100 if (y < height && y >= 0 && tx >= polyN && tx + polyN < bdx && x < width)
102 float b1 = c_g[0] * row[0];
103 float b3 = c_g[0] * row[bdx];
104 float b5 = c_g[0] * row[2*bdx];
105 float b2 = 0, b4 = 0, b6 = 0;
108 for (int k = 1; k <= polyN; ++k)
110 b1 += (row[k] + row[-k]) * c_g[k];
111 b4 += (row[k] + row[-k]) * c_xxg[k];
112 b2 += (row[k] - row[-k]) * c_xg[k];
113 b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
114 b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
115 b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
118 dst[mad24(y, dstStep, xWarped)] = b3*ig.s0;
119 dst[mad24(height + y, dstStep, xWarped)] = b2*ig.s0;
120 dst[mad24(2*height + y, dstStep, xWarped)] = b1*ig.s1 + b5*ig.s2;
121 dst[mad24(3*height + y, dstStep, xWarped)] = b1*ig.s1 + b4*ig.s2;
122 dst[mad24(4*height + y, dstStep, xWarped)] = b6*ig.s3;
126 inline int idx_row_low(const int y, const int last_row)
128 return abs(y) % (last_row + 1);
131 inline int idx_row_high(const int y, const int last_row)
133 return abs(last_row - abs(last_row - y)) % (last_row + 1);
136 inline int idx_row(const int y, const int last_row)
138 return idx_row_low(idx_row_high(y, last_row), last_row);
141 inline int idx_col_low(const int x, const int last_col)
143 return abs(x) % (last_col + 1);
146 inline int idx_col_high(const int x, const int last_col)
148 return abs(last_col - abs(last_col - x)) % (last_col + 1);
151 inline int idx_col(const int x, const int last_col)
153 return idx_col_low(idx_col_high(x, last_col), last_col);
156 __kernel void gaussianBlur(__global float * dst,
157 __global const float * src,
158 __global const float * c_gKer,
159 __local float * smem,
160 const int height, const int width,
161 int dstStep, int srcStep,
164 const int y = get_global_id(1);
165 const int x = get_global_id(0);
167 dstStep /= sizeof(*dst);
168 srcStep /= sizeof(*src);
170 __local float *row = smem + ty * (bdx + 2*ksizeHalf);
175 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
177 int xExt = (int)(bx * bdx) + i - ksizeHalf;
178 xExt = idx_col(xExt, width - 1);
179 row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];
180 for (int j = 1; j <= ksizeHalf; ++j)
181 row[i] += (src[mad24(idx_row_low(y - j, height - 1), srcStep, xExt)]
182 + src[mad24(idx_row_high(y + j, height - 1), srcStep, xExt)]) * c_gKer[j];
186 barrier(CLK_LOCAL_MEM_FENCE);
188 if (y < height && y >= 0 && x < width && x >= 0)
191 row += tx + ksizeHalf;
192 float res = row[0] * c_gKer[0];
193 for (int i = 1; i <= ksizeHalf; ++i)
194 res += (row[-i] + row[i]) * c_gKer[i];
196 dst[mad24(y, dstStep, x)] = res;
200 __constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };
202 __kernel void updateMatrices(__global float * M,
203 __global const float * flowx, __global const float * flowy,
204 __global const float * R0, __global const float * R1,
205 const int height, const int width,
206 int mStep, int xStep, int yStep, int R0Step, int R1Step)
208 const int y = get_global_id(1);
209 const int x = get_global_id(0);
212 xStep /= sizeof(*flowx);
213 yStep /= sizeof(*flowy);
214 R0Step /= sizeof(*R0);
215 R1Step /= sizeof(*R1);
217 if (y < height && y >= 0 && x < width && x >= 0)
219 float dx = flowx[mad24(y, xStep, x)];
220 float dy = flowy[mad24(y, yStep, x)];
224 int x1 = convert_int(floor(fx));
225 int y1 = convert_int(floor(fy));
229 float r2, r3, r4, r5, r6;
231 if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
233 float a00 = (1.f - fx) * (1.f - fy);
234 float a01 = fx * (1.f - fy);
235 float a10 = (1.f - fx) * fy;
238 r2 = a00 * R1[mad24(y1, R1Step, x1)] +
239 a01 * R1[mad24(y1, R1Step, x1 + 1)] +
240 a10 * R1[mad24(y1 + 1, R1Step, x1)] +
241 a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];
243 r3 = a00 * R1[mad24(height + y1, R1Step, x1)] +
244 a01 * R1[mad24(height + y1, R1Step, x1 + 1)] +
245 a10 * R1[mad24(height + y1 + 1, R1Step, x1)] +
246 a11 * R1[mad24(height + y1 + 1, R1Step, x1 + 1)];
248 r4 = a00 * R1[mad24(2*height + y1, R1Step, x1)] +
249 a01 * R1[mad24(2*height + y1, R1Step, x1 + 1)] +
250 a10 * R1[mad24(2*height + y1 + 1, R1Step, x1)] +
251 a11 * R1[mad24(2*height + y1 + 1, R1Step, x1 + 1)];
253 r5 = a00 * R1[mad24(3*height + y1, R1Step, x1)] +
254 a01 * R1[mad24(3*height + y1, R1Step, x1 + 1)] +
255 a10 * R1[mad24(3*height + y1 + 1, R1Step, x1)] +
256 a11 * R1[mad24(3*height + y1 + 1, R1Step, x1 + 1)];
258 r6 = a00 * R1[mad24(4*height + y1, R1Step, x1)] +
259 a01 * R1[mad24(4*height + y1, R1Step, x1 + 1)] +
260 a10 * R1[mad24(4*height + y1 + 1, R1Step, x1)] +
261 a11 * R1[mad24(4*height + y1 + 1, R1Step, x1 + 1)];
263 r4 = (R0[mad24(2*height + y, R0Step, x)] + r4) * 0.5f;
264 r5 = (R0[mad24(3*height + y, R0Step, x)] + r5) * 0.5f;
265 r6 = (R0[mad24(4*height + y, R0Step, x)] + r6) * 0.25f;
270 r4 = R0[mad24(2*height + y, R0Step, x)];
271 r5 = R0[mad24(3*height + y, R0Step, x)];
272 r6 = R0[mad24(4*height + y, R0Step, x)] * 0.5f;
275 r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;
276 r3 = (R0[mad24(height + y, R0Step, x)] - r3) * 0.5f;
282 c_border[min(x, BORDER_SIZE)] *
283 c_border[min(y, BORDER_SIZE)] *
284 c_border[min(width - x - 1, BORDER_SIZE)] *
285 c_border[min(height - y - 1, BORDER_SIZE)];
293 M[mad24(y, mStep, x)] = r4*r4 + r6*r6;
294 M[mad24(height + y, mStep, x)] = (r4 + r5)*r6;
295 M[mad24(2*height + y, mStep, x)] = r5*r5 + r6*r6;
296 M[mad24(3*height + y, mStep, x)] = r4*r2 + r6*r3;
297 M[mad24(4*height + y, mStep, x)] = r6*r2 + r5*r3;
301 __kernel void boxFilter5(__global float * dst,
302 __global const float * src,
303 __local float * smem,
304 const int height, const int width,
305 int dstStep, int srcStep,
308 const int y = get_global_id(1);
309 const int x = get_global_id(0);
311 const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
312 const int smw = bdx + 2*ksizeHalf; // shared memory "width"
313 __local float *row = smem + 5 * ty * smw;
315 dstStep /= sizeof(*dst);
316 srcStep /= sizeof(*src);
321 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
323 int xExt = (int)(bx * bdx) + i - ksizeHalf;
324 xExt = min(max(xExt, 0), width - 1);
327 for (int k = 0; k < 5; ++k)
328 row[k*smw + i] = src[mad24(k*height + y, srcStep, xExt)];
330 for (int j = 1; j <= ksizeHalf; ++j)
332 for (int k = 0; k < 5; ++k)
334 src[mad24(k*height + max(y - j, 0), srcStep, xExt)] +
335 src[mad24(k*height + min(y + j, height - 1), srcStep, xExt)];
339 barrier(CLK_LOCAL_MEM_FENCE);
341 if (y < height && y >= 0 && x < width && x >= 0)
345 row += tx + ksizeHalf;
349 for (int k = 0; k < 5; ++k)
352 for (int i = 1; i <= ksizeHalf; ++i)
354 for (int k = 0; k < 5; ++k)
355 res[k] += row[k*smw - i] + row[k*smw + i];
358 for (int k = 0; k < 5; ++k)
359 dst[mad24(k*height + y, dstStep, x)] = res[k] * boxAreaInv;
363 __kernel void updateFlow(__global float4 * flowx, __global float4 * flowy,
364 __global const float4 * M,
365 const int height, const int width,
366 int xStep, int yStep, int mStep)
368 const int y = get_global_id(1);
369 const int x = get_global_id(0);
371 xStep /= sizeof(*flowx);
372 yStep /= sizeof(*flowy);
375 if (y < height && y >= 0 && x < width && x >= 0)
377 float4 g11 = M[mad24(y, mStep, x)];
378 float4 g12 = M[mad24(height + y, mStep, x)];
379 float4 g22 = M[mad24(2*height + y, mStep, x)];
380 float4 h1 = M[mad24(3*height + y, mStep, x)];
381 float4 h2 = M[mad24(4*height + y, mStep, x)];
383 float4 detInv = (float4)(1.f) / (g11*g22 - g12*g12 + (float4)(1e-3f));
385 flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;
386 flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;
390 __kernel void gaussianBlur5(__global float * dst,
391 __global const float * src,
392 __global const float * c_gKer,
393 __local float * smem,
394 const int height, const int width,
395 int dstStep, int srcStep,
398 const int y = get_global_id(1);
399 const int x = get_global_id(0);
401 const int smw = bdx + 2*ksizeHalf; // shared memory "width"
402 __local volatile float *row = smem + 5 * ty * smw;
404 dstStep /= sizeof(*dst);
405 srcStep /= sizeof(*src);
410 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
412 int xExt = (int)(bx * bdx) + i - ksizeHalf;
413 xExt = idx_col(xExt, width - 1);
416 for (int k = 0; k < 5; ++k)
417 row[k*smw + i] = src[mad24(k*height + y, srcStep, xExt)] * c_gKer[0];
419 for (int j = 1; j <= ksizeHalf; ++j)
421 for (int k = 0; k < 5; ++k)
423 (src[mad24(k*height + idx_row_low(y - j, height - 1), srcStep, xExt)] +
424 src[mad24(k*height + idx_row_high(y + j, height - 1), srcStep, xExt)]) * c_gKer[j];
428 barrier(CLK_LOCAL_MEM_FENCE);
430 if (y < height && y >= 0 && x < width && x >= 0)
434 row += tx + ksizeHalf;
438 for (int k = 0; k < 5; ++k)
439 res[k] = row[k*smw] * c_gKer[0];
441 for (int i = 1; i <= ksizeHalf; ++i)
443 for (int k = 0; k < 5; ++k)
444 res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
447 for (int k = 0; k < 5; ++k)
448 dst[mad24(k*height + y, dstStep, x)] = res[k];