1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Dachuan Zhao, dachuan@multicorewareinc.com
19 // Yao Wang, bitwangyaoyao@gmail.com
21 // Redistribution and use in source and binary forms, with or without modification,
22 // are permitted provided that the following conditions are met:
24 // * Redistribution's of source code must retain the above copyright notice,
25 // this list of conditions and the following disclaimer.
27 // * Redistribution's in binary form must reproduce the above copyright notice,
28 // this list of conditions and the following disclaimer in the documentation
29 // and/or other oclMaterials provided with the distribution.
31 // * The name of the copyright holders may not be used to endorse or promote products
32 // derived from this software without specific prior written permission.
34 // This software is provided by the copyright holders and contributors as is and
35 // any express or implied warranties, including, but not limited to, the implied
36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
37 // In no event shall the Intel Corporation or contributors be liable for any direct,
38 // indirect, incidental, special, exemplary, or consequential damages
39 // (including, but not limited to, procurement of substitute goods or services;
40 // loss of use, data, or profits; or business interruption) however caused
41 // and on any theory of liability, whether in contract, strict liability,
42 // or tort (including negligence or otherwise) arising in any way out of
43 // the use of this software, even if advised of the possibility of such damage.
47 //#pragma OPENCL EXTENSION cl_amd_printf : enable
54 void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
59 barrier(CLK_LOCAL_MEM_FENCE);
63 smem1[tid] += smem1[tid + 32];
64 smem2[tid] += smem2[tid + 32];
65 smem3[tid] += smem3[tid + 32];
67 barrier(CLK_LOCAL_MEM_FENCE);
71 smem1[tid] += smem1[tid + 16];
72 smem2[tid] += smem2[tid + 16];
73 smem3[tid] += smem3[tid + 16];
75 barrier(CLK_LOCAL_MEM_FENCE);
79 smem1[tid] += smem1[tid + 8];
80 smem2[tid] += smem2[tid + 8];
81 smem3[tid] += smem3[tid + 8];
83 barrier(CLK_LOCAL_MEM_FENCE);
87 smem1[tid] += smem1[tid + 4];
88 smem2[tid] += smem2[tid + 4];
89 smem3[tid] += smem3[tid + 4];
91 barrier(CLK_LOCAL_MEM_FENCE);
95 smem1[tid] += smem1[tid + 2];
96 smem2[tid] += smem2[tid + 2];
97 smem3[tid] += smem3[tid + 2];
99 barrier(CLK_LOCAL_MEM_FENCE);
103 smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
104 smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
105 smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
107 barrier(CLK_LOCAL_MEM_FENCE);
110 void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
114 barrier(CLK_LOCAL_MEM_FENCE);
118 smem1[tid] += smem1[tid + 32];
119 smem2[tid] += smem2[tid + 32];
121 barrier(CLK_LOCAL_MEM_FENCE);
125 smem1[tid] += smem1[tid + 16];
126 smem2[tid] += smem2[tid + 16];
128 barrier(CLK_LOCAL_MEM_FENCE);
132 smem1[tid] += smem1[tid + 8];
133 smem2[tid] += smem2[tid + 8];
135 barrier(CLK_LOCAL_MEM_FENCE);
139 smem1[tid] += smem1[tid + 4];
140 smem2[tid] += smem2[tid + 4];
142 barrier(CLK_LOCAL_MEM_FENCE);
146 smem1[tid] += smem1[tid + 2];
147 smem2[tid] += smem2[tid + 2];
149 barrier(CLK_LOCAL_MEM_FENCE);
153 smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
154 smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
156 barrier(CLK_LOCAL_MEM_FENCE);
159 void reduce1(float val1, volatile __local float* smem1, int tid)
162 barrier(CLK_LOCAL_MEM_FENCE);
166 smem1[tid] += smem1[tid + 32];
168 barrier(CLK_LOCAL_MEM_FENCE);
172 smem1[tid] += smem1[tid + 16];
174 barrier(CLK_LOCAL_MEM_FENCE);
178 smem1[tid] += smem1[tid + 8];
180 barrier(CLK_LOCAL_MEM_FENCE);
184 smem1[tid] += smem1[tid + 4];
186 barrier(CLK_LOCAL_MEM_FENCE);
190 smem1[tid] += smem1[tid + 2];
192 barrier(CLK_LOCAL_MEM_FENCE);
196 smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
198 barrier(CLK_LOCAL_MEM_FENCE);
201 void reduce3(float val1, float val2, float val3,
202 __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
207 barrier(CLK_LOCAL_MEM_FENCE);
211 smem1[tid] += smem1[tid + 32];
212 smem2[tid] += smem2[tid + 32];
213 smem3[tid] += smem3[tid + 32];
215 } barrier(CLK_LOCAL_MEM_FENCE);
218 smem1[tid] += smem1[tid + 16];
219 smem2[tid] += smem2[tid + 16];
220 smem3[tid] += smem3[tid + 16];
222 } barrier(CLK_LOCAL_MEM_FENCE);
225 smem1[tid] += smem1[tid + 8];
226 smem2[tid] += smem2[tid + 8];
227 smem3[tid] += smem3[tid + 8];
229 smem1[tid] += smem1[tid + 4];
230 smem2[tid] += smem2[tid + 4];
231 smem3[tid] += smem3[tid + 4];
233 smem1[tid] += smem1[tid + 2];
234 smem2[tid] += smem2[tid + 2];
235 smem3[tid] += smem3[tid + 2];
237 smem1[tid] += smem1[tid + 1];
238 smem2[tid] += smem2[tid + 1];
239 smem3[tid] += smem3[tid + 1];
243 void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
247 barrier(CLK_LOCAL_MEM_FENCE);
251 smem1[tid] += smem1[tid + 32];
252 smem2[tid] += smem2[tid + 32];
254 } barrier(CLK_LOCAL_MEM_FENCE);
257 smem1[tid] += smem1[tid + 16];
258 smem2[tid] += smem2[tid + 16];
260 } barrier(CLK_LOCAL_MEM_FENCE);
263 smem1[tid] += smem1[tid + 8];
264 smem2[tid] += smem2[tid + 8];
266 smem1[tid] += smem1[tid + 4];
267 smem2[tid] += smem2[tid + 4];
269 smem1[tid] += smem1[tid + 2];
270 smem2[tid] += smem2[tid + 2];
272 smem1[tid] += smem1[tid + 1];
273 smem2[tid] += smem2[tid + 1];
277 void reduce1(float val1, __local volatile float* smem1, int tid)
280 barrier(CLK_LOCAL_MEM_FENCE);
284 smem1[tid] += smem1[tid + 32];
286 } barrier(CLK_LOCAL_MEM_FENCE);
289 smem1[tid] += smem1[tid + 16];
291 } barrier(CLK_LOCAL_MEM_FENCE);
294 smem1[tid] += smem1[tid + 8];
295 smem1[tid] += smem1[tid + 4];
296 smem1[tid] += smem1[tid + 2];
297 smem1[tid] += smem1[tid + 1];
302 #define SCALE (1.0f / (1 << 20))
303 #define THRESHOLD 0.01f
306 __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
308 void SetPatch(image2d_t I, float x, float y,
309 float* Pch, float* Dx, float* Dy,
310 float* A11, float* A12, float* A22)
312 *Pch = read_imagef(I, sampler, (float2)(x, y)).x;
314 float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
315 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
317 float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
318 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
329 void GetPatch(image2d_t J, float x, float y,
330 float* Pch, float* Dx, float* Dy,
331 float* b1, float* b2)
333 float J_val = read_imagef(J, sampler, (float2)(x, y)).x;
334 float diff = (J_val - *Pch) * 32.0f;
339 void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)
341 float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;
342 *errval += fabs(diff);
345 void SetPatch4(image2d_t I, const float x, const float y,
346 float4* Pch, float4* Dx, float4* Dy,
347 float* A11, float* A12, float* A22)
349 *Pch = read_imagef(I, sampler, (float2)(x, y));
351 float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
352 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)));
354 float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
355 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)));
360 float4 sqIdx = dIdx * dIdx;
361 *A11 += sqIdx.x + sqIdx.y + sqIdx.z;
363 *A12 += sqIdx.x + sqIdx.y + sqIdx.z;
365 *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
368 void GetPatch4(image2d_t J, const float x, const float y,
369 const float4* Pch, const float4* Dx, const float4* Dy,
370 float* b1, float* b2)
372 float4 J_val = read_imagef(J, sampler, (float2)(x, y));
373 float4 diff = (J_val - *Pch) * 32.0f;
374 float4 xdiff = diff* *Dx;
375 *b1 += xdiff.x + xdiff.y + xdiff.z;
377 *b2 += xdiff.x + xdiff.y + xdiff.z;
380 void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
382 float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
383 *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
387 __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
388 __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
389 const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
392 __local float smem1[BUFFER+1];
393 __local float smem2[BUFFER+1];
394 __local float smem3[BUFFER+1];
396 __local float smem1[BUFFER];
397 __local float smem2[BUFFER];
398 __local float smem3[BUFFER];
401 unsigned int xid=get_local_id(0);
402 unsigned int yid=get_local_id(1);
403 unsigned int gid=get_group_id(0);
404 unsigned int xsize=get_local_size(0);
405 unsigned int ysize=get_local_size(1);
406 int xBase, yBase, i, j, k;
408 float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
410 const int tid = mad24(yid, xsize, xid);
412 float2 prevPt = prevPts[gid] / (float2)(1 << level);
414 if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
416 if (tid == 0 && level == 0)
425 // extract the patch from the first image, compute covariation matrix of derivatives
431 float I_patch[GRIDSIZE][GRIDSIZE];
432 float dIdx_patch[GRIDSIZE][GRIDSIZE];
433 float dIdy_patch[GRIDSIZE][GRIDSIZE];
438 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
439 &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
444 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
445 &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
449 if(xBase<c_winSize_x)
450 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
451 &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
457 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
458 &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
463 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
464 &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
468 if(xBase<c_winSize_x)
469 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
470 &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
474 if(yBase<c_winSize_y)
477 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
478 &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
483 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
484 &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
488 if(xBase<c_winSize_x)
489 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
490 &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
494 reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
495 barrier(CLK_LOCAL_MEM_FENCE);
507 float D = A11 * A22 - A12 * A12;
509 if (D < 1.192092896e-07f)
511 if (tid == 0 && level == 0)
521 prevPt = nextPts[gid] * 2.0f - c_halfWin;
523 for (k = 0; k < c_iters; ++k)
525 if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)
527 if (tid == 0 && level == 0)
538 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
539 &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
544 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
545 &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
549 if(xBase<c_winSize_x)
550 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
551 &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
557 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
558 &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
563 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
564 &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
568 if(xBase<c_winSize_x)
569 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
570 &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
574 if(yBase<c_winSize_y)
577 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
578 &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
583 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
584 &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
588 if(xBase<c_winSize_x)
589 GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
590 &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
594 reduce2(b1, b2, smem1, smem2, tid);
595 barrier(CLK_LOCAL_MEM_FENCE);
606 delta.x = A12 * b2 - A22 * b1;
607 delta.y = A12 * b1 - A11 * b2;
611 if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
621 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
626 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
630 if(xBase<c_winSize_x)
631 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
637 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
642 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
646 if(xBase<c_winSize_x)
647 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
651 if(yBase<c_winSize_y)
654 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
659 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
663 if(xBase<c_winSize_x)
664 GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
668 reduce1(D, smem1, tid);
675 nextPts[gid] = prevPt;
679 err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y);
681 err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
687 __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
688 __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
689 const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
692 __local float smem1[BUFFER+1];
693 __local float smem2[BUFFER+1];
694 __local float smem3[BUFFER+1];
696 __local float smem1[BUFFER];
697 __local float smem2[BUFFER];
698 __local float smem3[BUFFER];
701 unsigned int xid=get_local_id(0);
702 unsigned int yid=get_local_id(1);
703 unsigned int gid=get_group_id(0);
704 unsigned int xsize=get_local_size(0);
705 unsigned int ysize=get_local_size(1);
706 int xBase, yBase, i, j, k;
708 float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
710 const int tid = mad24(yid, xsize, xid);
712 float2 nextPt = prevPts[gid]/(float2)(1<<level);
714 if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
716 if (tid == 0 && level == 0)
726 // extract the patch from the first image, compute covariation matrix of derivatives
733 float4 dIdx_patch[8];
734 float4 dIdy_patch[8];
735 float4 I_add,Dx_add,Dy_add;
740 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
741 &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
746 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
747 &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
751 if(xBase<c_winSize_x)
752 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
753 &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
760 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
761 &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
766 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
767 &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
771 if(xBase<c_winSize_x)
772 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
773 &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
777 if(yBase<c_winSize_y)
780 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
781 &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
786 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
787 &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
791 if(xBase<c_winSize_x)
792 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
793 &I_add, &Dx_add, &Dy_add,
797 reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
798 barrier(CLK_LOCAL_MEM_FENCE);
810 float D = A11 * A22 - A12 * A12;
812 if (D < 1.192092896e-07f)
814 if (tid == 0 && level == 0)
824 nextPt = nextPts[gid] * 2.0f - c_halfWin;
826 for (k = 0; k < c_iters; ++k)
828 if (nextPt.x < -c_halfWin.x || nextPt.x >= cols || nextPt.y < -c_halfWin.y || nextPt.y >= rows)
830 if (tid == 0 && level == 0)
841 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
842 &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
847 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
848 &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
852 if(xBase<c_winSize_x)
853 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
854 &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
860 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
861 &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
866 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
867 &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
871 if(xBase<c_winSize_x)
872 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
873 &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
877 if(yBase<c_winSize_y)
880 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
881 &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
886 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
887 &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
891 if(xBase<c_winSize_x)
892 GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
893 &I_add, &Dx_add, &Dy_add,
897 reduce2(b1, b2, smem1, smem2, tid);
898 barrier(CLK_LOCAL_MEM_FENCE);
909 delta.x = A12 * b2 - A22 * b1;
910 delta.y = A12 * b1 - A11 * b2;
914 if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
924 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
929 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
933 if(xBase<c_winSize_x)
934 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
940 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
945 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
949 if(xBase<c_winSize_x)
950 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
954 if(yBase<c_winSize_y)
957 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
962 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
966 if(xBase<c_winSize_x)
967 GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
971 reduce1(D, smem1, tid);
977 nextPts[gid] = nextPt;
981 err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y);
983 err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y);
988 __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
989 const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
991 int c_halfWin_x = (c_winSize_x - 1) / 2;
992 int c_halfWin_y = (c_winSize_y - 1) / 2;
994 const int patchWidth = get_local_size(0) + 2 * c_halfWin_x;
995 const int patchHeight = get_local_size(1) + 2 * c_halfWin_y;
997 __local int smem[8192];
999 __local int* I_patch = smem;
1000 __local int* dIdx_patch = I_patch + patchWidth * patchHeight;
1001 __local int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
1003 const int xBase = get_group_id(0) * get_local_size(0);
1004 const int yBase = get_group_id(1) * get_local_size(1);
1006 sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
1008 for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1))
1010 for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0))
1012 float x = xBase - c_halfWin_x + j + 0.5f;
1013 float y = yBase - c_halfWin_y + i + 0.5f;
1015 I_patch[i * patchWidth + j] = read_imagei(I, sampleri, (float2)(x, y)).x;
1019 dIdx_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x+1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
1020 (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x-1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x);
1022 dIdy_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y+1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
1023 (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y-1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x);
1026 barrier(CLK_LOCAL_MEM_FENCE);
1028 // extract the patch from the first image, compute covariation matrix of derivatives
1030 const int x = get_global_id(0);
1031 const int y = get_global_id(1);
1033 if (x >= cols || y >= rows)
1040 for (int i = 0; i < c_winSize_y; ++i)
1042 for (int j = 0; j < c_winSize_x; ++j)
1044 int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
1045 int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
1047 A11i += dIdx * dIdx;
1048 A12i += dIdx * dIdy;
1049 A22i += dIdy * dIdy;
1057 float D = A11 * A22 - A12 * A12;
1059 //if (calcErr && GET_MIN_EIGENVALS)
1060 // (err + y * errStep)[x] = minEig;
1062 if (D < 1.192092896e-07f)
1065 // err(y, x) = 3.402823466e+38f;
1077 nextPt.x = x + prevU[y/2 * prevUStep / 4 + x/2] * 2.0f;
1078 nextPt.y = y + prevV[y/2 * prevVStep / 4 + x/2] * 2.0f;
1080 for (int k = 0; k < c_iters; ++k)
1082 if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
1085 // err(y, x) = 3.402823466e+38f;
1093 for (int i = 0; i < c_winSize_y; ++i)
1095 for (int j = 0; j < c_winSize_x; ++j)
1097 int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j];
1098 int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x;
1100 int diff = (iJ - iI) * 32;
1102 int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
1103 int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
1111 delta.x = A12 * b2 - A22 * b1;
1112 delta.y = A12 * b1 - A11 * b2;
1114 nextPt.x += delta.x;
1115 nextPt.y += delta.y;
1117 if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f)
1121 u[y * uStep / 4 + x] = nextPt.x - x;
1122 v[y * vStep / 4 + x] = nextPt.y - y;
1128 for (int i = 0; i < c_winSize_y; ++i)
1130 for (int j = 0; j < c_winSize_x; ++j)
1132 int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j];
1133 int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x;
1135 errval += abs(iJ - iI);
1139 //err[y * errStep / 4 + x] = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);