CLAHE Python bindings
[profile/ivi/opencv.git] / modules / ocl / src / opencl / pyrlk.cl
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // @Authors
18 //    Dachuan Zhao, dachuan@multicorewareinc.com
19 //    Yao Wang, bitwangyaoyao@gmail.com
20 //
21 // Redistribution and use in source and binary forms, with or without modification,
22 // are permitted provided that the following conditions are met:
23 //
24 //   * Redistribution's of source code must retain the above copyright notice,
25 //     this list of conditions and the following disclaimer.
26 //
27 //   * Redistribution's in binary form must reproduce the above copyright notice,
28 //     this list of conditions and the following disclaimer in the documentation
29 //     and/or other oclMaterials provided with the distribution.
30 //
31 //   * The name of the copyright holders may not be used to endorse or promote products
32 //     derived from this software without specific prior written permission.
33 //
34 // This software is provided by the copyright holders and contributors as is and
35 // any express or implied warranties, including, but not limited to, the implied
36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
37 // In no event shall the Intel Corporation or contributors be liable for any direct,
38 // indirect, incidental, special, exemplary, or consequential damages
39 // (including, but not limited to, procurement of substitute goods or services;
40 // loss of use, data, or profits; or business interruption) however caused
41 // and on any theory of liability, whether in contract, strict liability,
42 // or tort (including negligence or otherwise) arising in any way out of
43 // the use of this software, even if advised of the possibility of such damage.
44 //
45 //M*/
46
47 //#pragma OPENCL EXTENSION cl_amd_printf : enable
48
49 #define BUFFER  64
50 #ifndef WAVE_SIZE
51 #define WAVE_SIZE 1
52 #endif
53 #ifdef CPU
54 void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
55 {
56     smem1[tid] = val1;
57     smem2[tid] = val2;
58     smem3[tid] = val3;
59     barrier(CLK_LOCAL_MEM_FENCE);
60
61     if (tid < 32)
62     {
63         smem1[tid] += smem1[tid + 32];
64         smem2[tid] += smem2[tid + 32];
65         smem3[tid] += smem3[tid + 32];
66     }
67     barrier(CLK_LOCAL_MEM_FENCE);
68
69     if (tid < 16)
70     {
71         smem1[tid] += smem1[tid + 16];
72         smem2[tid] += smem2[tid + 16];
73         smem3[tid] += smem3[tid + 16];
74     }
75     barrier(CLK_LOCAL_MEM_FENCE);
76
77     if (tid < 8)
78     {
79         smem1[tid] += smem1[tid + 8];
80         smem2[tid] += smem2[tid + 8];
81         smem3[tid] += smem3[tid + 8];
82     }
83     barrier(CLK_LOCAL_MEM_FENCE);
84
85     if (tid < 4)
86     {
87         smem1[tid] += smem1[tid + 4];
88         smem2[tid] += smem2[tid + 4];
89         smem3[tid] += smem3[tid + 4];
90     }
91     barrier(CLK_LOCAL_MEM_FENCE);
92
93     if (tid < 2)
94     {
95         smem1[tid] += smem1[tid + 2];
96         smem2[tid] += smem2[tid + 2];
97         smem3[tid] += smem3[tid + 2];
98     }
99     barrier(CLK_LOCAL_MEM_FENCE);
100
101     if (tid < 1)
102     {
103         smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
104         smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
105         smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
106     }
107     barrier(CLK_LOCAL_MEM_FENCE);
108 }
109
110 void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
111 {
112     smem1[tid] = val1;
113     smem2[tid] = val2;
114     barrier(CLK_LOCAL_MEM_FENCE);
115
116     if (tid < 32)
117     {
118         smem1[tid] += smem1[tid + 32];
119         smem2[tid] += smem2[tid + 32];
120     }
121     barrier(CLK_LOCAL_MEM_FENCE);
122
123     if (tid < 16)
124     {
125         smem1[tid] += smem1[tid + 16];
126         smem2[tid] += smem2[tid + 16];
127     }
128     barrier(CLK_LOCAL_MEM_FENCE);
129
130     if (tid < 8)
131     {
132         smem1[tid] += smem1[tid + 8];
133         smem2[tid] += smem2[tid + 8];
134     }
135     barrier(CLK_LOCAL_MEM_FENCE);
136
137     if (tid < 4)
138     {
139         smem1[tid] += smem1[tid + 4];
140         smem2[tid] += smem2[tid + 4];
141     }
142     barrier(CLK_LOCAL_MEM_FENCE);
143
144     if (tid < 2)
145     {
146         smem1[tid] += smem1[tid + 2];
147         smem2[tid] += smem2[tid + 2];
148     }
149     barrier(CLK_LOCAL_MEM_FENCE);
150
151     if (tid < 1)
152     {
153         smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
154         smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
155     }
156     barrier(CLK_LOCAL_MEM_FENCE);
157 }
158
159 void reduce1(float val1, volatile __local float* smem1, int tid)
160 {
161     smem1[tid] = val1;
162     barrier(CLK_LOCAL_MEM_FENCE);
163
164     if (tid < 32)
165     {
166         smem1[tid] += smem1[tid + 32];
167     }
168     barrier(CLK_LOCAL_MEM_FENCE);
169
170     if (tid < 16)
171     {
172         smem1[tid] += smem1[tid + 16];
173     }
174     barrier(CLK_LOCAL_MEM_FENCE);
175
176     if (tid < 8)
177     {
178         smem1[tid] += smem1[tid + 8];
179     }
180     barrier(CLK_LOCAL_MEM_FENCE);
181
182     if (tid < 4)
183     {
184         smem1[tid] += smem1[tid + 4];
185     }
186     barrier(CLK_LOCAL_MEM_FENCE);
187
188     if (tid < 2)
189     {
190         smem1[tid] += smem1[tid + 2];
191     }
192     barrier(CLK_LOCAL_MEM_FENCE);
193
194     if (tid < 1)
195     {
196         smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
197     }
198     barrier(CLK_LOCAL_MEM_FENCE);
199 }
200 #else
201 void reduce3(float val1, float val2, float val3, 
202 __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
203 {
204     smem1[tid] = val1;
205     smem2[tid] = val2;
206     smem3[tid] = val3;
207     barrier(CLK_LOCAL_MEM_FENCE);
208
209     if (tid < 32)
210     {
211         smem1[tid] += smem1[tid + 32];
212         smem2[tid] += smem2[tid + 32];
213         smem3[tid] += smem3[tid + 32];
214 #if WAVE_SIZE < 32
215         } barrier(CLK_LOCAL_MEM_FENCE);
216         if (tid < 16) {
217 #endif
218         smem1[tid] += smem1[tid + 16];
219         smem2[tid] += smem2[tid + 16];
220         smem3[tid] += smem3[tid + 16];
221 #if WAVE_SIZE <16
222         } barrier(CLK_LOCAL_MEM_FENCE);
223         if (tid < 8) {
224 #endif
225         smem1[tid] += smem1[tid + 8];
226         smem2[tid] += smem2[tid + 8];
227         smem3[tid] += smem3[tid + 8];
228
229         smem1[tid] += smem1[tid + 4];
230         smem2[tid] += smem2[tid + 4];
231         smem3[tid] += smem3[tid + 4];
232
233         smem1[tid] += smem1[tid + 2];
234         smem2[tid] += smem2[tid + 2];
235         smem3[tid] += smem3[tid + 2];
236
237         smem1[tid] += smem1[tid + 1];
238         smem2[tid] += smem2[tid + 1];
239         smem3[tid] += smem3[tid + 1];
240     }
241 }
242
243 void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
244 {
245     smem1[tid] = val1;
246     smem2[tid] = val2;
247     barrier(CLK_LOCAL_MEM_FENCE);
248
249     if (tid < 32)
250     {
251         smem1[tid] += smem1[tid + 32];
252         smem2[tid] += smem2[tid + 32];
253 #if WAVE_SIZE < 32
254         } barrier(CLK_LOCAL_MEM_FENCE);
255         if (tid < 16) {
256 #endif
257         smem1[tid] += smem1[tid + 16];
258         smem2[tid] += smem2[tid + 16];
259 #if WAVE_SIZE <16
260         } barrier(CLK_LOCAL_MEM_FENCE);
261         if (tid < 8) {
262 #endif
263         smem1[tid] += smem1[tid + 8];
264         smem2[tid] += smem2[tid + 8];
265
266         smem1[tid] += smem1[tid + 4];
267         smem2[tid] += smem2[tid + 4];
268
269         smem1[tid] += smem1[tid + 2];
270         smem2[tid] += smem2[tid + 2];
271
272         smem1[tid] += smem1[tid + 1];
273         smem2[tid] += smem2[tid + 1];
274     }
275 }
276
277 void reduce1(float val1, __local volatile float* smem1, int tid)
278 {
279     smem1[tid] = val1;
280     barrier(CLK_LOCAL_MEM_FENCE);
281
282     if (tid < 32)
283     {
284         smem1[tid] += smem1[tid + 32];
285 #if WAVE_SIZE < 32
286         } barrier(CLK_LOCAL_MEM_FENCE);
287         if (tid < 16) {
288 #endif
289         smem1[tid] += smem1[tid + 16];
290 #if WAVE_SIZE <16
291         } barrier(CLK_LOCAL_MEM_FENCE);
292         if (tid < 8) {
293 #endif
294         smem1[tid] += smem1[tid + 8];
295         smem1[tid] += smem1[tid + 4];
296         smem1[tid] += smem1[tid + 2];
297         smem1[tid] += smem1[tid + 1];
298     }
299 }
300 #endif
301
302 #define SCALE (1.0f / (1 << 20))
303 #define THRESHOLD       0.01f
304
305 // Image read mode
306 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
307
308 void SetPatch(image2d_t I, float x, float y,
309                                 float* Pch, float* Dx, float* Dy,
310                                 float* A11, float* A12, float* A22)
311 {
312             *Pch = read_imagef(I, sampler, (float2)(x, y)).x;
313
314             float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
315                              (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
316
317             float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
318                             (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
319
320
321             *Dx = dIdx;
322             *Dy = dIdy;
323
324             *A11 += dIdx * dIdx;
325             *A12 += dIdx * dIdy;
326             *A22 += dIdy * dIdy;
327 }
328
329 void GetPatch(image2d_t J, float x, float y,
330                                 float* Pch, float* Dx, float* Dy,
331                                 float* b1, float* b2)
332 {
333                 float J_val = read_imagef(J, sampler, (float2)(x, y)).x;
334                 float diff = (J_val - *Pch) * 32.0f;
335                 *b1 += diff**Dx;
336                 *b2 += diff**Dy;
337 }
338
339 void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)
340 {
341         float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;
342         *errval += fabs(diff);
343 }
344
345 void SetPatch4(image2d_t I, const float x, const float y,
346                                 float4* Pch, float4* Dx, float4* Dy,
347                                 float* A11, float* A12, float* A22)
348 {
349             *Pch = read_imagef(I, sampler, (float2)(x, y));
350
351             float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
352                              (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)));
353
354             float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
355                             (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)));
356
357
358             *Dx = dIdx;
359             *Dy = dIdy;
360                         float4 sqIdx = dIdx * dIdx;
361                         *A11 += sqIdx.x + sqIdx.y + sqIdx.z;
362                         sqIdx = dIdx * dIdy;
363                         *A12 += sqIdx.x + sqIdx.y + sqIdx.z;
364                         sqIdx = dIdy * dIdy;
365                         *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
366 }
367
368 void GetPatch4(image2d_t J, const float x, const float y,
369                                 const float4* Pch, const float4* Dx, const float4* Dy,
370                                 float* b1, float* b2)
371 {
372                 float4 J_val = read_imagef(J, sampler, (float2)(x, y));
373                 float4 diff = (J_val - *Pch) * 32.0f;
374                                 float4 xdiff = diff* *Dx;
375                                 *b1 += xdiff.x + xdiff.y + xdiff.z;
376                                 xdiff = diff* *Dy;
377                                 *b2 += xdiff.x + xdiff.y + xdiff.z;
378 }
379
380 void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
381 {
382         float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
383         *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
384 }
385
386 #define GRIDSIZE        3
387 __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
388     __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
389         const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
390 {
391 #ifdef CPU
392     __local float smem1[BUFFER+1];
393     __local float smem2[BUFFER+1];
394     __local float smem3[BUFFER+1];
395 #else
396     __local float smem1[BUFFER];
397     __local float smem2[BUFFER];
398     __local float smem3[BUFFER];
399 #endif
400
401         unsigned int xid=get_local_id(0);
402         unsigned int yid=get_local_id(1);
403         unsigned int gid=get_group_id(0);
404         unsigned int xsize=get_local_size(0);
405         unsigned int ysize=get_local_size(1);
406         int xBase, yBase, i, j, k;
407
408         float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
409
410     const int tid = mad24(yid, xsize, xid);
411
412     float2 prevPt = prevPts[gid] / (float2)(1 << level);
413
414     if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
415     {
416         if (tid == 0 && level == 0)
417         {
418             status[gid] = 0;
419         }
420
421         return;
422     }
423     prevPt -= c_halfWin;
424
425     // extract the patch from the first image, compute covariation matrix of derivatives
426
427     float A11 = 0;
428     float A12 = 0;
429     float A22 = 0;
430
431     float I_patch[GRIDSIZE][GRIDSIZE];
432     float dIdx_patch[GRIDSIZE][GRIDSIZE];
433     float dIdy_patch[GRIDSIZE][GRIDSIZE];
434
435         yBase=yid;
436         {
437                 xBase=xid;
438                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
439                                         &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
440                                         &A11, &A12, &A22);
441
442
443                 xBase+=xsize;
444                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
445                                         &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
446                                         &A11, &A12, &A22);
447
448                 xBase+=xsize;
449                 if(xBase<c_winSize_x)
450                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
451                                         &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
452                                         &A11, &A12, &A22);
453         }
454         yBase+=ysize;
455         {
456                 xBase=xid;
457                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
458                                         &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
459                                         &A11, &A12, &A22);
460
461
462                 xBase+=xsize;
463                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
464                                         &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
465                                         &A11, &A12, &A22);
466
467                 xBase+=xsize;
468                 if(xBase<c_winSize_x)
469                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
470                                         &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
471                                         &A11, &A12, &A22);
472         }
473         yBase+=ysize;
474         if(yBase<c_winSize_y)
475         {
476                 xBase=xid;
477                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
478                                         &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
479                                         &A11, &A12, &A22);
480
481
482                 xBase+=xsize;
483                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
484                                         &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
485                                         &A11, &A12, &A22);
486
487                 xBase+=xsize;
488                 if(xBase<c_winSize_x)
489                 SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
490                                         &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
491                                         &A11, &A12, &A22);
492         }
493
494     reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
495     barrier(CLK_LOCAL_MEM_FENCE);
496
497 #ifdef CPU
498     A11 = smem1[BUFFER];
499     A12 = smem2[BUFFER];
500     A22 = smem3[BUFFER];
501 #else
502     A11 = smem1[0];
503     A12 = smem2[0];
504     A22 = smem3[0];
505 #endif
506
507     float D = A11 * A22 - A12 * A12;
508
509     if (D < 1.192092896e-07f)
510     {
511         if (tid == 0 && level == 0)
512             status[gid] = 0;
513
514         return;
515     }
516
517     A11 /= D;
518     A12 /= D;
519     A22 /= D;
520
521     prevPt = nextPts[gid] * 2.0f - c_halfWin;
522
523     for (k = 0; k < c_iters; ++k)
524     {
525         if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)
526         {
527             if (tid == 0 && level == 0)
528                 status[gid] = 0;
529             return;
530         }
531
532         float b1 = 0;
533         float b2 = 0;
534
535                 yBase=yid;
536                 {
537                         xBase=xid;
538                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
539                                                 &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
540                                                 &b1, &b2);
541
542
543                         xBase+=xsize;
544                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
545                                                 &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
546                                                 &b1, &b2);
547
548                         xBase+=xsize;
549                         if(xBase<c_winSize_x)
550                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
551                                                 &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
552                                                 &b1, &b2);
553                 }
554                 yBase+=ysize;
555                 {
556                         xBase=xid;
557                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
558                                                 &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
559                                                 &b1, &b2);
560
561
562                         xBase+=xsize;
563                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
564                                                 &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
565                                                 &b1, &b2);
566
567                         xBase+=xsize;
568                         if(xBase<c_winSize_x)
569                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
570                                                 &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
571                                                 &b1, &b2);
572                 }
573                 yBase+=ysize;
574                 if(yBase<c_winSize_y)
575                 {
576                         xBase=xid;
577                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
578                                                 &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
579                                                 &b1, &b2);
580
581
582                         xBase+=xsize;
583                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
584                                                 &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
585                                                 &b1, &b2);
586
587                         xBase+=xsize;
588                         if(xBase<c_winSize_x)
589                         GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
590                                                 &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
591                                                 &b1, &b2);
592                 }
593
594         reduce2(b1, b2, smem1, smem2, tid);
595         barrier(CLK_LOCAL_MEM_FENCE);
596
597 #ifdef CPU
598         b1 = smem1[BUFFER];
599         b2 = smem2[BUFFER];
600 #else
601         b1 = smem1[0];
602         b2 = smem2[0];
603 #endif
604
605         float2 delta;
606         delta.x = A12 * b2 - A22 * b1;
607         delta.y = A12 * b1 - A11 * b2;
608
609                 prevPt += delta;
610
611         if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
612             break;
613     }
614
615     D = 0.0f;
616     if (calcErr)
617     {
618                 yBase=yid;
619                 {
620                         xBase=xid;
621                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
622                                                 &I_patch[0][0], &D);
623
624
625                         xBase+=xsize;
626                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
627                                                 &I_patch[0][1], &D);
628
629                         xBase+=xsize;
630                         if(xBase<c_winSize_x)
631                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
632                                                 &I_patch[0][2], &D);
633                 }
634                 yBase+=ysize;
635                 {
636                         xBase=xid;
637                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
638                                                 &I_patch[1][0], &D);
639
640
641                         xBase+=xsize;
642                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
643                                                 &I_patch[1][1], &D);
644
645                         xBase+=xsize;
646                         if(xBase<c_winSize_x)
647                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
648                                                 &I_patch[1][2], &D);
649                 }
650                 yBase+=ysize;
651                 if(yBase<c_winSize_y)
652                 {
653                         xBase=xid;
654                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
655                                                 &I_patch[2][0], &D);
656
657
658                         xBase+=xsize;
659                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
660                                                 &I_patch[2][1], &D);
661
662                         xBase+=xsize;
663                         if(xBase<c_winSize_x)
664                         GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
665                                                 &I_patch[2][2], &D);
666                 }
667
668         reduce1(D, smem1, tid);
669     }
670
671     if (tid == 0)
672     {
673                 prevPt += c_halfWin;
674
675         nextPts[gid] = prevPt;
676
677         if (calcErr)
678 #ifdef CPU
679             err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y);
680 #else
681             err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
682 #endif
683     }
684 }
685
686
687 __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
688     __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
689         const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
690 {
691 #ifdef CPU
692      __local float smem1[BUFFER+1];
693      __local float smem2[BUFFER+1];
694      __local float smem3[BUFFER+1];
695 #else
696      __local float smem1[BUFFER];
697      __local float smem2[BUFFER];
698      __local float smem3[BUFFER];
699 #endif
700
701         unsigned int xid=get_local_id(0);
702         unsigned int yid=get_local_id(1);
703         unsigned int gid=get_group_id(0);
704         unsigned int xsize=get_local_size(0);
705         unsigned int ysize=get_local_size(1);
706         int xBase, yBase, i, j, k;
707
708         float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
709
710     const int tid = mad24(yid, xsize, xid);
711
712     float2 nextPt = prevPts[gid]/(float2)(1<<level);
713
714     if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
715     {
716         if (tid == 0 && level == 0)
717         {
718             status[gid] = 0;
719         }
720
721         return;
722     }
723
724         nextPt -= c_halfWin;
725
726     // extract the patch from the first image, compute covariation matrix of derivatives
727
728     float A11 = 0.0f;
729     float A12 = 0.0f;
730     float A22 = 0.0f;
731
732     float4 I_patch[8];
733     float4 dIdx_patch[8];
734     float4 dIdy_patch[8];
735         float4 I_add,Dx_add,Dy_add;
736
737         yBase=yid;
738         {
739                 xBase=xid;
740                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
741                                         &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
742                                         &A11, &A12, &A22);
743
744
745                 xBase+=xsize;
746                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
747                                         &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
748                                         &A11, &A12, &A22);
749
750                 xBase+=xsize;
751                 if(xBase<c_winSize_x)
752                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
753                                         &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
754                                         &A11, &A12, &A22);
755
756         }
757         yBase+=ysize;
758         {
759                 xBase=xid;
760                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
761                                         &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
762                                         &A11, &A12, &A22);
763
764
765                 xBase+=xsize;
766                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
767                                         &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
768                                         &A11, &A12, &A22);
769
770                 xBase+=xsize;
771                 if(xBase<c_winSize_x)
772                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
773                                         &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
774                                         &A11, &A12, &A22);
775         }
776         yBase+=ysize;
777         if(yBase<c_winSize_y)
778         {
779                 xBase=xid;
780                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
781                                         &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
782                                         &A11, &A12, &A22);
783
784
785                 xBase+=xsize;
786                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
787                                         &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
788                                         &A11, &A12, &A22);
789
790                 xBase+=xsize;
791                 if(xBase<c_winSize_x)
792                 SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
793                                         &I_add, &Dx_add, &Dy_add,
794                                         &A11, &A12, &A22);
795         }
796
797     reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
798     barrier(CLK_LOCAL_MEM_FENCE);
799
800 #ifdef CPU
801     A11 = smem1[BUFFER];
802     A12 = smem2[BUFFER];
803     A22 = smem3[BUFFER];
804 #else
805     A11 = smem1[0];
806     A12 = smem2[0];
807     A22 = smem3[0];
808 #endif
809
810     float D = A11 * A22 - A12 * A12;
811
812     if (D < 1.192092896e-07f)
813     {
814         if (tid == 0 && level == 0)
815             status[gid] = 0;
816
817         return;
818     }
819
820     A11 /= D;
821     A12 /= D;
822     A22 /= D;
823
824         nextPt = nextPts[gid] * 2.0f - c_halfWin;
825
826     for (k = 0; k < c_iters; ++k)
827     {
828         if (nextPt.x < -c_halfWin.x || nextPt.x >= cols || nextPt.y < -c_halfWin.y || nextPt.y >= rows)
829         {
830             if (tid == 0 && level == 0)
831                 status[gid] = 0;
832             return;
833         }
834
835         float b1 = 0;
836         float b2 = 0;
837
838                 yBase=yid;
839                 {
840                         xBase=xid;
841                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
842                                                 &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
843                                                 &b1, &b2);
844
845
846                         xBase+=xsize;
847                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
848                                                 &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
849                                                 &b1, &b2);
850
851                         xBase+=xsize;
852                         if(xBase<c_winSize_x)
853                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
854                                                 &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
855                                                 &b1, &b2);
856                 }
857                 yBase+=ysize;
858                 {
859                         xBase=xid;
860                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
861                                                 &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
862                                                 &b1, &b2);
863
864
865                         xBase+=xsize;
866                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
867                                                 &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
868                                                 &b1, &b2);
869
870                         xBase+=xsize;
871                         if(xBase<c_winSize_x)
872                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
873                                                 &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
874                                                 &b1, &b2);
875                 }
876                 yBase+=ysize;
877                 if(yBase<c_winSize_y)
878                 {
879                         xBase=xid;
880                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
881                                                 &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
882                                                 &b1, &b2);
883
884
885                         xBase+=xsize;
886                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
887                                                 &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
888                                                 &b1, &b2);
889
890                         xBase+=xsize;
891                         if(xBase<c_winSize_x)
892                         GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
893                                                 &I_add, &Dx_add, &Dy_add,
894                                                 &b1, &b2);
895                 }
896
897         reduce2(b1, b2, smem1, smem2, tid);
898         barrier(CLK_LOCAL_MEM_FENCE);
899
900 #ifdef CPU
901         b1 = smem1[BUFFER];
902         b2 = smem2[BUFFER];
903 #else
904         b1 = smem1[0];
905         b2 = smem2[0];
906 #endif
907
908         float2 delta;
909         delta.x = A12 * b2 - A22 * b1;
910         delta.y = A12 * b1 - A11 * b2;
911
912                 nextPt +=delta;
913
914         if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
915             break;
916     }
917
918     D = 0.0f;
919     if (calcErr)
920     {
921                 yBase=yid;
922                 {
923                         xBase=xid;
924                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
925                                                 &I_patch[0], &D);
926
927
928                         xBase+=xsize;
929                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
930                                                 &I_patch[1], &D);
931
932                         xBase+=xsize;
933                         if(xBase<c_winSize_x)
934                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
935                                                 &I_patch[2], &D);
936                 }
937                 yBase+=ysize;
938                 {
939                         xBase=xid;
940                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
941                                                 &I_patch[3], &D);
942
943
944                         xBase+=xsize;
945                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
946                                                 &I_patch[4], &D);
947
948                         xBase+=xsize;
949                         if(xBase<c_winSize_x)
950                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
951                                                 &I_patch[5], &D);
952                 }
953                 yBase+=ysize;
954                 if(yBase<c_winSize_y)
955                 {
956                         xBase=xid;
957                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
958                                                 &I_patch[6], &D);
959
960
961                         xBase+=xsize;
962                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
963                                                 &I_patch[7], &D);
964
965                         xBase+=xsize;
966                         if(xBase<c_winSize_x)
967                         GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
968                                                 &I_add, &D);
969                 }
970
971         reduce1(D, smem1, tid);
972     }
973
974     if (tid == 0)
975     {
976                 nextPt += c_halfWin;
977         nextPts[gid] = nextPt;
978
979         if (calcErr)
980 #ifdef CPU
981             err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y);
982 #else
983             err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y);
984 #endif
985     }
986 }
987
988 __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
989     const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
990 {
991         int c_halfWin_x = (c_winSize_x - 1) / 2;
992         int c_halfWin_y = (c_winSize_y - 1) / 2;
993
994     const int patchWidth  = get_local_size(0) + 2 * c_halfWin_x;
995     const int patchHeight = get_local_size(1) + 2 * c_halfWin_y;
996
997     __local int smem[8192];
998
999     __local int* I_patch = smem;
1000     __local int* dIdx_patch = I_patch + patchWidth * patchHeight;
1001     __local int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
1002
1003     const int xBase = get_group_id(0) * get_local_size(0);
1004     const int yBase = get_group_id(1) * get_local_size(1);
1005
1006         sampler_t sampleri    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
1007
1008     for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1))
1009     {
1010         for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0))
1011         {
1012             float x = xBase - c_halfWin_x + j + 0.5f;
1013             float y = yBase - c_halfWin_y + i + 0.5f;
1014
1015             I_patch[i * patchWidth + j] = read_imagei(I, sampleri, (float2)(x, y)).x;
1016
1017             // Sharr Deriv
1018
1019             dIdx_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x+1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
1020                                             (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x-1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x);
1021
1022             dIdy_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y+1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
1023                                             (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y-1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x);
1024         }
1025     }
1026     barrier(CLK_LOCAL_MEM_FENCE);
1027
1028     // extract the patch from the first image, compute covariation matrix of derivatives
1029
1030     const int x = get_global_id(0);
1031     const int y = get_global_id(1);
1032
1033     if (x >= cols || y >= rows)
1034         return;
1035
1036     int A11i = 0;
1037     int A12i = 0;
1038     int A22i = 0;
1039
1040     for (int i = 0; i < c_winSize_y; ++i)
1041     {
1042         for (int j = 0; j < c_winSize_x; ++j)
1043         {
1044             int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
1045             int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
1046
1047             A11i += dIdx * dIdx;
1048             A12i += dIdx * dIdy;
1049             A22i += dIdy * dIdy;
1050         }
1051     }
1052
1053     float A11 = A11i;
1054     float A12 = A12i;
1055     float A22 = A22i;
1056
1057     float D = A11 * A22 - A12 * A12;
1058
1059     //if (calcErr && GET_MIN_EIGENVALS)
1060     //    (err + y * errStep)[x] = minEig;
1061
1062     if (D < 1.192092896e-07f)
1063     {
1064         //if (calcErr)
1065         //    err(y, x) = 3.402823466e+38f;
1066
1067         return;
1068     }
1069
1070     D = 1.f / D;
1071
1072     A11 *= D;
1073     A12 *= D;
1074     A22 *= D;
1075
1076     float2 nextPt;
1077     nextPt.x = x + prevU[y/2 * prevUStep / 4 + x/2] * 2.0f;
1078     nextPt.y = y + prevV[y/2 * prevVStep / 4 + x/2] * 2.0f;
1079
1080     for (int k = 0; k < c_iters; ++k)
1081     {
1082         if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
1083         {
1084             //if (calcErr)
1085             //    err(y, x) = 3.402823466e+38f;
1086
1087             return;
1088         }
1089
1090         int b1 = 0;
1091         int b2 = 0;
1092
1093         for (int i = 0; i < c_winSize_y; ++i)
1094         {
1095             for (int j = 0; j < c_winSize_x; ++j)
1096             {
1097                 int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j];
1098                 int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x;
1099
1100                 int diff = (iJ - iI) * 32;
1101
1102                 int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
1103                 int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
1104
1105                 b1 += diff * dIdx;
1106                 b2 += diff * dIdy;
1107             }
1108         }
1109
1110         float2 delta;
1111         delta.x = A12 * b2 - A22 * b1;
1112         delta.y = A12 * b1 - A11 * b2;
1113
1114         nextPt.x += delta.x;
1115         nextPt.y += delta.y;
1116
1117         if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f)
1118             break;
1119     }
1120
1121     u[y * uStep / 4 + x] = nextPt.x - x;
1122     v[y * vStep / 4 + x] = nextPt.y - y;
1123
1124     if (calcErr)
1125     {
1126         int errval = 0;
1127
1128         for (int i = 0; i < c_winSize_y; ++i)
1129         {
1130             for (int j = 0; j < c_winSize_x; ++j)
1131             {
1132                 int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j];
1133                 int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x;
1134
1135                 errval += abs(iJ - iI);
1136             }
1137         }
1138
1139         //err[y * errStep / 4 + x] = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
1140     }
1141 }