added dual tvl1 optical flow gpu implementation
[profile/ivi/opencv.git] / modules / gpu / src / cuda / ccomponetns.cu
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //               For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //  * Redistributions of source code must retain the above copyright notice,
21 //    this list of conditions and the following disclaimer.
22 //
23 //  * Redistributions in binary form must reproduce the above copyright notice,
24 //    this list of conditions and the following disclaimer in the documentation
25 //    and/or other materials provided with the distribution.
26 //
27 //  * The name of the copyright holders may not be used to endorse or promote products
28 //    derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //M*/
41
42 #if !defined CUDA_DISABLER
43
44 #include <opencv2/gpu/device/common.hpp>
45 #include <opencv2/gpu/device/vec_traits.hpp>
46 #include <opencv2/gpu/device/vec_math.hpp>
47 #include <opencv2/gpu/device/emulation.hpp>
48
49 #include <iostream>
50 #include <stdio.h>
51
52 namespace cv { namespace gpu { namespace device
53 {
54     namespace ccl
55     {
56         enum
57         {
58             WARP_SIZE  = 32,
59             WARP_LOG   = 5,
60
61             CTA_SIZE_X = 32,
62             CTA_SIZE_Y = 8,
63
64             STA_SIZE_MERGE_Y = 4,
65             STA_SIZE_MERGE_X = 32,
66
67             TPB_X = 1,
68             TPB_Y = 4,
69
70             TILE_COLS = CTA_SIZE_X * TPB_X,
71             TILE_ROWS = CTA_SIZE_Y * TPB_Y
72         };
73
74         template<typename T> struct IntervalsTraits
75         {
76             typedef T elem_type;
77         };
78
79         template<> struct IntervalsTraits<unsigned char>
80         {
81             typedef int dist_type;
82             enum {ch = 1};
83         };
84
85         template<> struct IntervalsTraits<uchar3>
86         {
87             typedef int3 dist_type;
88             enum {ch = 3};
89         };
90
91         template<> struct IntervalsTraits<uchar4>
92         {
93             typedef int4 dist_type;
94             enum {ch = 4};
95         };
96
97         template<> struct IntervalsTraits<unsigned short>
98         {
99             typedef int dist_type;
100             enum {ch = 1};
101         };
102
103         template<> struct IntervalsTraits<ushort3>
104         {
105             typedef int3 dist_type;
106             enum {ch = 3};
107         };
108
109         template<> struct IntervalsTraits<ushort4>
110         {
111             typedef int4 dist_type;
112             enum {ch = 4};
113         };
114
115         template<> struct IntervalsTraits<float>
116         {
117             typedef float dist_type;
118             enum {ch = 1};
119         };
120
121         template<> struct IntervalsTraits<int>
122         {
123             typedef int dist_type;
124             enum {ch = 1};
125         };
126
127         typedef unsigned char component;
128         enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
129
130         template<typename T, int CH> struct InInterval {};
131
132         template<typename T> struct InInterval<T, 1>
133         {
134             typedef typename VecTraits<T>::elem_type E;
135             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {};
136             T lo, hi;
137
138             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
139             {
140                 I d = a - b;
141                 return lo <= d && d <= hi;
142             }
143         };
144
145
146         template<typename T> struct InInterval<T, 3>
147         {
148             typedef typename VecTraits<T>::elem_type E;
149             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
150             : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)){};
151             T lo, hi;
152
153             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
154             {
155                 I d = a - b;
156                 return lo.x <= d.x && d.x <= hi.x &&
157                        lo.y <= d.y && d.y <= hi.y &&
158                        lo.z <= d.z && d.z <= hi.z;
159             }
160         };
161
162         template<typename T> struct InInterval<T, 4>
163         {
164             typedef typename VecTraits<T>::elem_type E;
165             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
166             : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)){};
167             T lo, hi;
168
169             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
170             {
171                 I d = a - b;
172                 return lo.x <= d.x && d.x <= hi.x &&
173                        lo.y <= d.y && d.y <= hi.y &&
174                        lo.z <= d.z && d.z <= hi.z &&
175                        lo.w <= d.w && d.w <= hi.w;
176             }
177         };
178
179
180         template<typename T, typename F>
181         __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
182         {
183             int x = threadIdx.x + blockIdx.x * blockDim.x;
184             int y = threadIdx.y + blockIdx.y * blockDim.y;
185
186             if (x >= image.cols || y >= image.rows) return;
187
188             T intensity = image(y, x);
189             component c = 0;
190
191             if ( x > 0 && connected(intensity, image(y, x - 1)))
192                 c |= LEFT;
193
194             if ( y > 0 && connected(intensity, image(y - 1, x)))
195                 c |= UP;
196
197             if ( x - 1 < image.cols && connected(intensity, image(y, x + 1)))
198                 c |= RIGHT;
199
200             if ( y - 1 < image.rows && connected(intensity, image(y + 1, x)))
201                 c |= DOWN;
202
203             components(y, x) = c;
204         }
205
206         template< typename T>
207         void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
208         {
209             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
210             dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
211
212             typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
213
214             Int_t inInt(lo, hi);
215             computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
216
217             cudaSafeCall( cudaGetLastError() );
218             if (stream == 0)
219                 cudaSafeCall( cudaDeviceSynchronize() );
220         }
221
222         template void computeEdges<uchar>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
223         template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
224         template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
225         template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
226         template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
227         template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
228         template void computeEdges<int>    (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
229         template void computeEdges<float>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
230
231         __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
232         {
233             int x = threadIdx.x + blockIdx.x * TILE_COLS;
234             int y = threadIdx.y + blockIdx.y * TILE_ROWS;
235
236             if (x >= edges.cols || y >= edges.rows) return;
237
238             //currently x is 1
239             int bounds = ((y + TPB_Y) < edges.rows);
240
241             __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
242             __shared__ int  edgesTile[TILE_ROWS][TILE_COLS];
243
244             int new_labels[TPB_Y][TPB_X];
245             int old_labels[TPB_Y][TPB_X];
246
247             #pragma unroll
248             for (int i = 0; i < TPB_Y; ++i)
249                 #pragma unroll
250                 for (int j = 0; j < TPB_X; ++j)
251                 {
252                     int yloc = threadIdx.y + CTA_SIZE_Y * i;
253                     int xloc = threadIdx.x + CTA_SIZE_X * j;
254                     component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
255
256                     if (!xloc) c &= ~LEFT;
257                     if (!yloc) c &= ~UP;
258
259                     if (xloc == TILE_COLS -1) c &= ~RIGHT;
260                     if (yloc == TILE_ROWS -1) c &= ~DOWN;
261
262                     new_labels[i][j] = yloc * TILE_COLS + xloc;
263                     edgesTile[yloc][xloc] = c;
264                 }
265
266             for (int k = 0; ;++k)
267             {
268                 //1. backup
269                 #pragma unroll
270                 for (int i = 0; i < TPB_Y; ++i)
271                     #pragma unroll
272                     for (int j = 0; j < TPB_X; ++j)
273                     {
274                         int yloc = threadIdx.y + CTA_SIZE_Y * i;
275                         int xloc = threadIdx.x + CTA_SIZE_X * j;
276
277                         old_labels[i][j]       = new_labels[i][j];
278                         labelsTile[yloc][xloc] = new_labels[i][j];
279                     }
280
281                 __syncthreads();
282
283                 //2. compare local arrays
284                 #pragma unroll
285                 for (int i = 0; i < TPB_Y; ++i)
286                     #pragma unroll
287                     for (int j = 0; j < TPB_X; ++j)
288                     {
289                         int yloc = threadIdx.y + CTA_SIZE_Y * i;
290                         int xloc = threadIdx.x + CTA_SIZE_X * j;
291
292                         component c = edgesTile[yloc][xloc];
293                         int label = new_labels[i][j];
294
295                         if (c & UP)
296                            label = ::min(label, labelsTile[yloc - 1][xloc]);
297
298                         if (c &  DOWN)
299                            label = ::min(label, labelsTile[yloc + 1][xloc]);
300
301                         if (c & LEFT)
302                            label = ::min(label, labelsTile[yloc][xloc - 1]);
303
304                         if (c & RIGHT)
305                            label = ::min(label, labelsTile[yloc][xloc + 1]);
306
307                        new_labels[i][j] = label;
308                     }
309
310                 __syncthreads();
311
312                 //3. determine: Is any value changed?
313                 int changed = 0;
314                 #pragma unroll
315                 for (int i = 0; i < TPB_Y; ++i)
316                     #pragma unroll
317                     for (int j = 0; j < TPB_X; ++j)
318                     {
319                         if (new_labels[i][j] < old_labels[i][j])
320                         {
321                             changed = 1;
322                             Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
323                         }
324                     }
325
326                 changed = Emulation::syncthreadsOr(changed);
327
328                 if (!changed)
329                     break;
330
331                 //4. Compact paths
332                 const int *labels = &labelsTile[0][0];
333                 #pragma unroll
334                 for (int i = 0; i < TPB_Y; ++i)
335                     #pragma unroll
336                     for (int j = 0; j < TPB_X; ++j)
337                     {
338                         int label = new_labels[i][j];
339
340                         while( labels[label] < label ) label = labels[label];
341
342                         new_labels[i][j] = label;
343                     }
344                 __syncthreads();
345             }
346
347             #pragma unroll
348             for (int i = 0; i < TPB_Y; ++i)
349             #pragma unroll
350                 for (int j = 0; j < TPB_X; ++j)
351                 {
352                     int label = new_labels[i][j];
353                     int yloc = label / TILE_COLS;
354                     int xloc = label - yloc * TILE_COLS;
355
356                     xloc += blockIdx.x * TILE_COLS;
357                     yloc += blockIdx.y * TILE_ROWS;
358
359                     label = yloc * edges.cols + xloc;
360                     // do it for x too.
361                     if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
362                 }
363         }
364
365         __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
366         {
367             while(1)
368             {
369                 int y = label / comps.cols;
370                 int x = label - y * comps.cols;
371
372                 int parent = comps(y, x);
373
374                 if (label == parent) break;
375
376                 label = parent;
377             }
378             return label;
379         }
380
381         __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
382         {
383             int r1 = root(comps, l1);
384             int r2 = root(comps, l2);
385
386             if (r1 == r2) return;
387
388             int mi = ::min(r1, r2);
389             int ma = ::max(r1, r2);
390
391             int y = ma / comps.cols;
392             int x = ma - y * comps.cols;
393
394             atomicMin(&comps.ptr(y)[x], mi);
395             changed = true;
396         }
397
398         __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
399             const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
400         {
401             int tid = threadIdx.y * blockDim.x + threadIdx.x;
402             int stride = blockDim.y * blockDim.x;
403
404             int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
405             int yend   = ybegin + tilesNumY * tileSizeY;
406
407             if (blockIdx.y == gridDim.y - 1)
408             {
409                 yend -= yIncomplete * tileSizeY;
410                 yend -= tileSizeY;
411                 tileSizeY = (edges.rows % tileSizeY);
412
413                 yend += tileSizeY;
414             }
415
416             int xbegin = blockIdx.x * tilesNumX * tileSizeX;
417             int xend   = xbegin + tilesNumX * tileSizeX;
418
419             if (blockIdx.x == gridDim.x - 1)
420             {
421                 if (xIncomplete) yend = ybegin;
422                 xend -= xIncomplete * tileSizeX;
423                 xend -= tileSizeX;
424                 tileSizeX = (edges.cols % tileSizeX);
425
426                 xend += tileSizeX;
427             }
428
429             if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
430             {
431                 xend = xbegin;
432             }
433
434             int tasksV = (tilesNumX - 1) * (yend - ybegin);
435             int tasksH = (tilesNumY - 1) * (xend - xbegin);
436
437             int total = tasksH + tasksV;
438
439             bool changed;
440             do
441             {
442                 changed = false;
443                 for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
444                 {
445                     if (taskIdx < tasksH)
446                     {
447                         int indexH = taskIdx;
448
449                         int row = indexH / (xend - xbegin);
450                         int col = indexH - row * (xend - xbegin);
451
452                         int y = ybegin + (row + 1) * tileSizeY;
453                         int x = xbegin + col;
454
455                         component e = edges( x, y);
456                         if (e & UP)
457                         {
458                             int lc = comps(y,x);
459                             int lu = comps(y - 1, x);
460
461                             isConnected(comps, lc, lu, changed);
462                         }
463                     }
464                     else
465                     {
466                         int indexV = taskIdx - tasksH;
467
468                         int col = indexV / (yend - ybegin);
469                         int row = indexV - col * (yend - ybegin);
470
471                         int x = xbegin + (col + 1) * tileSizeX;
472                         int y = ybegin + row;
473
474                         component e = edges(x, y);
475                         if (e & LEFT)
476                         {
477                             int lc = comps(y, x);
478                             int ll = comps(y, x - 1);
479
480                             isConnected(comps, lc, ll, changed);
481                         }
482                     }
483                 }
484             } while (Emulation::syncthreadsOr(changed));
485         }
486
487         __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
488         {
489             int x = threadIdx.x + blockIdx.x * blockDim.x;
490             int y = threadIdx.y + blockIdx.y * blockDim.y;
491
492             if( x < comps.cols && y < comps.rows)
493                 comps(y, x) = root(comps, comps(y, x));
494         }
495
496         enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
497
498         void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
499         {
500             (void) flags;
501             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
502             dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
503
504             lableTiles<<<grid, block, 0, stream>>>(edges, comps);
505             cudaSafeCall( cudaGetLastError() );
506
507             int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
508             while (grid.x > 1 || grid.y > 1)
509             {
510                 dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
511                 dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
512                 // debug log
513                 // std::cout << "merging: " << grid.y  << " x " << grid.x << " ---> " << mergeGrid.y <<  " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
514                 crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
515                 tileSizeX <<= 1;
516                 tileSizeY <<= 1;
517                 grid = mergeGrid;
518
519                 cudaSafeCall( cudaGetLastError() );
520             }
521
522             grid.x = divUp(edges.cols, block.x);
523             grid.y = divUp(edges.rows, block.y);
524             flatten<<<grid, block, 0, stream>>>(edges, comps);
525             cudaSafeCall( cudaGetLastError() );
526
527             if (stream == 0)
528                 cudaSafeCall( cudaDeviceSynchronize() );
529         }
530     }
531 } } }
532
533 #endif /* CUDA_DISABLER */