1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistributions of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistributions in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
42 #if !defined CUDA_DISABLER
44 #include <opencv2/gpu/device/common.hpp>
45 #include <opencv2/gpu/device/vec_traits.hpp>
46 #include <opencv2/gpu/device/vec_math.hpp>
47 #include <opencv2/gpu/device/emulation.hpp>
52 namespace cv { namespace gpu { namespace device
65 STA_SIZE_MERGE_X = 32,
70 TILE_COLS = CTA_SIZE_X * TPB_X,
71 TILE_ROWS = CTA_SIZE_Y * TPB_Y
74 template<typename T> struct IntervalsTraits
79 template<> struct IntervalsTraits<unsigned char>
81 typedef int dist_type;
85 template<> struct IntervalsTraits<uchar3>
87 typedef int3 dist_type;
91 template<> struct IntervalsTraits<uchar4>
93 typedef int4 dist_type;
97 template<> struct IntervalsTraits<unsigned short>
99 typedef int dist_type;
103 template<> struct IntervalsTraits<ushort3>
105 typedef int3 dist_type;
109 template<> struct IntervalsTraits<ushort4>
111 typedef int4 dist_type;
115 template<> struct IntervalsTraits<float>
117 typedef float dist_type;
121 template<> struct IntervalsTraits<int>
123 typedef int dist_type;
127 typedef unsigned char component;
128 enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
130 template<typename T, int CH> struct InInterval {};
132 template<typename T> struct InInterval<T, 1>
134 typedef typename VecTraits<T>::elem_type E;
135 __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {};
138 template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
141 return lo <= d && d <= hi;
146 template<typename T> struct InInterval<T, 3>
148 typedef typename VecTraits<T>::elem_type E;
149 __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
150 : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)){};
153 template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
156 return lo.x <= d.x && d.x <= hi.x &&
157 lo.y <= d.y && d.y <= hi.y &&
158 lo.z <= d.z && d.z <= hi.z;
162 template<typename T> struct InInterval<T, 4>
164 typedef typename VecTraits<T>::elem_type E;
165 __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
166 : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)){};
169 template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
172 return lo.x <= d.x && d.x <= hi.x &&
173 lo.y <= d.y && d.y <= hi.y &&
174 lo.z <= d.z && d.z <= hi.z &&
175 lo.w <= d.w && d.w <= hi.w;
180 template<typename T, typename F>
181 __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
183 int x = threadIdx.x + blockIdx.x * blockDim.x;
184 int y = threadIdx.y + blockIdx.y * blockDim.y;
186 if (x >= image.cols || y >= image.rows) return;
188 T intensity = image(y, x);
191 if ( x > 0 && connected(intensity, image(y, x - 1)))
194 if ( y > 0 && connected(intensity, image(y - 1, x)))
197 if ( x - 1 < image.cols && connected(intensity, image(y, x + 1)))
200 if ( y - 1 < image.rows && connected(intensity, image(y + 1, x)))
203 components(y, x) = c;
206 template< typename T>
207 void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
209 dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
210 dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
212 typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
215 computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
217 cudaSafeCall( cudaGetLastError() );
219 cudaSafeCall( cudaDeviceSynchronize() );
222 template void computeEdges<uchar> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
223 template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
224 template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
225 template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
226 template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
227 template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
228 template void computeEdges<int> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
229 template void computeEdges<float> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
231 __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
233 int x = threadIdx.x + blockIdx.x * TILE_COLS;
234 int y = threadIdx.y + blockIdx.y * TILE_ROWS;
236 if (x >= edges.cols || y >= edges.rows) return;
239 int bounds = ((y + TPB_Y) < edges.rows);
241 __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
242 __shared__ int edgesTile[TILE_ROWS][TILE_COLS];
244 int new_labels[TPB_Y][TPB_X];
245 int old_labels[TPB_Y][TPB_X];
248 for (int i = 0; i < TPB_Y; ++i)
250 for (int j = 0; j < TPB_X; ++j)
252 int yloc = threadIdx.y + CTA_SIZE_Y * i;
253 int xloc = threadIdx.x + CTA_SIZE_X * j;
254 component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
256 if (!xloc) c &= ~LEFT;
259 if (xloc == TILE_COLS -1) c &= ~RIGHT;
260 if (yloc == TILE_ROWS -1) c &= ~DOWN;
262 new_labels[i][j] = yloc * TILE_COLS + xloc;
263 edgesTile[yloc][xloc] = c;
266 for (int k = 0; ;++k)
270 for (int i = 0; i < TPB_Y; ++i)
272 for (int j = 0; j < TPB_X; ++j)
274 int yloc = threadIdx.y + CTA_SIZE_Y * i;
275 int xloc = threadIdx.x + CTA_SIZE_X * j;
277 old_labels[i][j] = new_labels[i][j];
278 labelsTile[yloc][xloc] = new_labels[i][j];
283 //2. compare local arrays
285 for (int i = 0; i < TPB_Y; ++i)
287 for (int j = 0; j < TPB_X; ++j)
289 int yloc = threadIdx.y + CTA_SIZE_Y * i;
290 int xloc = threadIdx.x + CTA_SIZE_X * j;
292 component c = edgesTile[yloc][xloc];
293 int label = new_labels[i][j];
296 label = ::min(label, labelsTile[yloc - 1][xloc]);
299 label = ::min(label, labelsTile[yloc + 1][xloc]);
302 label = ::min(label, labelsTile[yloc][xloc - 1]);
305 label = ::min(label, labelsTile[yloc][xloc + 1]);
307 new_labels[i][j] = label;
312 //3. determine: Is any value changed?
315 for (int i = 0; i < TPB_Y; ++i)
317 for (int j = 0; j < TPB_X; ++j)
319 if (new_labels[i][j] < old_labels[i][j])
322 Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
326 changed = Emulation::syncthreadsOr(changed);
332 const int *labels = &labelsTile[0][0];
334 for (int i = 0; i < TPB_Y; ++i)
336 for (int j = 0; j < TPB_X; ++j)
338 int label = new_labels[i][j];
340 while( labels[label] < label ) label = labels[label];
342 new_labels[i][j] = label;
348 for (int i = 0; i < TPB_Y; ++i)
350 for (int j = 0; j < TPB_X; ++j)
352 int label = new_labels[i][j];
353 int yloc = label / TILE_COLS;
354 int xloc = label - yloc * TILE_COLS;
356 xloc += blockIdx.x * TILE_COLS;
357 yloc += blockIdx.y * TILE_ROWS;
359 label = yloc * edges.cols + xloc;
361 if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
365 __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
369 int y = label / comps.cols;
370 int x = label - y * comps.cols;
372 int parent = comps(y, x);
374 if (label == parent) break;
381 __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
383 int r1 = root(comps, l1);
384 int r2 = root(comps, l2);
386 if (r1 == r2) return;
388 int mi = ::min(r1, r2);
389 int ma = ::max(r1, r2);
391 int y = ma / comps.cols;
392 int x = ma - y * comps.cols;
394 atomicMin(&comps.ptr(y)[x], mi);
398 __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
399 const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
401 int tid = threadIdx.y * blockDim.x + threadIdx.x;
402 int stride = blockDim.y * blockDim.x;
404 int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
405 int yend = ybegin + tilesNumY * tileSizeY;
407 if (blockIdx.y == gridDim.y - 1)
409 yend -= yIncomplete * tileSizeY;
411 tileSizeY = (edges.rows % tileSizeY);
416 int xbegin = blockIdx.x * tilesNumX * tileSizeX;
417 int xend = xbegin + tilesNumX * tileSizeX;
419 if (blockIdx.x == gridDim.x - 1)
421 if (xIncomplete) yend = ybegin;
422 xend -= xIncomplete * tileSizeX;
424 tileSizeX = (edges.cols % tileSizeX);
429 if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
434 int tasksV = (tilesNumX - 1) * (yend - ybegin);
435 int tasksH = (tilesNumY - 1) * (xend - xbegin);
437 int total = tasksH + tasksV;
443 for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
445 if (taskIdx < tasksH)
447 int indexH = taskIdx;
449 int row = indexH / (xend - xbegin);
450 int col = indexH - row * (xend - xbegin);
452 int y = ybegin + (row + 1) * tileSizeY;
453 int x = xbegin + col;
455 component e = edges( x, y);
459 int lu = comps(y - 1, x);
461 isConnected(comps, lc, lu, changed);
466 int indexV = taskIdx - tasksH;
468 int col = indexV / (yend - ybegin);
469 int row = indexV - col * (yend - ybegin);
471 int x = xbegin + (col + 1) * tileSizeX;
472 int y = ybegin + row;
474 component e = edges(x, y);
477 int lc = comps(y, x);
478 int ll = comps(y, x - 1);
480 isConnected(comps, lc, ll, changed);
484 } while (Emulation::syncthreadsOr(changed));
487 __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
489 int x = threadIdx.x + blockIdx.x * blockDim.x;
490 int y = threadIdx.y + blockIdx.y * blockDim.y;
492 if( x < comps.cols && y < comps.rows)
493 comps(y, x) = root(comps, comps(y, x));
496 enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
498 void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
501 dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
502 dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
504 lableTiles<<<grid, block, 0, stream>>>(edges, comps);
505 cudaSafeCall( cudaGetLastError() );
507 int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
508 while (grid.x > 1 || grid.y > 1)
510 dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
511 dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
513 // std::cout << "merging: " << grid.y << " x " << grid.x << " ---> " << mergeGrid.y << " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
514 crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
519 cudaSafeCall( cudaGetLastError() );
522 grid.x = divUp(edges.cols, block.x);
523 grid.y = divUp(edges.rows, block.y);
524 flatten<<<grid, block, 0, stream>>>(edges, comps);
525 cudaSafeCall( cudaGetLastError() );
528 cudaSafeCall( cudaDeviceSynchronize() );
533 #endif /* CUDA_DISABLER */