modules/gpu/src/cuda/ccomponetns.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 // By downloading, copying, installing or using the software you agree to this license.
   6 // If you do not agree to this license, do not download, install,
   7 // copy or use the software.
   8 //
   9 //
  10 //                          License Agreement
  11 //               For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //  * Redistributions of source code must retain the above copyright notice,
  21 //    this list of conditions and the following disclaimer.
  22 //
  23 //  * Redistributions in binary form must reproduce the above copyright notice,
  24 //    this list of conditions and the following disclaimer in the documentation
  25 //    and/or other materials provided with the distribution.
  26 //
  27 //  * The name of the copyright holders may not be used to endorse or promote products
  28 //    derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //M*/
  41
  42 #if !defined CUDA_DISABLER
  43
  44 #include <opencv2/gpu/device/common.hpp>
  45 #include <opencv2/gpu/device/vec_traits.hpp>
  46 #include <opencv2/gpu/device/vec_math.hpp>
  47 #include <opencv2/gpu/device/emulation.hpp>
  48
  49 #include <iostream>
  50 #include <stdio.h>
  51
  52 namespace cv { namespace gpu { namespace device
  53 {
  54     namespace ccl
  55     {
  56         enum
  57         {
  58             WARP_SIZE  = 32,
  59             WARP_LOG   = 5,
  60
  61             CTA_SIZE_X = 32,
  62             CTA_SIZE_Y = 8,
  63
  64             STA_SIZE_MERGE_Y = 4,
  65             STA_SIZE_MERGE_X = 32,
  66
  67             TPB_X = 1,
  68             TPB_Y = 4,
  69
  70             TILE_COLS = CTA_SIZE_X * TPB_X,
  71             TILE_ROWS = CTA_SIZE_Y * TPB_Y
  72         };
  73
  74         template<typename T> struct IntervalsTraits
  75         {
  76             typedef T elem_type;
  77         };
  78
  79         template<> struct IntervalsTraits<unsigned char>
  80         {
  81             typedef int dist_type;
  82             enum {ch = 1};
  83         };
  84
  85         template<> struct IntervalsTraits<uchar3>
  86         {
  87             typedef int3 dist_type;
  88             enum {ch = 3};
  89         };
  90
  91         template<> struct IntervalsTraits<uchar4>
  92         {
  93             typedef int4 dist_type;
  94             enum {ch = 4};
  95         };
  96
  97         template<> struct IntervalsTraits<unsigned short>
  98         {
  99             typedef int dist_type;
 100             enum {ch = 1};
 101         };
 102
 103         template<> struct IntervalsTraits<ushort3>
 104         {
 105             typedef int3 dist_type;
 106             enum {ch = 3};
 107         };
 108
 109         template<> struct IntervalsTraits<ushort4>
 110         {
 111             typedef int4 dist_type;
 112             enum {ch = 4};
 113         };
 114
 115         template<> struct IntervalsTraits<float>
 116         {
 117             typedef float dist_type;
 118             enum {ch = 1};
 119         };
 120
 121         template<> struct IntervalsTraits<int>
 122         {
 123             typedef int dist_type;
 124             enum {ch = 1};
 125         };
 126
 127         typedef unsigned char component;
 128         enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
 129
 130         template<typename T, int CH> struct InInterval {};
 131
 132         template<typename T> struct InInterval<T, 1>
 133         {
 134             typedef typename VecTraits<T>::elem_type E;
 135             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {};
 136             T lo, hi;
 137
 138             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
 139             {
 140                 I d = a - b;
 141                 return lo <= d && d <= hi;
 142             }
 143         };
 144
 145
 146         template<typename T> struct InInterval<T, 3>
 147         {
 148             typedef typename VecTraits<T>::elem_type E;
 149             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
 150             : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)){};
 151             T lo, hi;
 152
 153             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
 154             {
 155                 I d = a - b;
 156                 return lo.x <= d.x && d.x <= hi.x &&
 157                        lo.y <= d.y && d.y <= hi.y &&
 158                        lo.z <= d.z && d.z <= hi.z;
 159             }
 160         };
 161
 162         template<typename T> struct InInterval<T, 4>
 163         {
 164             typedef typename VecTraits<T>::elem_type E;
 165             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
 166             : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)){};
 167             T lo, hi;
 168
 169             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
 170             {
 171                 I d = a - b;
 172                 return lo.x <= d.x && d.x <= hi.x &&
 173                        lo.y <= d.y && d.y <= hi.y &&
 174                        lo.z <= d.z && d.z <= hi.z &&
 175                        lo.w <= d.w && d.w <= hi.w;
 176             }
 177         };
 178
 179
 180         template<typename T, typename F>
 181         __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
 182         {
 183             int x = threadIdx.x + blockIdx.x * blockDim.x;
 184             int y = threadIdx.y + blockIdx.y * blockDim.y;
 185
 186             if (x >= image.cols || y >= image.rows) return;
 187
 188             T intensity = image(y, x);
 189             component c = 0;
 190
 191             if ( x > 0 && connected(intensity, image(y, x - 1)))
 192                 c |= LEFT;
 193
 194             if ( y > 0 && connected(intensity, image(y - 1, x)))
 195                 c |= UP;
 196
 197             if ( x - 1 < image.cols && connected(intensity, image(y, x + 1)))
 198                 c |= RIGHT;
 199
 200             if ( y - 1 < image.rows && connected(intensity, image(y + 1, x)))
 201                 c |= DOWN;
 202
 203             components(y, x) = c;
 204         }
 205
 206         template< typename T>
 207         void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
 208         {
 209             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
 210             dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
 211
 212             typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
 213
 214             Int_t inInt(lo, hi);
 215             computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
 216
 217             cudaSafeCall( cudaGetLastError() );
 218             if (stream == 0)
 219                 cudaSafeCall( cudaDeviceSynchronize() );
 220         }
 221
 222         template void computeEdges<uchar>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
 223         template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
 224         template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
 225         template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
 226         template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
 227         template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
 228         template void computeEdges<int>    (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
 229         template void computeEdges<float>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
 230
 231         __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
 232         {
 233             int x = threadIdx.x + blockIdx.x * TILE_COLS;
 234             int y = threadIdx.y + blockIdx.y * TILE_ROWS;
 235
 236             if (x >= edges.cols || y >= edges.rows) return;
 237
 238             //currently x is 1
 239             int bounds = ((y + TPB_Y) < edges.rows);
 240
 241             __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
 242             __shared__ int  edgesTile[TILE_ROWS][TILE_COLS];
 243
 244             int new_labels[TPB_Y][TPB_X];
 245             int old_labels[TPB_Y][TPB_X];
 246
 247             #pragma unroll
 248             for (int i = 0; i < TPB_Y; ++i)
 249                 #pragma unroll
 250                 for (int j = 0; j < TPB_X; ++j)
 251                 {
 252                     int yloc = threadIdx.y + CTA_SIZE_Y * i;
 253                     int xloc = threadIdx.x + CTA_SIZE_X * j;
 254                     component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
 255
 256                     if (!xloc) c &= ~LEFT;
 257                     if (!yloc) c &= ~UP;
 258
 259                     if (xloc == TILE_COLS -1) c &= ~RIGHT;
 260                     if (yloc == TILE_ROWS -1) c &= ~DOWN;
 261
 262                     new_labels[i][j] = yloc * TILE_COLS + xloc;
 263                     edgesTile[yloc][xloc] = c;
 264                 }
 265
 266             for (int k = 0; ;++k)
 267             {
 268                 //1. backup
 269                 #pragma unroll
 270                 for (int i = 0; i < TPB_Y; ++i)
 271                     #pragma unroll
 272                     for (int j = 0; j < TPB_X; ++j)
 273                     {
 274                         int yloc = threadIdx.y + CTA_SIZE_Y * i;
 275                         int xloc = threadIdx.x + CTA_SIZE_X * j;
 276
 277                         old_labels[i][j]       = new_labels[i][j];
 278                         labelsTile[yloc][xloc] = new_labels[i][j];
 279                     }
 280
 281                 __syncthreads();
 282
 283                 //2. compare local arrays
 284                 #pragma unroll
 285                 for (int i = 0; i < TPB_Y; ++i)
 286                     #pragma unroll
 287                     for (int j = 0; j < TPB_X; ++j)
 288                     {
 289                         int yloc = threadIdx.y + CTA_SIZE_Y * i;
 290                         int xloc = threadIdx.x + CTA_SIZE_X * j;
 291
 292                         component c = edgesTile[yloc][xloc];
 293                         int label = new_labels[i][j];
 294
 295                         if (c & UP)
 296                            label = ::min(label, labelsTile[yloc - 1][xloc]);
 297
 298                         if (c &  DOWN)
 299                            label = ::min(label, labelsTile[yloc + 1][xloc]);
 300
 301                         if (c & LEFT)
 302                            label = ::min(label, labelsTile[yloc][xloc - 1]);
 303
 304                         if (c & RIGHT)
 305                            label = ::min(label, labelsTile[yloc][xloc + 1]);
 306
 307                        new_labels[i][j] = label;
 308                     }
 309
 310                 __syncthreads();
 311
 312                 //3. determine: Is any value changed?
 313                 int changed = 0;
 314                 #pragma unroll
 315                 for (int i = 0; i < TPB_Y; ++i)
 316                     #pragma unroll
 317                     for (int j = 0; j < TPB_X; ++j)
 318                     {
 319                         if (new_labels[i][j] < old_labels[i][j])
 320                         {
 321                             changed = 1;
 322                             Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
 323                         }
 324                     }
 325
 326                 changed = Emulation::syncthreadsOr(changed);
 327
 328                 if (!changed)
 329                     break;
 330
 331                 //4. Compact paths
 332                 const int *labels = &labelsTile[0][0];
 333                 #pragma unroll
 334                 for (int i = 0; i < TPB_Y; ++i)
 335                     #pragma unroll
 336                     for (int j = 0; j < TPB_X; ++j)
 337                     {
 338                         int label = new_labels[i][j];
 339
 340                         while( labels[label] < label ) label = labels[label];
 341
 342                         new_labels[i][j] = label;
 343                     }
 344                 __syncthreads();
 345             }
 346
 347             #pragma unroll
 348             for (int i = 0; i < TPB_Y; ++i)
 349             #pragma unroll
 350                 for (int j = 0; j < TPB_X; ++j)
 351                 {
 352                     int label = new_labels[i][j];
 353                     int yloc = label / TILE_COLS;
 354                     int xloc = label - yloc * TILE_COLS;
 355
 356                     xloc += blockIdx.x * TILE_COLS;
 357                     yloc += blockIdx.y * TILE_ROWS;
 358
 359                     label = yloc * edges.cols + xloc;
 360                     // do it for x too.
 361                     if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
 362                 }
 363         }
 364
 365         __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
 366         {
 367             while(1)
 368             {
 369                 int y = label / comps.cols;
 370                 int x = label - y * comps.cols;
 371
 372                 int parent = comps(y, x);
 373
 374                 if (label == parent) break;
 375
 376                 label = parent;
 377             }
 378             return label;
 379         }
 380
 381         __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
 382         {
 383             int r1 = root(comps, l1);
 384             int r2 = root(comps, l2);
 385
 386             if (r1 == r2) return;
 387
 388             int mi = ::min(r1, r2);
 389             int ma = ::max(r1, r2);
 390
 391             int y = ma / comps.cols;
 392             int x = ma - y * comps.cols;
 393
 394             atomicMin(&comps.ptr(y)[x], mi);
 395             changed = true;
 396         }
 397
 398         __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
 399             const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
 400         {
 401             int tid = threadIdx.y * blockDim.x + threadIdx.x;
 402             int stride = blockDim.y * blockDim.x;
 403
 404             int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
 405             int yend   = ybegin + tilesNumY * tileSizeY;
 406
 407             if (blockIdx.y == gridDim.y - 1)
 408             {
 409                 yend -= yIncomplete * tileSizeY;
 410                 yend -= tileSizeY;
 411                 tileSizeY = (edges.rows % tileSizeY);
 412
 413                 yend += tileSizeY;
 414             }
 415
 416             int xbegin = blockIdx.x * tilesNumX * tileSizeX;
 417             int xend   = xbegin + tilesNumX * tileSizeX;
 418
 419             if (blockIdx.x == gridDim.x - 1)
 420             {
 421                 if (xIncomplete) yend = ybegin;
 422                 xend -= xIncomplete * tileSizeX;
 423                 xend -= tileSizeX;
 424                 tileSizeX = (edges.cols % tileSizeX);
 425
 426                 xend += tileSizeX;
 427             }
 428
 429             if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
 430             {
 431                 xend = xbegin;
 432             }
 433
 434             int tasksV = (tilesNumX - 1) * (yend - ybegin);
 435             int tasksH = (tilesNumY - 1) * (xend - xbegin);
 436
 437             int total = tasksH + tasksV;
 438
 439             bool changed;
 440             do
 441             {
 442                 changed = false;
 443                 for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
 444                 {
 445                     if (taskIdx < tasksH)
 446                     {
 447                         int indexH = taskIdx;
 448
 449                         int row = indexH / (xend - xbegin);
 450                         int col = indexH - row * (xend - xbegin);
 451
 452                         int y = ybegin + (row + 1) * tileSizeY;
 453                         int x = xbegin + col;
 454
 455                         component e = edges( x, y);
 456                         if (e & UP)
 457                         {
 458                             int lc = comps(y,x);
 459                             int lu = comps(y - 1, x);
 460
 461                             isConnected(comps, lc, lu, changed);
 462                         }
 463                     }
 464                     else
 465                     {
 466                         int indexV = taskIdx - tasksH;
 467
 468                         int col = indexV / (yend - ybegin);
 469                         int row = indexV - col * (yend - ybegin);
 470
 471                         int x = xbegin + (col + 1) * tileSizeX;
 472                         int y = ybegin + row;
 473
 474                         component e = edges(x, y);
 475                         if (e & LEFT)
 476                         {
 477                             int lc = comps(y, x);
 478                             int ll = comps(y, x - 1);
 479
 480                             isConnected(comps, lc, ll, changed);
 481                         }
 482                     }
 483                 }
 484             } while (Emulation::syncthreadsOr(changed));
 485         }
 486
 487         __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
 488         {
 489             int x = threadIdx.x + blockIdx.x * blockDim.x;
 490             int y = threadIdx.y + blockIdx.y * blockDim.y;
 491
 492             if( x < comps.cols && y < comps.rows)
 493                 comps(y, x) = root(comps, comps(y, x));
 494         }
 495
 496         enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
 497
 498         void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
 499         {
 500             (void) flags;
 501             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
 502             dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
 503
 504             lableTiles<<<grid, block, 0, stream>>>(edges, comps);
 505             cudaSafeCall( cudaGetLastError() );
 506
 507             int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
 508             while (grid.x > 1 || grid.y > 1)
 509             {
 510                 dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
 511                 dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
 512                 // debug log
 513                 // std::cout << "merging: " << grid.y  << " x " << grid.x << " ---> " << mergeGrid.y <<  " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
 514                 crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
 515                 tileSizeX <<= 1;
 516                 tileSizeY <<= 1;
 517                 grid = mergeGrid;
 518
 519                 cudaSafeCall( cudaGetLastError() );
 520             }
 521
 522             grid.x = divUp(edges.cols, block.x);
 523             grid.y = divUp(edges.rows, block.y);
 524             flatten<<<grid, block, 0, stream>>>(edges, comps);
 525             cudaSafeCall( cudaGetLastError() );
 526
 527             if (stream == 0)
 528                 cudaSafeCall( cudaDeviceSynchronize() );
 529         }
 530     }
 531 } } }
 532
 533 #endif /* CUDA_DISABLER */