modules/gpu/src/cuda/canny.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #if !defined CUDA_DISABLER
  44
  45 #include <utility>
  46 #include <algorithm>//std::swap
  47 #include "opencv2/gpu/device/common.hpp"
  48 #include "opencv2/gpu/device/emulation.hpp"
  49 #include "opencv2/gpu/device/transform.hpp"
  50 #include "opencv2/gpu/device/functional.hpp"
  51 #include "opencv2/gpu/device/utility.hpp"
  52
  53 using namespace cv::gpu;
  54 using namespace cv::gpu::device;
  55
  56 namespace canny
  57 {
  58     struct L1 : binary_function<int, int, float>
  59     {
  60         __device__ __forceinline__ float operator ()(int x, int y) const
  61         {
  62             return ::abs(x) + ::abs(y);
  63         }
  64
  65         __device__ __forceinline__ L1() {}
  66         __device__ __forceinline__ L1(const L1&) {}
  67     };
  68     struct L2 : binary_function<int, int, float>
  69     {
  70         __device__ __forceinline__ float operator ()(int x, int y) const
  71         {
  72             return ::sqrtf(x * x + y * y);
  73         }
  74
  75         __device__ __forceinline__ L2() {}
  76         __device__ __forceinline__ L2(const L2&) {}
  77     };
  78 }
  79
  80 namespace cv { namespace gpu { namespace device
  81 {
  82     template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
  83     {
  84         enum { smart_shift = 4 };
  85     };
  86     template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
  87     {
  88         enum { smart_shift = 4 };
  89     };
  90 }}}
  91
  92 namespace canny
  93 {
  94     texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
  95     struct SrcTex
  96     {
  97         const int xoff;
  98         const int yoff;
  99         __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
 100
 101         __device__ __forceinline__ int operator ()(int y, int x) const
 102         {
 103             return tex2D(tex_src, x + xoff, y + yoff);
 104         }
 105     };
 106
 107     template <class Norm> __global__
 108     void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
 109     {
 110         const int x = blockIdx.x * blockDim.x + threadIdx.x;
 111         const int y = blockIdx.y * blockDim.y + threadIdx.y;
 112
 113         if (y >= mag.rows || x >= mag.cols)
 114             return;
 115
 116         int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
 117         int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
 118
 119         dx(y, x) = dxVal;
 120         dy(y, x) = dyVal;
 121
 122         mag(y, x) = norm(dxVal, dyVal);
 123     }
 124
 125     void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
 126     {
 127         const dim3 block(16, 16);
 128         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
 129
 130         bindTexture(&tex_src, srcWhole);
 131         SrcTex src(xoff, yoff);
 132
 133         if (L2Grad)
 134         {
 135             L2 norm;
 136             calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
 137         }
 138         else
 139         {
 140             L1 norm;
 141             calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
 142         }
 143
 144         cudaSafeCall( cudaGetLastError() );
 145
 146         cudaSafeCall(cudaThreadSynchronize());
 147     }
 148
 149     void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
 150     {
 151         if (L2Grad)
 152         {
 153             L2 norm;
 154             transform(dx, dy, mag, norm, WithOutMask(), 0);
 155         }
 156         else
 157         {
 158             L1 norm;
 159             transform(dx, dy, mag, norm, WithOutMask(), 0);
 160         }
 161     }
 162 }
 163
 164 //////////////////////////////////////////////////////////////////////////////////////////
 165
 166 namespace canny
 167 {
 168     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
 169
 170     __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
 171     {
 172         const int CANNY_SHIFT = 15;
 173         const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
 174
 175         const int x = blockIdx.x * blockDim.x + threadIdx.x;
 176         const int y = blockIdx.y * blockDim.y + threadIdx.y;
 177
 178         if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
 179             return;
 180
 181         int dxVal = dx(y, x);
 182         int dyVal = dy(y, x);
 183
 184         const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
 185         const float m = tex2D(tex_mag, x, y);
 186
 187         dxVal = ::abs(dxVal);
 188         dyVal = ::abs(dyVal);
 189
 190         // 0 - the pixel can not belong to an edge
 191         // 1 - the pixel might belong to an edge
 192         // 2 - the pixel does belong to an edge
 193         int edge_type = 0;
 194
 195         if (m > low_thresh)
 196         {
 197             const int tg22x = dxVal * TG22;
 198             const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
 199
 200             dyVal <<= CANNY_SHIFT;
 201
 202             if (dyVal < tg22x)
 203             {
 204                 if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
 205                     edge_type = 1 + (int)(m > high_thresh);
 206             }
 207             else if(dyVal > tg67x)
 208             {
 209                 if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
 210                     edge_type = 1 + (int)(m > high_thresh);
 211             }
 212             else
 213             {
 214                 if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
 215                     edge_type = 1 + (int)(m > high_thresh);
 216             }
 217         }
 218
 219         map(y, x) = edge_type;
 220     }
 221
 222     void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
 223     {
 224         const dim3 block(16, 16);
 225         const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
 226
 227         bindTexture(&tex_mag, mag);
 228
 229         calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
 230         cudaSafeCall( cudaGetLastError() );
 231
 232         cudaSafeCall( cudaDeviceSynchronize() );
 233     }
 234 }
 235
 236 //////////////////////////////////////////////////////////////////////////////////////////
 237
 238 namespace canny
 239 {
 240     __device__ int counter = 0;
 241
 242     __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
 243     {
 244         __shared__ volatile int smem[18][18];
 245
 246         const int x = blockIdx.x * blockDim.x + threadIdx.x;
 247         const int y = blockIdx.y * blockDim.y + threadIdx.y;
 248
 249         smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
 250         if (threadIdx.y == 0)
 251             smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
 252         if (threadIdx.y == blockDim.y - 1)
 253             smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
 254         if (threadIdx.x == 0)
 255             smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
 256         if (threadIdx.x == blockDim.x - 1)
 257             smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
 258         if (threadIdx.x == 0 && threadIdx.y == 0)
 259             smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
 260         if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
 261             smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
 262         if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
 263             smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
 264         if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
 265             smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
 266
 267         __syncthreads();
 268
 269         if (x >= map.cols || y >= map.rows)
 270             return;
 271
 272         int n;
 273
 274         #pragma unroll
 275         for (int k = 0; k < 16; ++k)
 276         {
 277             n = 0;
 278
 279             if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
 280             {
 281                 n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
 282                 n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
 283                 n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
 284
 285                 n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
 286                 n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
 287
 288                 n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
 289                 n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
 290                 n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
 291             }
 292
 293             if (n > 0)
 294                 smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
 295         }
 296
 297         const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
 298
 299         map(y, x) = e;
 300
 301         n = 0;
 302
 303         if (e == 2)
 304         {
 305             n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
 306             n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
 307             n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
 308
 309             n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
 310             n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
 311
 312             n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
 313             n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
 314             n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
 315         }
 316
 317         if (n > 0)
 318         {
 319             const int ind =  ::atomicAdd(&counter, 1);
 320             st[ind] = make_ushort2(x, y);
 321         }
 322     }
 323
 324     void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
 325     {
 326         void* counter_ptr;
 327         cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
 328
 329         cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
 330
 331         const dim3 block(16, 16);
 332         const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
 333
 334         edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
 335         cudaSafeCall( cudaGetLastError() );
 336
 337         cudaSafeCall( cudaDeviceSynchronize() );
 338     }
 339 }
 340
 341 //////////////////////////////////////////////////////////////////////////////////////////
 342
 343 namespace canny
 344 {
 345     __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
 346     __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 347
 348     __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
 349     {
 350         const int stack_size = 512;
 351
 352         __shared__ int s_counter;
 353         __shared__ int s_ind;
 354         __shared__ ushort2 s_st[stack_size];
 355
 356         if (threadIdx.x == 0)
 357             s_counter = 0;
 358
 359         __syncthreads();
 360
 361         int ind = blockIdx.y * gridDim.x + blockIdx.x;
 362
 363         if (ind >= count)
 364             return;
 365
 366         ushort2 pos = st1[ind];
 367
 368         if (threadIdx.x < 8)
 369         {
 370             pos.x += c_dx[threadIdx.x];
 371             pos.y += c_dy[threadIdx.x];
 372
 373             if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
 374             {
 375                 map(pos.y, pos.x) = 2;
 376
 377                 ind = Emulation::smem::atomicAdd(&s_counter, 1);
 378
 379                 s_st[ind] = pos;
 380             }
 381         }
 382
 383         __syncthreads();
 384
 385         while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
 386         {
 387             const int subTaskIdx = threadIdx.x >> 3;
 388             const int portion = ::min(s_counter, blockDim.x >> 3);
 389
 390             if (subTaskIdx < portion)
 391                 pos = s_st[s_counter - 1 - subTaskIdx];
 392
 393             __syncthreads();
 394
 395             if (threadIdx.x == 0)
 396                 s_counter -= portion;
 397
 398             __syncthreads();
 399
 400             if (subTaskIdx < portion)
 401             {
 402                 pos.x += c_dx[threadIdx.x & 7];
 403                 pos.y += c_dy[threadIdx.x & 7];
 404
 405                 if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
 406                 {
 407                     map(pos.y, pos.x) = 2;
 408
 409                     ind = Emulation::smem::atomicAdd(&s_counter, 1);
 410
 411                     s_st[ind] = pos;
 412                 }
 413             }
 414
 415             __syncthreads();
 416         }
 417
 418         if (s_counter > 0)
 419         {
 420             if (threadIdx.x == 0)
 421             {
 422                 ind = ::atomicAdd(&counter, s_counter);
 423                 s_ind = ind - s_counter;
 424             }
 425
 426             __syncthreads();
 427
 428             ind = s_ind;
 429
 430             for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
 431                 st2[ind + i] = s_st[i];
 432         }
 433     }
 434
 435     void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
 436     {
 437         void* counter_ptr;
 438         cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
 439
 440         int count;
 441         cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
 442
 443         while (count > 0)
 444         {
 445             cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
 446
 447             const dim3 block(128);
 448             const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
 449
 450             edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
 451             cudaSafeCall( cudaGetLastError() );
 452
 453             cudaSafeCall( cudaDeviceSynchronize() );
 454
 455             cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
 456
 457             std::swap(st1, st2);
 458         }
 459     }
 460 }
 461
 462 //////////////////////////////////////////////////////////////////////////////////////////
 463
 464 namespace canny
 465 {
 466     struct GetEdges : unary_function<int, uchar>
 467     {
 468         __device__ __forceinline__ uchar operator ()(int e) const
 469         {
 470             return (uchar)(-(e >> 1));
 471         }
 472
 473         __device__ __forceinline__ GetEdges() {}
 474         __device__ __forceinline__ GetEdges(const GetEdges&) {}
 475     };
 476 }
 477
 478 namespace cv { namespace gpu { namespace device
 479 {
 480     template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
 481     {
 482         enum { smart_shift = 4 };
 483     };
 484 }}}
 485
 486 namespace canny
 487 {
 488     void getEdges(PtrStepSzi map, PtrStepSzb dst)
 489     {
 490         transform(map, dst, GetEdges(), WithOutMask(), 0);
 491     }
 492 }
 493
 494 #endif /* CUDA_DISABLER */