/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER

#include <algorithm> // std::swap
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
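
// GPU implementation of the Canny edge detector, organized as the pipeline
// stages below: gradient/magnitude computation (3x3 Sobel), gradient-direction
// map building with double thresholding, block-local and global hysteresis,
// and the final conversion of the edge map to an 8-bit output image.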
namespace canny
{
    struct L1 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::abs(x) + ::abs(y);
        }

        __device__ __forceinline__ L1() {}
        __device__ __forceinline__ L1(const L1&) {}
    };

    struct L2 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::sqrtf(x * x + y * y);
        }

        __device__ __forceinline__ L2() {}
        __device__ __forceinline__ L2(const L2&) {}
    };
}
namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
    {
        enum { smart_shift = 4 };
    };

    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
    {
        enum { smart_shift = 4 };
    };
}}}
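
// The specializations above tune the device::transform framework for the norm
// functors; as far as the framework's conventions go, smart_shift controls how
// many elements each thread processes on the vectorized ("smart") path.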
namespace canny
{
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);

    struct SrcTex
    {
        const int xoff;
        const int yoff;

        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const
        {
            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };

    template <class Norm> __global__
    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y >= mag.rows || x >= mag.cols)
            return;

        // 3x3 Sobel; out-of-range reads are handled by the texture's clamp addressing
        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));

        dx(y, x) = dxVal;
        dy(y, x) = dyVal;

        mag(y, x) = norm(dxVal, dyVal);
    }
    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));

        bindTexture(&tex_src, srcWhole);
        SrcTex src(xoff, yoff);

        if (L2Grad)
        {
            L2 norm;
            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
        }
        else
        {
            L1 norm;
            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
        }

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }

    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
    {
        if (L2Grad)
        {
            L2 norm;
            transform(dx, dy, mag, norm, WithOutMask(), 0);
        }
        else
        {
            L1 norm;
            transform(dx, dy, mag, norm, WithOutMask(), 0);
        }
    }
}
//////////////////////////////////////////////////////////////////////////////////////////
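
// Stage 2: non-maximum suppression with double thresholding. The gradient
// direction is quantized into horizontal / vertical / diagonal sectors by
// comparing |dy|/|dx| against tan(22.5 deg) and tan(67.5 deg) in fixed-point
// arithmetic, and each pixel is classified as 0 (not an edge), 1 (weak edge
// candidate) or 2 (strong edge).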
namespace canny
{
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);

    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        const int CANNY_SHIFT = 15;
        // tan(22.5 deg) in Q15 fixed point; note tan(67.5 deg) = tan(22.5 deg) + 2
        const int TG22 = (int)(0.4142135623730950488016887242097 * (1 << CANNY_SHIFT) + 0.5);

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
            return;

        int dxVal = dx(y, x);
        int dyVal = dy(y, x);

        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D(tex_mag, x, y);

        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);

        // 0 - the pixel cannot belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = dxVal * TG22;
            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

            dyVal <<= CANNY_SHIFT;

            if (dyVal < tg22x)
            {
                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if (dyVal > tg67x)
            {
                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map(y, x) = edge_type;
    }
    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

        bindTexture(&tex_mag, mag);

        calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
//////////////////////////////////////////////////////////////////////////////////////////
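
// Stage 3: block-local hysteresis. Each 16x16 block loads its tile plus a
// one-pixel apron into an 18x18 shared-memory cache and repeatedly promotes
// weak pixels (1) that touch a strong pixel (2). Strong pixels that still
// border unresolved weak pixels are pushed onto a global worklist so the
// propagation can continue across block boundaries.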
namespace canny
{
    __device__ int counter = 0;

    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
    {
        __shared__ volatile int smem[18][18];

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Load the block's tile and its one-pixel apron into shared memory
        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
        if (threadIdx.y == 0)
            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;

        __syncthreads();

        if (x >= map.cols || y >= map.rows)
            return;

        int n;

        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;

            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }

            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
        }

        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

        map(y, x) = e;

        // Strong pixels that still have weak neighbours become seeds for the
        // global propagation pass
        n = 0;

        if (e == 2)
        {
            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }

        if (n > 0)
        {
            const int ind = ::atomicAdd(&counter, 1);
            st[ind] = make_ushort2(x, y);
        }
    }
    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

        edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
//////////////////////////////////////////////////////////////////////////////////////////
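
// Stage 4: global hysteresis. Each block takes one seed pixel from the
// worklist st1, walks its 8-connected neighbourhood via a shared-memory
// stack, and promotes reachable weak pixels to strong. If the shared stack
// threatens to overflow, the remaining work items are flushed to st2, and the
// host re-launches the kernel with the two lists swapped until no work remains.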
namespace canny
{
    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
    {
        const int stack_size = 512;

        __shared__ int s_counter;
        __shared__ int s_ind;
        __shared__ ushort2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;

        __syncthreads();

        int ind = blockIdx.y * gridDim.x + blockIdx.x;

        if (ind >= count)
            return;

        ushort2 pos = st1[ind];

        // Seed the shared stack with the unvisited weak neighbours of the seed pixel
        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];

            if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;

                ind = Emulation::smem::atomicAdd(&s_counter, 1);

                s_st[ind] = pos;
            }
        }

        __syncthreads();

        // Process the stack eight neighbours at a time: each group of 8 threads
        // handles one popped pixel
        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
        {
            const int subTaskIdx = threadIdx.x >> 3;
            const int portion = ::min(s_counter, blockDim.x >> 3);

            if (subTaskIdx < portion)
                pos = s_st[s_counter - 1 - subTaskIdx];

            __syncthreads();

            if (threadIdx.x == 0)
                s_counter -= portion;

            __syncthreads();

            if (subTaskIdx < portion)
            {
                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];

                if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
                {
                    map(pos.y, pos.x) = 2;

                    ind = Emulation::smem::atomicAdd(&s_counter, 1);

                    s_st[ind] = pos;
                }
            }

            __syncthreads();
        }

        // Flush whatever is left on the shared stack to the global worklist st2
        if (s_counter > 0)
        {
            if (threadIdx.x == 0)
            {
                ind = ::atomicAdd(&counter, s_counter);
                s_ind = ind - s_counter;
            }

            __syncthreads();

            ind = s_ind;

            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                st2[ind + i] = s_st[i];
        }
    }
    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

        int count;
        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

        while (count > 0)
        {
            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

            edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

            std::swap(st1, st2);
        }
    }
}
//////////////////////////////////////////////////////////////////////////////////////////
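
// Stage 5: convert the edge map to the 8-bit output. For e in {0, 1, 2},
// e >> 1 is 1 only for strong edges, so (uchar)(-(e >> 1)) yields 255 for
// strong edges and 0 for everything else.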
namespace canny
{
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }

        __device__ __forceinline__ GetEdges() {}
        __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst)
    {
        transform(map, dst, GetEdges(), WithOutMask(), 0);
    }
}

#endif /* CUDA_DISABLER */
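
// A minimal sketch of how a host-side caller is expected to chain these
// stages (the actual driver lives in modules/gpu/src/canny.cpp; the buffer
// names below are illustrative, not part of this file):
//
//   canny::calcMagnitude(srcWhole, xoff, yoff, dx, dy, mag, L2gradient);
//   canny::calcMap(dx, dy, mag, map, low_thresh, high_thresh);
//   canny::edgesHysteresisLocal(map, st1.ptr<ushort2>());
//   canny::edgesHysteresisGlobal(map, st1.ptr<ushort2>(), st2.ptr<ushort2>());
//   canny::getEdges(map, dst);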