1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or bpied warranties, including, but not limited to, the bpied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include <thrust/sort.h>
44 #include "opencv2/gpu/device/common.hpp"
45 #include "opencv2/gpu/device/emulation.hpp"
47 namespace cv { namespace gpu { namespace device
// Device-global output counter shared by all kernels in this file.
// Host wrappers reset it with cudaMemset (via cudaGetSymbolAddress),
// kernels reserve output slots with atomicAdd on it, and the hosts read
// the final count back with cudaMemcpy after synchronizing.
51 __device__ int g_counter;
53 ////////////////////////////////////////////////////////////////////////
// Number of horizontally-strided pixels each thread examines.
56 const int PIXELS_PER_THREAD = 16;

// Compacts the selected pixels of an 8-bit image into a flat global list,
// each entry packed as (y << 16) | x (limits coordinates to 16 bits).
// Launch: 32x4 blocks over the image (see buildPointList_gpu).
// Phase 1: each thread row (threadIdx.y) gathers candidates into its own
// shared-memory queue.  Phase 2: thread (0,0) prefix-sums the queue sizes
// and reserves one contiguous range in 'list' with a single atomicAdd on
// g_counter.  Phase 3: each thread row cooperatively flushes its queue.
// NOTE(review): this listing is elided (source line numbers jump) — the
// __syncthreads() barriers between phases, the row-bounds check, and the
// test selecting which pixels are enqueued are not visible here; confirm
// against the full file.
58 __global__ void buildPointList(const DevMem2Db src, unsigned int* list)
// One queue per thread row; sized for a full row of threads.
60 __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
// Per-queue element count / per-queue start offset within 'list'.
61 __shared__ int s_qsize[4];
62 __shared__ int s_globStart[4];

// x jumps by blockDim.x * PIXELS_PER_THREAD per block; the inner loop
// below covers the intermediate columns with stride blockDim.x.
64 const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
65 const int y = blockIdx.y * blockDim.y + threadIdx.y;

// Reset this row's queue size (presumably done by one thread per row and
// followed by a barrier in the elided lines — TODO confirm).
71 s_qsize[threadIdx.y] = 0;

// Phase 1: scan this thread's PIXELS_PER_THREAD pixels of row y.
76 const uchar* srcRow = src.ptr(y);
77 for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
// Pack the coordinate and append to this row's queue; the shared-memory
// atomicAdd (Emulation::smem wrapper) hands out unique slots.
81 const unsigned int val = (y << 16) | xx;
82 const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
83 s_queues[threadIdx.y][qidx] = val;

89 // let one thread reserve the space required in the global list
90 if (threadIdx.x == 0 && threadIdx.y == 0)
92 // find how many items are stored in each list
// Exclusive prefix sum of the queue sizes -> per-queue start offsets.
94 for (int i = 0; i < blockDim.y; ++i)
96 s_globStart[i] = totalSize;
97 totalSize += s_qsize[i];

100 // calculate the offset in the global list
// A single global atomicAdd per block keeps contention on g_counter low.
101 const int globalOffset = atomicAdd(&g_counter, totalSize);
102 for (int i = 0; i < blockDim.y; ++i)
103 s_globStart[i] += globalOffset;

108 // copy local queues to global queue
// Phase 3: threads of a row flush that row's queue with stride blockDim.x.
109 const int qsize = s_qsize[threadIdx.y];
110 int gidx = s_globStart[threadIdx.y] + threadIdx.x;
111 for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
112 list[gidx] = s_queues[threadIdx.y][i];
// Host wrapper for buildPointList: zeroes g_counter, launches the kernel
// over the whole image and (in lines elided from this listing) returns the
// number of points appended to 'list'.
115 int buildPointList_gpu(DevMem2Db src, unsigned int* list)
// Resolve the device address of g_counter so it can be reset/read back.
118 cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
120 cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

// 32x4 threads, each thread covering PIXELS_PER_THREAD columns.
122 const dim3 block(32, 4);
123 const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));

// Kernel is shared-memory bound (4 x 32 x PIXELS_PER_THREAD uint queues).
125 cudaSafeCall( cudaFuncSetCacheConfig(buildPointList, cudaFuncCachePreferShared) );
127 buildPointList<<<grid, block>>>(src, list);
128 cudaSafeCall( cudaGetLastError() );
130 cudaSafeCall( cudaDeviceSynchronize() );

// Blocking readback of the total point count after synchronization.
133 cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
138 ////////////////////////////////////////////////////////////////////////
// Hough-line vote accumulation, global-memory fallback: one block per
// theta bin (n = blockIdx.x); threads stride over the point list with
// step blockDim.x and atomically increment accumulator bins in global
// memory.  irho = 1/rho is the rho-axis bin scale; its application to
// sinVal/cosVal occurs in lines elided from this listing — TODO confirm.
141 __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
143 const int n = blockIdx.x;
144 const float ang = n * theta;
148 sincosf(ang, &sinVal, &cosVal);

// Center of the rho axis; r is presumably shifted by this in an elided
// line so negative distances land in valid bins — TODO confirm.
152 const int shift = (numrho - 1) / 2;

// accum carries a one-cell border on every side, hence the +1 offsets.
154 int* accumRow = accum.ptr(n + 1);
155 for (int i = threadIdx.x; i < count; i += blockDim.x)
157 const unsigned int val = list[i];
// Unpack the (y << 16) | x coordinate produced by buildPointList.
159 const int x = (val & 0xFFFF);
160 const int y = (val >> 16) & 0xFFFF;
162 int r = __float2int_rn(x * cosVal + y * sinVal);
165 ::atomicAdd(accumRow + r + 1, 1);
// Hough-line vote accumulation, shared-memory variant: same mapping as
// linesAccumGlobal, but each block first builds its angle's whole rho
// histogram in dynamic shared memory and writes it to the global
// accumulator row once, replacing global atomics with cheaper shared ones.
// Shared-memory requirement: (numrho + 1) ints (checked by linesAccum_gpu).
169 __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
171 extern __shared__ int smem[];

// Zero the shared histogram; the assignment body and the __syncthreads()
// that must follow are elided from this listing — TODO confirm.
173 for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)

178 const int n = blockIdx.x;
179 const float ang = n * theta;
183 sincosf(ang, &sinVal, &cosVal);

// Center of the rho axis (see linesAccumGlobal).
187 const int shift = (numrho - 1) / 2;

// Threads stride over the point list and vote into shared memory.
189 for (int i = threadIdx.x; i < count; i += blockDim.x)
191 const unsigned int val = list[i];
193 const int x = (val & 0xFFFF);
194 const int y = (val >> 16) & 0xFFFF;
196 int r = __float2int_rn(x * cosVal + y * sinVal);
199 Emulation::smem::atomicAdd(&smem[r + 1], 1);

// Flush the finished histogram into this angle's accumulator row; a
// barrier before this loop is presumably in the elided lines.
204 int* accumRow = accum.ptr(n + 1);
205 for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
206 accumRow[i] = smem[i];
// Host dispatcher for line-vote accumulation: uses the shared-memory
// kernel when the per-angle histogram fits in shared memory (with 1000
// bytes of headroom for other per-block usage), otherwise falls back to
// the global-atomics kernel.  rho is passed down inverted (1/rho).
209 void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
// One block per theta bin; wider blocks on compute capability 2.0+.
211 const dim3 block(has20 ? 1024 : 512);
212 const dim3 grid(accum.rows - 2);

// Histogram of numrho + 1 = accum.cols - 1 ints (accum has a border).
214 size_t smemSize = (accum.cols - 1) * sizeof(int);

216 if (smemSize < sharedMemPerBlock - 1000)
217 linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
// NOTE(review): the 'else' keyword between the two launches is elided
// from this listing — confirm against the full file.
219 linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);

221 cudaSafeCall( cudaGetLastError() );
223 cudaSafeCall( cudaDeviceSynchronize() );
226 ////////////////////////////////////////////////////////////////////////
// Extracts local maxima of the (theta, rho) Hough accumulator as lines.
// Launch: 2D grid of 32x8 blocks covering the accumulator interior, one
// thread per bin (see linesGetResult_gpu); accum carries a one-cell
// border on every side, hence the +1 offsets.  Detections are appended
// to out/votes via atomicAdd on g_counter; only the first maxSize hits
// are stored, but the counter keeps counting so the host clamps it.
__global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const float threshold, const int numrho)
{
    const int r = blockIdx.x * blockDim.x + threadIdx.x;
    const int n = blockIdx.y * blockDim.y + threadIdx.y;

    // FIX: the guard must use || — the grid is rounded up with divUp in
    // BOTH dimensions, so a thread out of range in either one must exit;
    // with && it would read accum out of bounds below.
    if (r >= accum.cols - 2 || n >= accum.rows - 2)
        return;

    const int curVotes = accum(n + 1, r + 1);

    // 4-neighbour non-maximum suppression; strict '>' on one side and
    // '>=' on the other breaks ties between equal neighbouring bins.
    if (curVotes > threshold &&
        curVotes > accum(n + 1, r) &&
        curVotes >= accum(n + 1, r + 2) &&
        curVotes > accum(n, r + 1) &&
        curVotes >= accum(n + 2, r + 1))
    {
        // Map bin indices back to (rho, theta) parameter space; the rho
        // axis is centered, so bin (numrho - 1) / 2 is radius 0.
        const float radius = (r - (numrho - 1) * 0.5f) * rho;
        const float angle = n * theta;

        const int ind = ::atomicAdd(&g_counter, 1);
        if (ind < maxSize)
        {
            out[ind] = make_float2(radius, angle);
            votes[ind] = curVotes;
        }
    }
}
// Host wrapper for linesGetResult: resets g_counter, extracts accumulator
// maxima into out/votes, clamps the count to maxSize and optionally sorts
// the lines by vote count, strongest first.  Returns (in elided lines)
// the number of lines written.
257 int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort)
260 cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
262 cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

// One thread per interior accumulator bin.
264 const dim3 block(32, 8);
265 const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

// Kernel uses no shared memory; prefer L1 for the scattered accum reads.
267 cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
269 linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
270 cudaSafeCall( cudaGetLastError() );
272 cudaSafeCall( cudaDeviceSynchronize() );

275 cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

// g_counter can exceed the output capacity; clamp before use.
277 totalCount = ::min(totalCount, maxSize);

// Descending sort by votes, reordering the line parameters in lockstep.
279 if (doSort && totalCount > 0)
281 thrust::device_ptr<float2> outPtr(out);
282 thrust::device_ptr<int> votesPtr(votes);
283 thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
289 ////////////////////////////////////////////////////////////////////////
290 // circlesAccumCenters
// Hough-circle center voting: one thread per edge point.  Each thread
// walks along its point's gradient direction (dx, dy Sobel responses)
// from minRadius to maxRadius in 22.10 fixed point, voting for every
// accumulator cell crossed.  The outer k1 loop covers both gradient
// directions; the sign flip of sx/sy between iterations is in lines
// elided from this listing — TODO confirm.  The tid < count guard is
// also elided.  idp = 1/dp scales image coords to accumulator coords.
292 __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
293 PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
// 22.10 fixed-point representation for the sub-cell stepping below.
295 const int SHIFT = 10;
296 const int ONE = 1 << SHIFT;

298 const int tid = blockIdx.x * blockDim.x + threadIdx.x;

// Unpack the (y << 16) | x coordinate produced by buildPointList.
303 const unsigned int val = list[tid];
305 const int x = (val & 0xFFFF);
306 const int y = (val >> 16) & 0xFFFF;

// Gradient at this edge point; zero gradient gives no direction to walk.
308 const int vx = dx(y, x);
309 const int vy = dy(y, x);
311 if (vx == 0 && vy == 0)

314 const float mag = ::sqrtf(vx * vx + vy * vy);

// Start position and unit step, both scaled to accumulator cells and
// converted to fixed point.
316 const int x0 = __float2int_rn((x * idp) * ONE);
317 const int y0 = __float2int_rn((y * idp) * ONE);
319 int sx = __float2int_rn((vx * idp) * ONE / mag);
320 int sy = __float2int_rn((vy * idp) * ONE / mag);

322 // Step from minRadius to maxRadius in both directions of the gradient
323 for (int k1 = 0; k1 < 2; ++k1)
325 int x1 = x0 + minRadius * sx;
326 int y1 = y0 + minRadius * sy;

328 for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
// Back to integer cell coordinates; skip cells outside the accumulator.
330 const int x2 = x1 >> SHIFT;
331 const int y2 = y1 >> SHIFT;
333 if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)

// accum has a one-cell border, hence the +1 offsets.
336 ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
// Host wrapper: launches circlesAccumCenters with one thread per edge
// point; passes the accumulator interior size (cols/rows minus the
// two-cell border) as the voting bounds.
344 void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, DevMem2Di accum, int minRadius, int maxRadius, float idp)
346 const dim3 block(256);
347 const dim3 grid(divUp(count, block.x));

// No shared memory used; prefer L1 for the scattered accumulator traffic.
349 cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
351 circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
352 cudaSafeCall( cudaGetLastError() );
354 cudaSafeCall( cudaDeviceSynchronize() );
357 ////////////////////////////////////////////////////////////////////////
// Selects candidate circle centers: one thread per interior accumulator
// bin; a bin whose votes exceed the threshold and beat its 4 neighbours
// (strict '>' on top/left, '>=' on bottom/right to break ties) is
// appended to 'centers' packed as (y << 16) | x via atomicAdd on
// g_counter.  The store 'centers[idx] = val' is in a line elided from
// this listing — TODO confirm.  accum has a one-cell border (+1 offsets).
360 __global__ void buildCentersList(const DevMem2Di accum, unsigned int* centers, const int threshold)
362 const int x = blockIdx.x * blockDim.x + threadIdx.x;
363 const int y = blockIdx.y * blockDim.y + threadIdx.y;

// Bounds guard for the divUp-rounded grid.
365 if (x < accum.cols - 2 && y < accum.rows - 2)
367 const int top = accum(y, x + 1);
369 const int left = accum(y + 1, x);
370 const int cur = accum(y + 1, x + 1);
371 const int right = accum(y + 1, x + 2);
373 const int bottom = accum(y + 2, x + 1);

375 if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
377 const unsigned int val = (y << 16) | x;
378 const int idx = ::atomicAdd(&g_counter, 1);
// Host wrapper for buildCentersList: resets g_counter, runs the
// non-maximum suppression over the accumulator interior and (in elided
// lines) returns the number of candidate centers found.
384 int buildCentersList_gpu(DevMem2Di accum, unsigned int* centers, int threshold)
387 cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
389 cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

// One thread per interior accumulator bin.
391 const dim3 block(32, 8);
392 const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

394 cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
396 buildCentersList<<<grid, block>>>(accum, centers, threshold);
397 cudaSafeCall( cudaGetLastError() );
399 cudaSafeCall( cudaDeviceSynchronize() );

// Blocking readback of the candidate-center count.
402 cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
407 ////////////////////////////////////////////////////////////////////////
408 // circlesAccumRadius
// Radius estimation: one block per candidate center.  Threads stride over
// the edge-point list, compute each point's distance to the center and
// build a radius histogram (histSize + 2 ints, 1-cell guard on each end)
// in dynamic shared memory, then search it for local maxima; each maximum
// becomes a (cx, cy, radius) circle appended via atomicAdd on g_counter.
// NOTE(review): this listing is elided — the 'val = list[i];' reassignment
// inside the point loop (val is otherwise still the center word) and the
// __syncthreads() barriers between phases are not visible; confirm
// against the full file.
410 __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
411 float3* circles, const int maxCircles, const float dp,
412 const int minRadius, const int maxRadius, const int histSize, const int threshold)
414 extern __shared__ int smem[];

// Zero the shared histogram (assignment body elided in this listing).
416 for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)

// Unpack this block's candidate center, packed (y << 16) | x.
420 unsigned int val = centers[blockIdx.x];
422 float cx = (val & 0xFFFF);
423 float cy = (val >> 16) & 0xFFFF;

// Accumulator cell -> image coordinates (cell center, scaled by dp).
425 cx = (cx + 0.5f) * dp;
426 cy = (cy + 0.5f) * dp;

428 for (int i = threadIdx.x; i < count; i += blockDim.x)
432 const int x = (val & 0xFFFF);
433 const int y = (val >> 16) & 0xFFFF;

// Distance from edge point to center; vote only in the radius window.
435 const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
436 if (rad >= minRadius && rad <= maxRadius)
438 const int r = __float2int_rn(rad - minRadius);
440 Emulation::smem::atomicAdd(&smem[r + 1], 1);

// 1-D non-maximum suppression over the histogram ('>' left, '>=' right
// breaks ties between equal neighbouring radius bins).
446 for (int i = threadIdx.x; i < histSize; i += blockDim.x)
448 const int curVotes = smem[i + 1];
450 if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
452 const int ind = ::atomicAdd(&g_counter, 1);
453 if (ind < maxCircles)
454 circles[ind] = make_float3(cx, cy, i + minRadius);
// Host wrapper for circlesAccumRadius: one block per candidate center,
// shared-memory radius histogram sized from the radius window.  Clamps
// the circle count to maxCircles and (in elided lines) returns it.
459 int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
460 float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
463 cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
465 cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

// Wider blocks on compute capability 2.0+.
467 const dim3 block(has20 ? 1024 : 512);
468 const dim3 grid(centersCount);

// NOTE(review): ::ceil of an integer expression is a no-op here — the
// argument is already an int; plain 'maxRadius - minRadius + 1' suffices.
470 const int histSize = ::ceil(maxRadius - minRadius + 1);
// +2 for the guard cells at both ends of the histogram.
471 size_t smemSize = (histSize + 2) * sizeof(int);

473 circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
474 cudaSafeCall( cudaGetLastError() );
476 cudaSafeCall( cudaDeviceSynchronize() );

479 cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

// g_counter can exceed the output capacity; clamp before use.
481 totalCount = ::min(totalCount, maxCircles);