/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/sort.h>
#include <thrust/transform.h>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/dynamic_smem.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
    __device__ int g_counter;

    ////////////////////////////////////////////////////////////////////////
    // buildPointList

    template <int PIXELS_PER_THREAD>
    __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
    {
        __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
        __shared__ int s_qsize[4];
        __shared__ int s_globStart[4];

        const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (threadIdx.x == 0)
            s_qsize[threadIdx.y] = 0;
        __syncthreads();

        if (y < src.rows)
        {
            // fill the queue with the set pixels of this row
            const uchar* srcRow = src.ptr(y);
            for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
            {
                if (srcRow[xx])
                {
                    const unsigned int val = (y << 16) | xx;
                    const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
                    s_queues[threadIdx.y][qidx] = val;
                }
            }
        }

        __syncthreads();

        // let one thread reserve the space required in the global list
        if (threadIdx.x == 0 && threadIdx.y == 0)
        {
            // find how many items are stored in each list
            int totalSize = 0;
            for (int i = 0; i < blockDim.y; ++i)
            {
                s_globStart[i] = totalSize;
                totalSize += s_qsize[i];
            }

            // calculate the offset in the global list
            const int globalOffset = atomicAdd(&g_counter, totalSize);
            for (int i = 0; i < blockDim.y; ++i)
                s_globStart[i] += globalOffset;
        }

        __syncthreads();

        // copy local queues to global queue
        const int qsize = s_qsize[threadIdx.y];
        int gidx = s_globStart[threadIdx.y] + threadIdx.x;
        for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
            list[gidx] = s_queues[threadIdx.y][i];
    }

    int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
    {
        const int PIXELS_PER_THREAD = 16;

        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
        cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

        const dim3 block(32, 4);
        const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

        buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        return totalCount;
    }
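
    // Host-side usage sketch (illustrative only; GpuMat is the standard
    // OpenCV 2.4 container, but the buffer names here are hypothetical):
    //
    //     cv::gpu::GpuMat src;  // 8-bit binary edge map, cols/rows < 65536
    //     cv::gpu::GpuMat list(1, src.size().area(), CV_32SC1);
    //     const int count = buildPointList_gpu(src, list.ptr<unsigned int>());
    //
    // Each entry packs y into the high 16 bits and x into the low 16 bits,
    // which is why all kernels below unpack with (val >> 16) and (val & 0xFFFF).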

    ////////////////////////////////////////////////////////////////////////
    // linesAccum

    __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
    {
        const int n = blockIdx.x;
        const float ang = n * theta;

        float sinVal;
        float cosVal;
        sincosf(ang, &sinVal, &cosVal);
        sinVal *= irho;
        cosVal *= irho;

        const int shift = (numrho - 1) / 2;

        int* accumRow = accum.ptr(n + 1);
        for (int i = threadIdx.x; i < count; i += blockDim.x)
        {
            const unsigned int val = list[i];

            const int x = (val & 0xFFFF);
            const int y = (val >> 16) & 0xFFFF;

            int r = __float2int_rn(x * cosVal + y * sinVal);
            r += shift;

            ::atomicAdd(accumRow + r + 1, 1);
        }
    }

    __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
    {
        int* smem = DynamicSharedMem<int>();

        for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
            smem[i] = 0;
        __syncthreads();

        const int n = blockIdx.x;
        const float ang = n * theta;

        float sinVal;
        float cosVal;
        sincosf(ang, &sinVal, &cosVal);
        sinVal *= irho;
        cosVal *= irho;

        const int shift = (numrho - 1) / 2;

        for (int i = threadIdx.x; i < count; i += blockDim.x)
        {
            const unsigned int val = list[i];

            const int x = (val & 0xFFFF);
            const int y = (val >> 16) & 0xFFFF;

            int r = __float2int_rn(x * cosVal + y * sinVal);
            r += shift;

            Emulation::smem::atomicAdd(&smem[r + 1], 1);
        }

        __syncthreads();

        int* accumRow = accum.ptr(n + 1);
        for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
            accumRow[i] = smem[i];
    }

    void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
    {
        const dim3 block(has20 ? 1024 : 512);
        const dim3 grid(accum.rows - 2);

        size_t smemSize = (accum.cols - 1) * sizeof(int);

        if (smemSize < sharedMemPerBlock - 1000)
            linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
        else
            linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);

        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
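
    // Sizing note (illustrative): accum carries one guard cell on each side,
    // i.e. numangle + 2 rows and numrho + 2 columns, so the later
    // non-maximum suppression can read neighbours without bounds checks.
    // For example, numrho = 2001 makes the shared-memory variant request
    // (2001 + 1) * sizeof(int) ~= 8 KB of dynamic shared memory per block;
    // the "sharedMemPerBlock - 1000" margin falls back to the global-memory
    // kernel when the rho range is too wide for the device.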

    ////////////////////////////////////////////////////////////////////////
    // linesGetResult

    __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
    {
        const int r = blockIdx.x * blockDim.x + threadIdx.x;
        const int n = blockIdx.y * blockDim.y + threadIdx.y;

        if (r >= accum.cols - 2 || n >= accum.rows - 2)
            return;

        const int curVotes = accum(n + 1, r + 1);

        if (curVotes > threshold &&
            curVotes > accum(n + 1, r) &&
            curVotes >= accum(n + 1, r + 2) &&
            curVotes > accum(n, r + 1) &&
            curVotes >= accum(n + 2, r + 1))
        {
            const float radius = (r - (numrho - 1) * 0.5f) * rho;
            const float angle = n * theta;

            const int ind = ::atomicAdd(&g_counter, 1);
            if (ind < maxSize)
            {
                out[ind] = make_float2(radius, angle);
                votes[ind] = curVotes;
            }
        }
    }

    int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
    {
        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
        cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

        const dim3 block(32, 8);
        const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );

        linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        totalCount = ::min(totalCount, maxSize);

        if (doSort && totalCount > 0)
        {
            thrust::device_ptr<float2> outPtr(out);
            thrust::device_ptr<int> votesPtr(votes);
            thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
        }

        return totalCount;
    }
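
    // Usage note: with doSort = true the strongest lines come first, so a
    // caller that only wants the top N can simply truncate. A minimal sketch
    // with hypothetical device buffers:
    //
    //     float2* d_lines; int* d_votes;   // device arrays of capacity 4096
    //     int n = linesGetResult_gpu(accum, d_lines, d_votes, 4096,
    //                                1.0f, CV_PI_F / 180.0f, 100, true);
    //     // d_lines[0 .. n-1] are (rho, theta) pairs, best votes first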

    ////////////////////////////////////////////////////////////////////////
    // circlesAccumCenters

    __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
                                        PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
    {
        const int SHIFT = 10;
        const int ONE = 1 << SHIFT;

        const int tid = blockIdx.x * blockDim.x + threadIdx.x;

        if (tid >= count)
            return;

        const unsigned int val = list[tid];

        const int x = (val & 0xFFFF);
        const int y = (val >> 16) & 0xFFFF;

        const int vx = dx(y, x);
        const int vy = dy(y, x);

        if (vx == 0 && vy == 0)
            return;

        const float mag = ::sqrtf(vx * vx + vy * vy);

        const int x0 = __float2int_rn((x * idp) * ONE);
        const int y0 = __float2int_rn((y * idp) * ONE);

        int sx = __float2int_rn((vx * idp) * ONE / mag);
        int sy = __float2int_rn((vy * idp) * ONE / mag);

        // Step from minRadius to maxRadius in both directions of the gradient
        for (int k1 = 0; k1 < 2; ++k1)
        {
            int x1 = x0 + minRadius * sx;
            int y1 = y0 + minRadius * sy;

            for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
            {
                const int x2 = x1 >> SHIFT;
                const int y2 = y1 >> SHIFT;

                if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
                    break;

                ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
            }

            sx = -sx;
            sy = -sy;
        }
    }

    void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
    {
        const dim3 block(256);
        const dim3 grid(divUp(count, block.x));

        cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );

        circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
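
    // Worked example of the fixed-point walk above: with SHIFT = 10
    // (ONE = 1024), idp = 1 and a unit gradient direction (0.6, 0.8), the
    // step becomes sx = 614, sy = 819, i.e. ~0.6/0.8 pixels per radius
    // increment accumulated purely in integer arithmetic; the >> SHIFT
    // recovers the accumulator cell without a float divide in the inner loop.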

    ////////////////////////////////////////////////////////////////////////
    // buildCentersList

    __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < accum.cols - 2 && y < accum.rows - 2)
        {
            const int top = accum(y, x + 1);

            const int left = accum(y + 1, x);
            const int cur = accum(y + 1, x + 1);
            const int right = accum(y + 1, x + 2);

            const int bottom = accum(y + 2, x + 1);

            if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
            {
                const unsigned int val = (y << 16) | x;
                const int idx = ::atomicAdd(&g_counter, 1);
                centers[idx] = val;
            }
        }
    }

    int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
    {
        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
        cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

        const dim3 block(32, 8);
        const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );

        buildCentersList<<<grid, block>>>(accum, centers, threshold);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        return totalCount;
    }

    ////////////////////////////////////////////////////////////////////////
    // circlesAccumRadius

    __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
                                       float3* circles, const int maxCircles, const float dp,
                                       const int minRadius, const int maxRadius, const int histSize, const int threshold)
    {
        int* smem = DynamicSharedMem<int>();

        for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
            smem[i] = 0;
        __syncthreads();

        unsigned int val = centers[blockIdx.x];

        float cx = (val & 0xFFFF);
        float cy = (val >> 16) & 0xFFFF;

        cx = (cx + 0.5f) * dp;
        cy = (cy + 0.5f) * dp;

        for (int i = threadIdx.x; i < count; i += blockDim.x)
        {
            val = list[i];

            const int x = (val & 0xFFFF);
            const int y = (val >> 16) & 0xFFFF;

            const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
            if (rad >= minRadius && rad <= maxRadius)
            {
                const int r = __float2int_rn(rad - minRadius);
                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }
        }

        __syncthreads();

        for (int i = threadIdx.x; i < histSize; i += blockDim.x)
        {
            const int curVotes = smem[i + 1];

            if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
            {
                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxCircles)
                    circles[ind] = make_float3(cx, cy, i + minRadius);
            }
        }
    }

    int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
                               float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
    {
        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
        cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

        const dim3 block(has20 ? 1024 : 512);
        const dim3 grid(centersCount);

        const int histSize = maxRadius - minRadius + 1;
        size_t smemSize = (histSize + 2) * sizeof(int);

        circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        totalCount = ::min(totalCount, maxCircles);

        return totalCount;
    }
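
    // Sizing sketch: the per-block histogram has one bin per integer radius
    // plus a guard cell on each side. For minRadius = 10, maxRadius = 100
    // that is histSize = 91 bins and smemSize = 93 * sizeof(int) = 372 bytes,
    // so shared memory is never the limiting factor here; one block votes
    // for one candidate center.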

    ////////////////////////////////////////////////////////////////////////
    // Generalized Hough

    template <typename T, int PIXELS_PER_THREAD>
    __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy, unsigned int* coordList, float* thetaList)
    {
        __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
        __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
        __shared__ int s_sizes[4];
        __shared__ int s_globStart[4];

        const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (threadIdx.x == 0)
            s_sizes[threadIdx.y] = 0;
        __syncthreads();

        if (y < edges.rows)
        {
            // fill the queue with edge points carrying a non-zero gradient
            const uchar* edgesRow = edges.ptr(y);
            const T* dxRow = dx.ptr(y);
            const T* dyRow = dy.ptr(y);

            for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
            {
                const T dxVal = dxRow[xx];
                const T dyVal = dyRow[xx];

                if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
                {
                    const unsigned int coord = (y << 16) | xx;

                    float theta = ::atan2f(dyVal, dxVal);
                    if (theta < 0)
                        theta += 2.0f * CV_PI_F;

                    const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);

                    s_coordLists[threadIdx.y][qidx] = coord;
                    s_thetaLists[threadIdx.y][qidx] = theta;
                }
            }
        }

        __syncthreads();

        // let one thread reserve the space required in the global list
        if (threadIdx.x == 0 && threadIdx.y == 0)
        {
            // find how many items are stored in each list
            int totalSize = 0;
            for (int i = 0; i < blockDim.y; ++i)
            {
                s_globStart[i] = totalSize;
                totalSize += s_sizes[i];
            }

            // calculate the offset in the global list
            const int globalOffset = atomicAdd(&g_counter, totalSize);
            for (int i = 0; i < blockDim.y; ++i)
                s_globStart[i] += globalOffset;
        }

        __syncthreads();

        // copy local queues to global queue
        const int qsize = s_sizes[threadIdx.y];
        int gidx = s_globStart[threadIdx.y] + threadIdx.x;
        for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
        {
            coordList[gidx] = s_coordLists[threadIdx.y][i];
            thetaList[gidx] = s_thetaLists[threadIdx.y][i];
        }
    }

    template <typename T>
    int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
    {
        const int PIXELS_PER_THREAD = 8;

        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
        cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

        const dim3 block(32, 4);
        const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

        buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        return totalCount;
    }

    template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
    template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
    template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);

    __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                PtrStep<short2> r_table, int* r_sizes, int maxSize,
                                const short2 templCenter, const float thetaScale)
    {
        const int tid = blockIdx.x * blockDim.x + threadIdx.x;

        if (tid >= pointsCount)
            return;

        const unsigned int coord = coordList[tid];
        short2 p;
        p.x = (coord & 0xFFFF);
        p.y = (coord >> 16) & 0xFFFF;

        const float theta = thetaList[tid];
        const int n = __float2int_rn(theta * thetaScale);

        const int ind = ::atomicAdd(r_sizes + n, 1);
        if (ind < maxSize)
            r_table(n, ind) = p - templCenter;
    }

    void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                         PtrStepSz<short2> r_table, int* r_sizes,
                         short2 templCenter, int levels)
    {
        const dim3 block(256);
        const dim3 grid(divUp(pointsCount, block.x));

        const float thetaScale = levels / (2.0f * CV_PI_F);

        buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
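
    // R-table sketch: a template edge point p with gradient angle theta lands
    // in row n = round(theta * levels / (2*pi)) and stores the displacement
    // p - templCenter. For example, with levels = 360, an edge at (120, 80)
    // with theta = pi/2 and templCenter = (100, 100) stores (20, -20) in
    // row 90; GHT_Ballard_Pos_calcHist below reverses this by voting for
    // p - r over the whole matching angle row.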

    ////////////////////////////////////////////////////////////////////////
    // GHT_Ballard_Pos

    __global__ void GHT_Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                             const PtrStep<short2> r_table, const int* r_sizes,
                                             PtrStepSzi hist,
                                             const float idp, const float thetaScale)
    {
        const int tid = blockIdx.x * blockDim.x + threadIdx.x;

        if (tid >= pointsCount)
            return;

        const unsigned int coord = coordList[tid];
        short2 p;
        p.x = (coord & 0xFFFF);
        p.y = (coord >> 16) & 0xFFFF;

        const float theta = thetaList[tid];
        const int n = __float2int_rn(theta * thetaScale);

        const short2* r_row = r_table.ptr(n);
        const int r_row_size = r_sizes[n];

        for (int j = 0; j < r_row_size; ++j)
        {
            short2 c = p - r_row[j];

            c.x = __float2int_rn(c.x * idp);
            c.y = __float2int_rn(c.y * idp);

            if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
                ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
        }
    }

    void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                      PtrStepSz<short2> r_table, const int* r_sizes,
                                      PtrStepSzi hist,
                                      float dp, int levels)
    {
        const dim3 block(256);
        const dim3 grid(divUp(pointsCount, block.x));

        const float idp = 1.0f / dp;
        const float thetaScale = levels / (2.0f * CV_PI_F);

        GHT_Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    __global__ void GHT_Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize, const float dp, const int threshold)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= hist.cols - 2 || y >= hist.rows - 2)
            return;

        const int curVotes = hist(y + 1, x + 1);

        if (curVotes > threshold &&
            curVotes > hist(y + 1, x) &&
            curVotes >= hist(y + 1, x + 2) &&
            curVotes > hist(y, x + 1) &&
            curVotes >= hist(y + 2, x + 1))
        {
            const int ind = ::atomicAdd(&g_counter, 1);
            if (ind < maxSize)
            {
                out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
                votes[ind] = make_int3(curVotes, 0, 0);
            }
        }
    }

    int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
    {
        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
        cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

        const dim3 block(32, 8);
        const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );

        GHT_Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        totalCount = ::min(totalCount, maxSize);

        return totalCount;
    }
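
    // Output convention shared by all GHT variants below: out packs
    // (x, y, scale, angle) per candidate and votes packs
    // (positionVotes, scaleVotes, angleVotes). The plain Ballard position
    // detector fixes scale = 1 and angle = 0, which is why it emits
    // make_float4(x * dp, y * dp, 1.0f, 0.0f) and make_int3(curVotes, 0, 0).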

    ////////////////////////////////////////////////////////////////////////
    // GHT_Ballard_PosScale

    __global__ void GHT_Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
                                                  PtrStep<short2> r_table, const int* r_sizes,
                                                  PtrStepi hist, const int rows, const int cols,
                                                  const float minScale, const float scaleStep, const int scaleRange,
                                                  const float idp, const float thetaScale)
    {
        const unsigned int coord = coordList[blockIdx.x];
        float2 p;
        p.x = (coord & 0xFFFF);
        p.y = (coord >> 16) & 0xFFFF;

        const float theta = thetaList[blockIdx.x];
        const int n = __float2int_rn(theta * thetaScale);

        const short2* r_row = r_table.ptr(n);
        const int r_row_size = r_sizes[n];

        for (int j = 0; j < r_row_size; ++j)
        {
            const float2 d = saturate_cast<float2>(r_row[j]);

            for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
            {
                const float scale = minScale + s * scaleStep;

                float2 c = p - scale * d;
                c.x *= idp;
                c.y *= idp;

                if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                    ::atomicAdd(hist.ptr((s + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
            }
        }
    }

    void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                           PtrStepSz<short2> r_table, const int* r_sizes,
                                           PtrStepi hist, int rows, int cols,
                                           float minScale, float scaleStep, int scaleRange,
                                           float dp, int levels)
    {
        const dim3 block(256);
        const dim3 grid(pointsCount);

        const float idp = 1.0f / dp;
        const float thetaScale = levels / (2.0f * CV_PI_F);

        GHT_Ballard_PosScale_calcHist<<<grid, block>>>(coordList, thetaList,
                                                       r_table, r_sizes,
                                                       hist, rows, cols,
                                                       minScale, scaleStep, scaleRange,
                                                       idp, thetaScale);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    __global__ void GHT_Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
                                                       float4* out, int3* votes, const int maxSize,
                                                       const float minScale, const float scaleStep, const float dp, const int threshold)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= cols || y >= rows)
            return;

        for (int s = 0; s < scaleRange; ++s)
        {
            const float scale = minScale + s * scaleStep;

            const int prevScaleIdx = (s) * (rows + 2);
            const int curScaleIdx = (s + 1) * (rows + 2);
            const int nextScaleIdx = (s + 2) * (rows + 2);

            const int curVotes = hist(curScaleIdx + y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes > hist(curScaleIdx + y + 1, x) &&
                curVotes >= hist(curScaleIdx + y + 1, x + 2) &&
                curVotes > hist(curScaleIdx + y, x + 1) &&
                curVotes >= hist(curScaleIdx + y + 2, x + 1) &&
                curVotes > hist(prevScaleIdx + y + 1, x + 1) &&
                curVotes >= hist(nextScaleIdx + y + 1, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, scale, 0.0f);
                    votes[ind] = make_int3(curVotes, curVotes, 0);
                }
            }
        }
    }

    int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
                                               float minScale, float scaleStep, float dp, int threshold)
    {
        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
        cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

        const dim3 block(32, 8);
        const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );

        GHT_Ballard_PosScale_findPosInHist<<<grid, block>>>(hist, rows, cols, scaleRange, out, votes, maxSize, minScale, scaleStep, dp, threshold);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        totalCount = ::min(totalCount, maxSize);

        return totalCount;
    }
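
    // Layout note: the scale histogram is one tall PtrStepi holding
    // scaleRange + 2 stacked layers of (rows + 2) x (cols + 2) cells, so
    // scale bin s lives in layer s + 1 and is addressed as
    // hist((s + 1) * (rows + 2) + y + 1, x + 1). The +1 offsets implement a
    // guard border in x, y and scale for the 6-neighbour maximum test above;
    // the PosRotation variant below uses the same layout with angle layers.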

    ////////////////////////////////////////////////////////////////////////
    // GHT_Ballard_PosRotation

    __global__ void GHT_Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
                                                     PtrStep<short2> r_table, const int* r_sizes,
                                                     PtrStepi hist, const int rows, const int cols,
                                                     const float minAngle, const float angleStep, const int angleRange,
                                                     const float idp, const float thetaScale)
    {
        const unsigned int coord = coordList[blockIdx.x];
        float2 p;
        p.x = (coord & 0xFFFF);
        p.y = (coord >> 16) & 0xFFFF;

        const float thetaVal = thetaList[blockIdx.x];

        for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
        {
            const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f);
            float sinA, cosA;
            sincosf(angle, &sinA, &cosA);

            float theta = thetaVal - angle;
            if (theta < 0)
                theta += 2.0f * CV_PI_F;

            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                const float2 d = saturate_cast<float2>(r_row[j]);

                const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);

                float2 c = make_float2(p.x - dr.x, p.y - dr.y);
                c.x *= idp;
                c.y *= idp;

                if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                    ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
            }
        }
    }

    void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                              PtrStepSz<short2> r_table, const int* r_sizes,
                                              PtrStepi hist, int rows, int cols,
                                              float minAngle, float angleStep, int angleRange,
                                              float dp, int levels)
    {
        const dim3 block(256);
        const dim3 grid(pointsCount);

        const float idp = 1.0f / dp;
        const float thetaScale = levels / (2.0f * CV_PI_F);

        GHT_Ballard_PosRotation_calcHist<<<grid, block>>>(coordList, thetaList,
                                                          r_table, r_sizes,
                                                          hist, rows, cols,
                                                          minAngle, angleStep, angleRange,
                                                          idp, thetaScale);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    __global__ void GHT_Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
                                                          float4* out, int3* votes, const int maxSize,
                                                          const float minAngle, const float angleStep, const float dp, const int threshold)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= cols || y >= rows)
            return;

        for (int a = 0; a < angleRange; ++a)
        {
            const float angle = minAngle + a * angleStep;

            const int prevAngleIdx = (a) * (rows + 2);
            const int curAngleIdx = (a + 1) * (rows + 2);
            const int nextAngleIdx = (a + 2) * (rows + 2);

            const int curVotes = hist(curAngleIdx + y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes > hist(curAngleIdx + y + 1, x) &&
                curVotes >= hist(curAngleIdx + y + 1, x + 2) &&
                curVotes > hist(curAngleIdx + y, x + 1) &&
                curVotes >= hist(curAngleIdx + y + 2, x + 1) &&
                curVotes > hist(prevAngleIdx + y + 1, x + 1) &&
                curVotes >= hist(nextAngleIdx + y + 1, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, 1.0f, angle);
                    votes[ind] = make_int3(curVotes, 0, curVotes);
                }
            }
        }
    }

    int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
                                                  float minAngle, float angleStep, float dp, int threshold)
    {
        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
        cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

        const dim3 block(32, 8);
        const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );

        GHT_Ballard_PosRotation_findPosInHist<<<grid, block>>>(hist, rows, cols, angleRange, out, votes, maxSize, minAngle, angleStep, dp, threshold);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        totalCount = ::min(totalCount, maxSize);

        return totalCount;
    }

    ////////////////////////////////////////////////////////////////////////
    // GHT_Guil_Full

    struct FeatureTable
    {
        uchar* p1_pos_data;   size_t p1_pos_step;
        uchar* p1_theta_data; size_t p1_theta_step;
        uchar* p2_pos_data;   size_t p2_pos_step;
        uchar* d12_data;      size_t d12_step;
        uchar* r1_data;       size_t r1_step;
        uchar* r2_data;       size_t r2_step;
    };

    __constant__ FeatureTable c_templFeatures;
    __constant__ FeatureTable c_imageFeatures;

    void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
    {
        FeatureTable tbl;

        tbl.p1_pos_data = p1_pos.data;
        tbl.p1_pos_step = p1_pos.step;
        tbl.p1_theta_data = p1_theta.data;
        tbl.p1_theta_step = p1_theta.step;
        tbl.p2_pos_data = p2_pos.data;
        tbl.p2_pos_step = p2_pos.step;
        tbl.d12_data = d12.data;
        tbl.d12_step = d12.step;
        tbl.r1_data = r1.data;
        tbl.r1_step = r1.step;
        tbl.r2_data = r2.data;
        tbl.r2_step = r2.step;

        cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
    }

    void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
    {
        FeatureTable tbl;

        tbl.p1_pos_data = p1_pos.data;
        tbl.p1_pos_step = p1_pos.step;
        tbl.p1_theta_data = p1_theta.data;
        tbl.p1_theta_step = p1_theta.step;
        tbl.p2_pos_data = p2_pos.data;
        tbl.p2_pos_step = p2_pos.step;
        tbl.d12_data = d12.data;
        tbl.d12_step = d12.step;
        tbl.r1_data = r1.data;
        tbl.r1_step = r1.step;
        tbl.r2_data = r2.data;
        tbl.r2_step = r2.step;

        cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
    }

    struct TemplFeatureTable
    {
        static __device__ float2* p1_pos(int n)
        {
            return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
        }
        static __device__ float* p1_theta(int n)
        {
            return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
        }
        static __device__ float2* p2_pos(int n)
        {
            return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
        }
        static __device__ float* d12(int n)
        {
            return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
        }
        static __device__ float2* r1(int n)
        {
            return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
        }
        static __device__ float2* r2(int n)
        {
            return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
        }
    };

    struct ImageFeatureTable
    {
        static __device__ float2* p1_pos(int n)
        {
            return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
        }
        static __device__ float* p1_theta(int n)
        {
            return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
        }
        static __device__ float2* p2_pos(int n)
        {
            return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
        }
        static __device__ float* d12(int n)
        {
            return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
        }
        static __device__ float2* r1(int n)
        {
            return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
        }
        static __device__ float2* r2(int n)
        {
            return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
        }
    };

    __device__ float clampAngle(float a)
    {
        float res = a;
        while (res > 2.0f * CV_PI_F)
            res -= 2.0f * CV_PI_F;
        while (res < 0.0f)
            res += 2.0f * CV_PI_F;
        return res;
    }

    __device__ bool angleEq(float a, float b, float eps)
    {
        return (::fabs(clampAngle(a - b)) <= eps);
    }
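
    // Example: clampAngle(7.0f) ~= 0.717f and clampAngle(-1.0f) ~= 5.283f,
    // so all angle arithmetic below works in [0, 2*pi). angleEq compares the
    // wrapped difference against eps; e.g. angleEq(0.1f, 6.38f, 0.05f) is
    // true because clampAngle(0.1f - 6.38f) ~= 0.003f.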

    template <class FT, bool isTempl>
    __global__ void GHT_Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                   int* sizes, const int maxSize,
                                                   const float xi, const float angleEpsilon, const float alphaScale,
                                                   const float2 center, const float maxDist)
    {
        const float p1_theta = thetaList[blockIdx.x];
        const unsigned int coord1 = coordList[blockIdx.x];
        float2 p1_pos;
        p1_pos.x = (coord1 & 0xFFFF);
        p1_pos.y = (coord1 >> 16) & 0xFFFF;

        for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
        {
            const float p2_theta = thetaList[i];
            const unsigned int coord2 = coordList[i];
            float2 p2_pos;
            p2_pos.x = (coord2 & 0xFFFF);
            p2_pos.y = (coord2 >> 16) & 0xFFFF;

            if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
            {
                const float2 d = p1_pos - p2_pos;

                float alpha12 = clampAngle(::atan2(d.y, d.x) - p1_theta);
                float d12 = ::sqrtf(d.x * d.x + d.y * d.y);
                if (d12 > maxDist)
                    continue;

                float2 r1 = p1_pos - center;
                float2 r2 = p2_pos - center;

                const int n = __float2int_rn(alpha12 * alphaScale);
                const int ind = ::atomicAdd(sizes + n, 1);

                if (ind < maxSize)
                {
                    if (!isTempl)
                    {
                        FT::p1_pos(n)[ind] = p1_pos;
                        FT::p2_pos(n)[ind] = p2_pos;
                    }

                    FT::p1_theta(n)[ind] = p1_theta;
                    FT::d12(n)[ind] = d12;

                    if (isTempl)
                    {
                        FT::r1(n)[ind] = r1;
                        FT::r2(n)[ind] = r2;
                    }
                }
            }
        }
    }

    template <class FT, bool isTempl>
    void GHT_Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                               int* sizes, int maxSize,
                                               float xi, float angleEpsilon, int levels,
                                               float2 center, float maxDist)
    {
        const dim3 block(256);
        const dim3 grid(pointsCount);

        const float alphaScale = levels / (2.0f * CV_PI_F);

        GHT_Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
                                                                     sizes, maxSize,
                                                                     xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
                                                                     center, maxDist);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        thrust::device_ptr<int> sizesPtr(sizes);
        thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
    }

    void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                 int* sizes, int maxSize,
                                                 float xi, float angleEpsilon, int levels,
                                                 float2 center, float maxDist)
    {
        GHT_Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
                                                                       sizes, maxSize,
                                                                       xi, angleEpsilon, levels,
                                                                       center, maxDist);
    }

    void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                 int* sizes, int maxSize,
                                                 float xi, float angleEpsilon, int levels,
                                                 float2 center, float maxDist)
    {
        GHT_Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
                                                                        sizes, maxSize,
                                                                        xi, angleEpsilon, levels,
                                                                        center, maxDist);
    }

    __global__ void GHT_Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
                                            const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
    {
        extern __shared__ int s_OHist[];
        for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
            s_OHist[i] = 0;
        __syncthreads();

        const int tIdx = blockIdx.x;
        const int level = blockIdx.y;
        const int tSize = templSizes[level];

        if (tIdx < tSize)
        {
            const int imSize = imageSizes[level];
            const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];

            for (int i = threadIdx.x; i < imSize; i += blockDim.x)
            {
                const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
                const float angle = clampAngle(im_p1_theta - t_p1_theta);
                if (angle >= minAngle && angle <= maxAngle)
                {
                    const int n = __float2int_rn((angle - minAngle) * iAngleStep);
                    Emulation::smem::atomicAdd(&s_OHist[n], 1);
                }
            }
        }
        __syncthreads();

        for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
            ::atomicAdd(OHist + i, s_OHist[i]);
    }

    void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
                                     float minAngle, float maxAngle, float angleStep, int angleRange,
                                     int levels, int tMaxSize)
    {
        const dim3 block(256);
        const dim3 grid(tMaxSize, levels + 1);

        minAngle *= (CV_PI_F / 180.0f);
        maxAngle *= (CV_PI_F / 180.0f);
        angleStep *= (CV_PI_F / 180.0f);

        const size_t smemSize = (angleRange + 1) * sizeof(int);

        GHT_Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
                                                           minAngle, maxAngle, 1.0f / angleStep, angleRange);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    __global__ void GHT_Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
                                            const float angle, const float angleEpsilon,
                                            const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
    {
        extern __shared__ int s_SHist[];
        for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
            s_SHist[i] = 0;
        __syncthreads();

        const int tIdx = blockIdx.x;
        const int level = blockIdx.y;
        const int tSize = templSizes[level];

        if (tIdx < tSize)
        {
            const int imSize = imageSizes[level];

            const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
            const float t_d12 = TemplFeatureTable::d12(level)[tIdx];

            for (int i = threadIdx.x; i < imSize; i += blockDim.x)
            {
                const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
                const float im_d12 = ImageFeatureTable::d12(level)[i];

                if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                {
                    const float scale = im_d12 / t_d12;
                    if (scale >= minScale && scale <= maxScale)
                    {
                        const int s = __float2int_rn((scale - minScale) * iScaleStep);
                        Emulation::smem::atomicAdd(&s_SHist[s], 1);
                    }
                }
            }
        }
        __syncthreads();

        for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
            ::atomicAdd(SHist + i, s_SHist[i]);
    }

    void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
                                     float angle, float angleEpsilon,
                                     float minScale, float maxScale, float iScaleStep, int scaleRange,
                                     int levels, int tMaxSize)
    {
        const dim3 block(256);
        const dim3 grid(tMaxSize, levels + 1);

        angle *= (CV_PI_F / 180.0f);
        angleEpsilon *= (CV_PI_F / 180.0f);

        const size_t smemSize = (scaleRange + 1) * sizeof(int);

        GHT_Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
                                                           angle, angleEpsilon,
                                                           minScale, maxScale, iScaleStep, scaleRange);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    __global__ void GHT_Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                            const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
                                            const float idp)
    {
        const int tIdx = blockIdx.x;
        const int level = blockIdx.y;
        const int tSize = templSizes[level];

        if (tIdx < tSize)
        {
            const int imSize = imageSizes[level];
            const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;

            float2 r1 = TemplFeatureTable::r1(level)[tIdx];
            float2 r2 = TemplFeatureTable::r2(level)[tIdx];

            r1 = r1 * scale;
            r2 = r2 * scale;

            r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
            r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);

            for (int i = threadIdx.x; i < imSize; i += blockDim.x)
            {
                const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
                const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
                const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];

                if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                {
                    float2 c1, c2;

                    c1 = im_p1_pos - r1;
                    c1 = c1 * idp;

                    c2 = im_p2_pos - r2;
                    c2 = c2 * idp;

                    if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
                        continue;

                    if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
                        ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
                }
            }
        }
    }

    void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                     float angle, float angleEpsilon, float scale,
                                     float dp,
                                     int levels, int tMaxSize)
    {
        const dim3 block(256);
        const dim3 grid(tMaxSize, levels + 1);

        angle *= (CV_PI_F / 180.0f);
        angleEpsilon *= (CV_PI_F / 180.0f);

        const float sinVal = ::sinf(angle);
        const float cosVal = ::cosf(angle);

        cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_calcPHist, cudaFuncCachePreferL1) );

        GHT_Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
                                                 angle, sinVal, cosVal, angleEpsilon, scale,
                                                 1.0f / dp);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
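
    // Pipeline sketch (hypothetical host-side driver, matching the wrappers
    // in this section): the Guil variant factorizes the 4-D pose search into
    // cheap 1-D/2-D votes instead of one huge 4-D accumulator:
    //
    //     GHT_Guil_Full_calcOHist_gpu(...);            // 1. vote rotation angles
    //     // for each strong angle:
    //         GHT_Guil_Full_calcSHist_gpu(...);        // 2. vote scales
    //         // for each strong scale:
    //             GHT_Guil_Full_calcPHist_gpu(...);    // 3. vote positions
    //             GHT_Guil_Full_findPosInHist_gpu(...);// 4. peak extraction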

    __global__ void GHT_Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
                                                const float angle, const int angleVotes, const float scale, const int scaleVotes,
                                                const float dp, const int threshold)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= hist.cols - 2 || y >= hist.rows - 2)
            return;

        const int curVotes = hist(y + 1, x + 1);

        if (curVotes > threshold &&
            curVotes > hist(y + 1, x) &&
            curVotes >= hist(y + 1, x + 2) &&
            curVotes > hist(y, x + 1) &&
            curVotes >= hist(y + 2, x + 1))
        {
            const int ind = ::atomicAdd(&g_counter, 1);
            if (ind < maxSize)
            {
                out[ind] = make_float4(x * dp, y * dp, scale, angle);
                votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
            }
        }
    }

    int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
                                        float angle, int angleVotes, float scale, int scaleVotes,
                                        float dp, int threshold)
    {
        void* counterPtr;
        cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

        // keep already-found candidates: the counter starts at curSize
        cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );

        const dim3 block(32, 8);
        const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_findPosInHist, cudaFuncCachePreferL1) );

        GHT_Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
                                                     angle, angleVotes, scale, scaleVotes,
                                                     dp, threshold);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );

        int totalCount;
        cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

        totalCount = ::min(totalCount, maxSize);

        return totalCount;
    }
}}} // namespace cv { namespace gpu { namespace device