/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/dynamic_smem.hpp"
namespace cv { namespace gpu { namespace device
{
    namespace hough
    {
        __device__ int g_counter;

        ////////////////////////////////////////////////////////////////////////
        // buildPointList

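        // Compacts the non-zero pixels of src into a flat global list. Each row of
        // threads in a block first gathers its hits into a per-row shared-memory
        // queue (cheap shared-memory atomics), then a single thread reserves one
        // contiguous range in the global list with a single atomicAdd on g_counter,
        // and finally all threads copy their queues out. Each point is packed as
        // (y << 16) | x.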
        template <int PIXELS_PER_THREAD>
        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
        {
            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_qsize[threadIdx.y] = 0;
            __syncthreads();

            if (y < src.rows)
            {
                // fill the queue
                const uchar* srcRow = src.ptr(y);
                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
                {
                    if (srcRow[xx])
                    {
                        const unsigned int val = (y << 16) | xx;
                        const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
                        s_queues[threadIdx.y][qidx] = val;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_qsize[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_qsize[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
                list[gidx] = s_queues[threadIdx.y][i];
        }

        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
        {
            const int PIXELS_PER_THREAD = 16;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // linesAccum

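        // Vote accumulation for the classical Hough line transform: a line is
        // parameterized as rho = x*cos(theta) + y*sin(theta). One block handles one
        // theta (one accumulator row); each thread strides over the point list and
        // increments the bin of the rounded, shifted rho. The shared variant
        // accumulates into dynamic shared memory and writes the row out once.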
        __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                ::atomicAdd(accumRow + r + 1, 1);
            }
        }

        __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                smem[i] = 0;

            __syncthreads();

            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }

            __syncthreads();

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                accumRow[i] = smem[i];
        }

        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
        {
            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(accum.rows - 2);

            size_t smemSize = (accum.cols - 1) * sizeof(int);

            if (smemSize < sharedMemPerBlock - 1000)
                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
            else
                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // linesGetResult

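        // Extracts line candidates: a bin is accepted if it exceeds the threshold
        // and is a local maximum of its 3x3 neighborhood in (rho, theta). The mixed
        // use of > and >= breaks ties between equal neighbors so a plateau yields a
        // single line. The accumulator has a one-cell border, hence the +1 offsets.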
        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
        {
            const int r = blockIdx.x * blockDim.x + threadIdx.x;
            const int n = blockIdx.y * blockDim.y + threadIdx.y;

            if (r >= accum.cols - 2 || n >= accum.rows - 2)
                return;

            const int curVotes = accum(n + 1, r + 1);

            if (curVotes > threshold &&
                curVotes >  accum(n + 1, r) &&
                curVotes >= accum(n + 1, r + 2) &&
                curVotes >  accum(n, r + 1) &&
                curVotes >= accum(n + 2, r + 1))
            {
                const float radius = (r - (numrho - 1) * 0.5f) * rho;
                const float angle = n * theta;

                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float2(radius, angle);
                    votes[ind] = curVotes;
                }
            }
        }

        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );

            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            if (doSort && totalCount > 0)
            {
                thrust::device_ptr<float2> outPtr(out);
                thrust::device_ptr<int> votesPtr(votes);
                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
            }

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumCenters

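        // First phase of the Hough circle transform: every edge point votes for
        // potential centers along its gradient direction, in both orientations,
        // for radii in [minRadius, maxRadius]. The walk uses fixed-point
        // arithmetic with SHIFT = 10 fractional bits to avoid per-step float
        // rounding.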
        __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
                                            PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
        {
            const int SHIFT = 10;
            const int ONE = 1 << SHIFT;

            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= count)
                return;

            const unsigned int val = list[tid];

            const int x = (val & 0xFFFF);
            const int y = (val >> 16) & 0xFFFF;

            const int vx = dx(y, x);
            const int vy = dy(y, x);

            if (vx == 0 && vy == 0)
                return;

            const float mag = ::sqrtf(vx * vx + vy * vy);

            const int x0 = __float2int_rn((x * idp) * ONE);
            const int y0 = __float2int_rn((y * idp) * ONE);

            int sx = __float2int_rn((vx * idp) * ONE / mag);
            int sy = __float2int_rn((vy * idp) * ONE / mag);

            // Step from minRadius to maxRadius in both directions of the gradient
            for (int k1 = 0; k1 < 2; ++k1)
            {
                int x1 = x0 + minRadius * sx;
                int y1 = y0 + minRadius * sy;

                for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
                {
                    const int x2 = x1 >> SHIFT;
                    const int y2 = y1 >> SHIFT;

                    if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
                        break;

                    ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
                }

                sx = -sx;
                sy = -sy;
            }
        }

        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
        {
            const dim3 block(256);
            const dim3 grid(divUp(count, block.x));

            cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );

            circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // buildCentersList

        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < accum.cols - 2 && y < accum.rows - 2)
            {
                const int top = accum(y, x + 1);

                const int left = accum(y + 1, x);
                const int cur = accum(y + 1, x + 1);
                const int right = accum(y + 1, x + 2);

                const int bottom = accum(y + 2, x + 1);

                if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
                {
                    const unsigned int val = (y << 16) | x;
                    const int idx = ::atomicAdd(&g_counter, 1);
                    centers[idx] = val;
                }
            }
        }

        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );

            buildCentersList<<<grid, block>>>(accum, centers, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumRadius

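        // Second phase: one block per candidate center builds a shared-memory
        // histogram over radii from the distances to all edge points, then emits
        // a circle for every histogram bin that is a sufficiently voted local
        // maximum.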
        __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
                                           float3* circles, const int maxCircles, const float dp,
                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
                smem[i] = 0;
            __syncthreads();

            unsigned int val = centers[blockIdx.x];

            float cx = (val & 0xFFFF);
            float cy = (val >> 16) & 0xFFFF;

            cx = (cx + 0.5f) * dp;
            cy = (cy + 0.5f) * dp;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
                if (rad >= minRadius && rad <= maxRadius)
                {
                    const int r = __float2int_rn(rad - minRadius);

                    Emulation::smem::atomicAdd(&smem[r + 1], 1);
                }
            }

            __syncthreads();

            for (int i = threadIdx.x; i < histSize; i += blockDim.x)
            {
                const int curVotes = smem[i + 1];

                if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
                {
                    const int ind = ::atomicAdd(&g_counter, 1);
                    if (ind < maxCircles)
                        circles[ind] = make_float3(cx, cy, i + minRadius);
                }
            }
        }

        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(centersCount);

            const int histSize = maxRadius - minRadius + 1;
            size_t smemSize = (histSize + 2) * sizeof(int);

            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxCircles);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // Generalized Hough

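        // The Generalized Hough pipeline starts by compacting edge pixels with a
        // non-zero gradient into coordinate/orientation lists, using the same
        // two-level (shared, then global) queue scheme as buildPointList above.
        // The gradient angle atan2(dy, dx) is normalized to [0, 2*pi).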
        template <typename T, int PIXELS_PER_THREAD>
        __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy, unsigned int* coordList, float* thetaList)
        {
            __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
            __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_sizes[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_sizes[threadIdx.y] = 0;
            __syncthreads();

            if (y < edges.rows)
            {
                // fill the queue
                const uchar* edgesRow = edges.ptr(y);
                const T* dxRow = dx.ptr(y);
                const T* dyRow = dy.ptr(y);

                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
                {
                    const T dxVal = dxRow[xx];
                    const T dyVal = dyRow[xx];

                    if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
                    {
                        const unsigned int coord = (y << 16) | xx;

                        float theta = ::atan2f(dyVal, dxVal);
                        if (theta < 0)
                            theta += 2.0f * CV_PI_F;

                        const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);

                        s_coordLists[threadIdx.y][qidx] = coord;
                        s_thetaLists[threadIdx.y][qidx] = theta;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_sizes[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_sizes[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
            {
                coordList[gidx] = s_coordLists[threadIdx.y][i];
                thetaList[gidx] = s_thetaLists[threadIdx.y][i];
            }
        }

        template <typename T>
        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
        {
            const int PIXELS_PER_THREAD = 8;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);

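        // R-table construction (Ballard): row n of r_table holds, for the n-th
        // quantized gradient orientation, the displacement vectors from template
        // edge points to the template center; r_sizes[n] counts the entries of
        // that row.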
        __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                    PtrStep<short2> r_table, int* r_sizes, int maxSize,
                                    const short2 templCenter, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const int ind = ::atomicAdd(r_sizes + n, 1);
            if (ind < maxSize)
                r_table(n, ind) = p - templCenter;
        }

        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                             PtrStepSz<short2> r_table, int* r_sizes,
                             short2 templCenter, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float thetaScale = levels / (2.0f * CV_PI_F);

            buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_Pos

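        // Position-only Ballard voting: each edge point looks up the R-table row
        // matching its quantized orientation and votes for every implied center
        // position in a 2D histogram (with a one-cell border, hence the +1 offsets).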
        __global__ void GHT_Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                 const PtrStep<short2> r_table, const int* r_sizes,
                                                 PtrStepSzi hist,
                                                 const float idp, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                short2 c = p - r_row[j];

                c.x = __float2int_rn(c.x * idp);
                c.y = __float2int_rn(c.y * idp);

                if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
                    ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
            }
        }

        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                          PtrStepSz<short2> r_table, const int* r_sizes,
                                          PtrStepSzi hist,
                                          float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            const int curVotes = hist(y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
                    votes[ind] = make_int3(curVotes, 0, 0);
                }
            }
        }

        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosScale

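        // Position + scale variant: the 3D histogram is stored as scaleRange + 2
        // stacked 2D slabs of height rows + 2, so slab s + 1 is addressed as
        // hist.ptr((s + 1) * (rows + 2) + y). Threads parallelize over scales for
        // each fixed R-table displacement.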
        __global__ void GHT_Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
                                                      PtrStep<short2> r_table, const int* r_sizes,
                                                      PtrStepi hist, const int rows, const int cols,
                                                      const float minScale, const float scaleStep, const int scaleRange,
                                                      const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[blockIdx.x];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                const float2 d = saturate_cast<float2>(r_row[j]);

                for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
                {
                    const float scale = minScale + s * scaleStep;

                    float2 c = p - scale * d;

                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((s + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                               PtrStepSz<short2> r_table, const int* r_sizes,
                                               PtrStepi hist, int rows, int cols,
                                               float minScale, float scaleStep, int scaleRange,
                                               float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosScale_calcHist<<<grid, block>>>(coordList, thetaList,
                                                           r_table, r_sizes,
                                                           hist, rows, cols,
                                                           minScale, scaleStep, scaleRange,
                                                           idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
                                                           float4* out, int3* votes, const int maxSize,
                                                           const float minScale, const float scaleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int s = 0; s < scaleRange; ++s)
            {
                const float scale = minScale + s * scaleStep;

                const int prevScaleIdx = (s) * (rows + 2);
                const int curScaleIdx = (s + 1) * (rows + 2);
                const int nextScaleIdx = (s + 2) * (rows + 2);

                const int curVotes = hist(curScaleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curScaleIdx + y + 1, x) &&
                    curVotes >= hist(curScaleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curScaleIdx + y, x + 1) &&
                    curVotes >= hist(curScaleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevScaleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextScaleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);
                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, scale, 0.0f);
                        votes[ind] = make_int3(curVotes, curVotes, 0);
                    }
                }
            }
        }

        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
                                                   float minScale, float scaleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosScale_findPosInHist<<<grid, block>>>(hist, rows, cols, scaleRange, out, votes, maxSize, minScale, scaleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosRotation

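        // Position + rotation variant: for each candidate rotation the edge
        // orientation is shifted before the R-table lookup and the displacement is
        // rotated by the same angle; the histogram layout mirrors the scale variant,
        // with one 2D slab per angle.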
        __global__ void GHT_Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
                                                         PtrStep<short2> r_table, const int* r_sizes,
                                                         PtrStepi hist, const int rows, const int cols,
                                                         const float minAngle, const float angleStep, const int angleRange,
                                                         const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float thetaVal = thetaList[blockIdx.x];

            for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
            {
                const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f);
                float sinA, cosA;
                sincosf(angle, &sinA, &cosA);

                float theta = thetaVal - angle;
                if (theta < 0)
                    theta += 2.0f * CV_PI_F;

                const int n = __float2int_rn(theta * thetaScale);

                const short2* r_row = r_table.ptr(n);
                const int r_row_size = r_sizes[n];

                for (int j = 0; j < r_row_size; ++j)
                {
                    const float2 d = saturate_cast<float2>(r_row[j]);

                    const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);

                    float2 c = make_float2(p.x - dr.x, p.y - dr.y);
                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                  PtrStepSz<short2> r_table, const int* r_sizes,
                                                  PtrStepi hist, int rows, int cols,
                                                  float minAngle, float angleStep, int angleRange,
                                                  float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosRotation_calcHist<<<grid, block>>>(coordList, thetaList,
                                                              r_table, r_sizes,
                                                              hist, rows, cols,
                                                              minAngle, angleStep, angleRange,
                                                              idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
                                                              float4* out, int3* votes, const int maxSize,
                                                              const float minAngle, const float angleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int a = 0; a < angleRange; ++a)
            {
                const float angle = minAngle + a * angleStep;

                const int prevAngleIdx = (a) * (rows + 2);
                const int curAngleIdx = (a + 1) * (rows + 2);
                const int nextAngleIdx = (a + 2) * (rows + 2);

                const int curVotes = hist(curAngleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curAngleIdx + y + 1, x) &&
                    curVotes >= hist(curAngleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curAngleIdx + y, x + 1) &&
                    curVotes >= hist(curAngleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevAngleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextAngleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);
                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, 1.0f, angle);
                        votes[ind] = make_int3(curVotes, 0, curVotes);
                    }
                }
            }
        }

        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
                                                      float minAngle, float angleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosRotation_findPosInHist<<<grid, block>>>(hist, rows, cols, angleRange, out, votes, maxSize, minAngle, angleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full

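        // Guil's full-invariance variant matches *pairs* of edge points whose
        // orientations differ by xi. For each pair it stores the first point's
        // orientation, the pair distance d12, and (for the template) the vectors
        // r1, r2 to the center, binned by the pair angle alpha12. The tables live
        // in constant memory and are filled by the two setters below.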
        struct FeatureTable
        {
            uchar* p1_pos_data;
            size_t p1_pos_step;

            uchar* p1_theta_data;
            size_t p1_theta_step;

            uchar* p2_pos_data;
            size_t p2_pos_step;

            uchar* d12_data;
            size_t d12_step;

            uchar* r1_data;
            size_t r1_step;

            uchar* r2_data;
            size_t r2_step;
        };

        __constant__ FeatureTable c_templFeatures;
        __constant__ FeatureTable c_imageFeatures;

        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
        }

        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
        }

        struct TemplFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
            }

            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
            }

            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
            }

            static __device__ float2* r2(int n)
            {
                return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
            }
        };

        struct ImageFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
            }

            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
            }

            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
            }

            static __device__ float2* r2(int n)
            {
                return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
            }
        };

        __device__ float clampAngle(float a)
        {
            float res = a;

            while (res > 2.0f * CV_PI_F)
                res -= 2.0f * CV_PI_F;
            while (res < 0.0f)
                res += 2.0f * CV_PI_F;

            return res;
        }

        __device__ bool angleEq(float a, float b, float eps)
        {
            return (::fabs(clampAngle(a - b)) <= eps);
        }

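        // Builds the feature table for either the template (isTempl = true, storing
        // r1/r2) or the image (storing the pair positions). One block scans all
        // partners for one first point; pairs farther apart than maxDist are skipped.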
        template <class FT, bool isTempl>
        __global__ void GHT_Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                       int* sizes, const int maxSize,
                                                       const float xi, const float angleEpsilon, const float alphaScale,
                                                       const float2 center, const float maxDist)
        {
            const float p1_theta = thetaList[blockIdx.x];
            const unsigned int coord1 = coordList[blockIdx.x];
            float2 p1_pos;
            p1_pos.x = (coord1 & 0xFFFF);
            p1_pos.y = (coord1 >> 16) & 0xFFFF;

            for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
            {
                const float p2_theta = thetaList[i];
                const unsigned int coord2 = coordList[i];
                float2 p2_pos;
                p2_pos.x = (coord2 & 0xFFFF);
                p2_pos.y = (coord2 >> 16) & 0xFFFF;

                if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
                {
                    const float2 d = p1_pos - p2_pos;

                    float alpha12 = clampAngle(::atan2(d.y, d.x) - p1_theta);
                    float d12 = ::sqrtf(d.x * d.x + d.y * d.y);

                    if (d12 > maxDist)
                        continue;

                    float2 r1 = p1_pos - center;
                    float2 r2 = p2_pos - center;

                    const int n = __float2int_rn(alpha12 * alphaScale);

                    const int ind = ::atomicAdd(sizes + n, 1);

                    if (ind < maxSize)
                    {
                        if (!isTempl)
                        {
                            FT::p1_pos(n)[ind] = p1_pos;
                            FT::p2_pos(n)[ind] = p2_pos;
                        }

                        FT::p1_theta(n)[ind] = p1_theta;

                        FT::d12(n)[ind] = d12;

                        if (isTempl)
                        {
                            FT::r1(n)[ind] = r1;
                            FT::r2(n)[ind] = r2;
                        }
                    }
                }
            }
        }

        template <class FT, bool isTempl>
        void GHT_Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                   int* sizes, int maxSize,
                                                   float xi, float angleEpsilon, int levels,
                                                   float2 center, float maxDist)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float alphaScale = levels / (2.0f * CV_PI_F);

            GHT_Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
                                                                         sizes, maxSize,
                                                                         xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
                                                                         center, maxDist);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            thrust::device_ptr<int> sizesPtr(sizes);
            thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
        }

        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
                                                                           sizes, maxSize,
                                                                           xi, angleEpsilon, levels,
                                                                           center, maxDist);
        }

        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
                                                                            sizes, maxSize,
                                                                            xi, angleEpsilon, levels,
                                                                            center, maxDist);
        }

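        // Detection proceeds in three histogram passes: orientation (OHist), then
        // scale (SHist) for a fixed orientation, then position (PHist) for a fixed
        // orientation and scale. Each pass pairs template features with image
        // features from the same alpha12 level (blockIdx.y).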
        __global__ void GHT_Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
                                                const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
        {
            extern __shared__ int s_OHist[];
            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
                s_OHist[i] = 0;
            __syncthreads();

            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];

                    const float angle = clampAngle(im_p1_theta - t_p1_theta);

                    if (angle >= minAngle && angle <= maxAngle)
                    {
                        const int n = __float2int_rn((angle - minAngle) * iAngleStep);
                        Emulation::smem::atomicAdd(&s_OHist[n], 1);
                    }
                }
            }
            __syncthreads();

            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
                ::atomicAdd(OHist + i, s_OHist[i]);
        }

        void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
                                         float minAngle, float maxAngle, float angleStep, int angleRange,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            minAngle *= (CV_PI_F / 180.0f);
            maxAngle *= (CV_PI_F / 180.0f);
            angleStep *= (CV_PI_F / 180.0f);

            const size_t smemSize = (angleRange + 1) * sizeof(int);

            GHT_Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
                                                               minAngle, maxAngle, 1.0f / angleStep, angleRange);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

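        // For a fixed rotation, the scale of a matching pair is the ratio of the
        // image pair distance to the template pair distance; matches are gated by
        // the rotated orientation test angleEq(im_p1_theta, t_p1_theta, eps).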
        __global__ void GHT_Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
                                                const float angle, const float angleEpsilon,
                                                const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
        {
            extern __shared__ int s_SHist[];
            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
                s_SHist[i] = 0;
            __syncthreads();

            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
                const float t_d12 = TemplFeatureTable::d12(level)[tIdx];

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
                    const float im_d12 = ImageFeatureTable::d12(level)[i];

                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                    {
                        const float scale = im_d12 / t_d12;

                        if (scale >= minScale && scale <= maxScale)
                        {
                            const int s = __float2int_rn((scale - minScale) * iScaleStep);
                            Emulation::smem::atomicAdd(&s_SHist[s], 1);
                        }
                    }
                }
            }
            __syncthreads();

            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
                ::atomicAdd(SHist + i, s_SHist[i]);
        }

        void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
                                         float angle, float angleEpsilon,
                                         float minScale, float maxScale, float iScaleStep, int scaleRange,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            angle *= (CV_PI_F / 180.0f);
            angleEpsilon *= (CV_PI_F / 180.0f);

            const size_t smemSize = (scaleRange + 1) * sizeof(int);

            GHT_Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
                                                               angle, angleEpsilon,
                                                               minScale, maxScale, iScaleStep, scaleRange);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

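        // Position pass: the template vectors r1, r2 are scaled and rotated once
        // per block; each matching image pair then predicts two center estimates
        // c1, c2, which must agree to within one cell before c1 casts the vote.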
        __global__ void GHT_Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                                const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
                                                const float idp)
        {
            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;

                float2 r1 = TemplFeatureTable::r1(level)[tIdx];
                float2 r2 = TemplFeatureTable::r2(level)[tIdx];

                r1 = r1 * scale;
                r2 = r2 * scale;

                r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
                r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];

                    const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
                    const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];

                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                    {
                        float2 c1, c2;

                        c1 = im_p1_pos - r1;
                        c1 = c1 * idp;

                        c2 = im_p2_pos - r2;
                        c2 = c2 * idp;

                        if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
                            continue;

                        if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
                            ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
                    }
                }
            }
        }

        void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                         float angle, float angleEpsilon, float scale,
                                         float dp,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            angle *= (CV_PI_F / 180.0f);
            angleEpsilon *= (CV_PI_F / 180.0f);

            const float sinVal = ::sinf(angle);
            const float cosVal = ::cosf(angle);

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_calcPHist, cudaFuncCachePreferL1) );

            GHT_Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
                                                     angle, sinVal, cosVal, angleEpsilon, scale,
                                                     1.0f / dp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
                                                    const float angle, const int angleVotes, const float scale, const int scaleVotes,
                                                    const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            const int curVotes = hist(y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, scale, angle);
                    votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
                }
            }
        }

        int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
                                            float angle, int angleVotes, float scale, int scaleVotes,
                                            float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
                                                         angle, angleVotes, scale, scaleVotes,
                                                         dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }
    }
}}}

#endif /* CUDA_DISABLER */