added Generalized Hough implementation
opencv.git / modules/gpu/src/cuda/hough.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/dynamic_smem.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace hough
    {
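        // Kernels that emit a variable number of items share this device-global
        // counter: the host zeroes it with cudaMemset before a launch, kernels
        // reserve output slots with atomicAdd, and the host copies the final
        // value back to learn how many items were written.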
        __device__ int g_counter;

        ////////////////////////////////////////////////////////////////////////
        // buildPointList

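        // Each row of the block (one warp) first compacts its non-zero pixels
        // into a per-row shared-memory queue. One thread then reserves space for
        // the whole block in the global list with a single atomicAdd on g_counter,
        // and the queues are flushed to that range. This costs one global atomic
        // per block row instead of one per pixel. Points are packed as (y << 16) | x.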
        template <int PIXELS_PER_THREAD>
        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
        {
            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_qsize[threadIdx.y] = 0;
            __syncthreads();

            if (y < src.rows)
            {
                // fill the queue
                const uchar* srcRow = src.ptr(y);
                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
                {
                    if (srcRow[xx])
                    {
                        const unsigned int val = (y << 16) | xx;
                        const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
                        s_queues[threadIdx.y][qidx] = val;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_qsize[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_qsize[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
                list[gidx] = s_queues[threadIdx.y][i];
        }

        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
        {
            const int PIXELS_PER_THREAD = 16;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // linesAccum

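        // Two voting strategies: linesAccumShared accumulates one theta row of
        // the rho histogram in shared memory and writes it out once, while
        // linesAccumGlobal votes directly in global memory. The host side picks
        // the shared variant whenever the row fits (with some slack) into the
        // available shared memory.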
        __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                ::atomicAdd(accumRow + r + 1, 1);
            }
        }

        __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                smem[i] = 0;

            __syncthreads();

            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }

            __syncthreads();

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                accumRow[i] = smem[i];
        }

        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
        {
            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(accum.rows - 2);

            size_t smemSize = (accum.cols - 1) * sizeof(int);

            if (smemSize < sharedMemPerBlock - 1000)
                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
            else
                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // linesGetResult

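        // Non-maximum suppression over the (rho, theta) accumulator: a bin is
        // kept only if it beats its four neighbours; the mixed > / >= comparisons
        // break ties on plateaus so that exactly one of two equal neighbours
        // survives.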
        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
        {
            const int r = blockIdx.x * blockDim.x + threadIdx.x;
            const int n = blockIdx.y * blockDim.y + threadIdx.y;

            if (r >= accum.cols - 2 || n >= accum.rows - 2)
                return;

            const int curVotes = accum(n + 1, r + 1);

            if (curVotes > threshold &&
                curVotes >  accum(n + 1, r) &&
                curVotes >= accum(n + 1, r + 2) &&
                curVotes >  accum(n, r + 1) &&
                curVotes >= accum(n + 2, r + 1))
            {
                const float radius = (r - (numrho - 1) * 0.5f) * rho;
                const float angle = n * theta;

                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float2(radius, angle);
                    votes[ind] = curVotes;
                }
            }
        }

        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );

            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            if (doSort && totalCount > 0)
            {
                thrust::device_ptr<float2> outPtr(out);
                thrust::device_ptr<int> votesPtr(votes);
                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
            }

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumCenters

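        // The gradient direction is walked in fixed-point arithmetic with SHIFT
        // fractional bits, so each radius step is a pair of integer additions;
        // every point between minRadius and maxRadius votes, in both directions
        // along the gradient, in an accumulator with a one-cell guard border.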
        __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
                                            PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
        {
            const int SHIFT = 10;
            const int ONE = 1 << SHIFT;

            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= count)
                return;

            const unsigned int val = list[tid];

            const int x = (val & 0xFFFF);
            const int y = (val >> 16) & 0xFFFF;

            const int vx = dx(y, x);
            const int vy = dy(y, x);

            if (vx == 0 && vy == 0)
                return;

            const float mag = ::sqrtf(vx * vx + vy * vy);

            const int x0 = __float2int_rn((x * idp) * ONE);
            const int y0 = __float2int_rn((y * idp) * ONE);

            int sx = __float2int_rn((vx * idp) * ONE / mag);
            int sy = __float2int_rn((vy * idp) * ONE / mag);

            // Step from minRadius to maxRadius in both directions of the gradient
            for (int k1 = 0; k1 < 2; ++k1)
            {
                int x1 = x0 + minRadius * sx;
                int y1 = y0 + minRadius * sy;

                for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
                {
                    const int x2 = x1 >> SHIFT;
                    const int y2 = y1 >> SHIFT;

                    if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
                        break;

                    ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
                }

                sx = -sx;
                sy = -sy;
            }
        }

        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
        {
            const dim3 block(256);
            const dim3 grid(divUp(count, block.x));

            cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );

            circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // buildCentersList

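        // Non-maximum suppression over the centers accumulator: a cell survives
        // if it exceeds the threshold and beats its 4-neighbourhood (ties again
        // broken by > versus >=); surviving centers are packed into a list
        // through g_counter.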
        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < accum.cols - 2 && y < accum.rows - 2)
            {
                const int top = accum(y, x + 1);

                const int left = accum(y + 1, x);
                const int cur = accum(y + 1, x + 1);
                const int right = accum(y + 1, x + 2);

                const int bottom = accum(y + 2, x + 1);

                if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
                {
                    const unsigned int val = (y << 16) | x;
                    const int idx = ::atomicAdd(&g_counter, 1);
                    centers[idx] = val;
                }
            }
        }

        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );

            buildCentersList<<<grid, block>>>(accum, centers, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumRadius

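        // One block per candidate center: threads build a radius histogram in
        // shared memory over all edge points, then every radius bin that is a
        // local maximum above the vote threshold is emitted as a circle.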
        __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
                                           float3* circles, const int maxCircles, const float dp,
                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
                smem[i] = 0;
            __syncthreads();

            unsigned int val = centers[blockIdx.x];

            float cx = (val & 0xFFFF);
            float cy = (val >> 16) & 0xFFFF;

            cx = (cx + 0.5f) * dp;
            cy = (cy + 0.5f) * dp;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
                if (rad >= minRadius && rad <= maxRadius)
                {
                    const int r = __float2int_rn(rad - minRadius);

                    Emulation::smem::atomicAdd(&smem[r + 1], 1);
                }
            }

            __syncthreads();

            for (int i = threadIdx.x; i < histSize; i += blockDim.x)
            {
                const int curVotes = smem[i + 1];

                if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
                {
                    const int ind = ::atomicAdd(&g_counter, 1);
                    if (ind < maxCircles)
                        circles[ind] = make_float3(cx, cy, i + minRadius);
                }
            }
        }

        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(centersCount);

            const int histSize = maxRadius - minRadius + 1;
            size_t smemSize = (histSize + 2) * sizeof(int);

            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxCircles);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // Generalized Hough

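        // Ballard's generalized Hough transform: edge points of the template are
        // stored in an R-table indexed by quantized gradient orientation; at
        // detection time each image edge point looks up the row that matches its
        // own orientation and votes for candidate reference-point positions.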
        template <typename T, int PIXELS_PER_THREAD>
        __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy, unsigned int* coordList, float* thetaList)
        {
            __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
            __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_sizes[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_sizes[threadIdx.y] = 0;
            __syncthreads();

            if (y < edges.rows)
            {
                // fill the queue
                const uchar* edgesRow = edges.ptr(y);
                const T* dxRow = dx.ptr(y);
                const T* dyRow = dy.ptr(y);

                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
                {
                    const T dxVal = dxRow[xx];
                    const T dyVal = dyRow[xx];

                    if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
                    {
                        const unsigned int coord = (y << 16) | xx;

                        float theta = ::atan2f(dyVal, dxVal);
                        if (theta < 0)
                            theta += 2.0f * CV_PI_F;

                        const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);

                        s_coordLists[threadIdx.y][qidx] = coord;
                        s_thetaLists[threadIdx.y][qidx] = theta;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_sizes[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_sizes[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
            {
                coordList[gidx] = s_coordLists[threadIdx.y][i];
                thetaList[gidx] = s_thetaLists[threadIdx.y][i];
            }
        }

        template <typename T>
        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
        {
            const int PIXELS_PER_THREAD = 8;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);

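        // R-table row n holds the displacements from template edge points whose
        // quantized gradient orientation is n to the template center; r_sizes[n]
        // counts the entries written to row n.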
        __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                    PtrStep<short2> r_table, int* r_sizes, int maxSize,
                                    const short2 templCenter, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const int ind = ::atomicAdd(r_sizes + n, 1);
            if (ind < maxSize)
                r_table(n, ind) = p - templCenter;
        }

        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                             PtrStepSz<short2> r_table, int* r_sizes,
                             short2 templCenter, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float thetaScale = levels / (2.0f * CV_PI_F);

            buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_Pos

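        // Position-only detection: each edge point subtracts every R-table
        // displacement for its orientation bin and votes in a 2D position
        // histogram that carries a one-cell guard border on each side.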
        __global__ void GHT_Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                 const PtrStep<short2> r_table, const int* r_sizes,
                                                 PtrStepSzi hist,
                                                 const float idp, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                short2 c = p - r_row[j];

                c.x = __float2int_rn(c.x * idp);
                c.y = __float2int_rn(c.y * idp);

                if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
                    ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
            }
        }

        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                          PtrStepSz<short2> r_table, const int* r_sizes,
                                          PtrStepSzi hist,
                                          float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            const int curVotes = hist(y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);

                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
                    votes[ind] = make_int3(curVotes, 0, 0);
                }
            }
        }

        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosScale

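        // Position + scale: the histogram is a stack of scaleRange 2D slabs, each
        // (rows + 2) lines tall with guard borders; slab s + 1 collects the votes
        // cast with the R-table displacement scaled by minScale + s * scaleStep.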
        __global__ void GHT_Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
                                                      PtrStep<short2> r_table, const int* r_sizes,
                                                      PtrStepi hist, const int rows, const int cols,
                                                      const float minScale, const float scaleStep, const int scaleRange,
                                                      const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[blockIdx.x];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                const float2 d = saturate_cast<float2>(r_row[j]);

                for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
                {
                    const float scale = minScale + s * scaleStep;

                    float2 c = p - scale * d;

                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((s + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                               PtrStepSz<short2> r_table, const int* r_sizes,
                                               PtrStepi hist, int rows, int cols,
                                               float minScale, float scaleStep, int scaleRange,
                                               float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosScale_calcHist<<<grid, block>>>(coordList, thetaList,
                                                           r_table, r_sizes,
                                                           hist, rows, cols,
                                                           minScale, scaleStep, scaleRange,
                                                           idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
                                                           float4* out, int3* votes, const int maxSize,
                                                           const float minScale, const float scaleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int s = 0; s < scaleRange; ++s)
            {
                const float scale = minScale + s * scaleStep;

                const int prevScaleIdx = (s) * (rows + 2);
                const int curScaleIdx = (s + 1) * (rows + 2);
                const int nextScaleIdx = (s + 2) * (rows + 2);

                const int curVotes = hist(curScaleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curScaleIdx + y + 1, x) &&
                    curVotes >= hist(curScaleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curScaleIdx + y, x + 1) &&
                    curVotes >= hist(curScaleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevScaleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextScaleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);

                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, scale, 0.0f);
                        votes[ind] = make_int3(curVotes, curVotes, 0);
                    }
                }
            }
        }

        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
                                                   float minScale, float scaleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosScale_findPosInHist<<<grid, block>>>(hist, rows, cols, scaleRange, out, votes, maxSize, minScale, scaleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosRotation

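        // Position + rotation: for each candidate angle the edge orientation is
        // shifted back by that angle to select the R-table row, and the stored
        // displacement is rotated by the angle before voting, again into one
        // 2D slab per angle step.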
        __global__ void GHT_Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
                                                         PtrStep<short2> r_table, const int* r_sizes,
                                                         PtrStepi hist, const int rows, const int cols,
                                                         const float minAngle, const float angleStep, const int angleRange,
                                                         const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float thetaVal = thetaList[blockIdx.x];

            for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
            {
                const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f);
                float sinA, cosA;
                sincosf(angle, &sinA, &cosA);

                float theta = thetaVal - angle;
                if (theta < 0)
                    theta += 2.0f * CV_PI_F;

                const int n = __float2int_rn(theta * thetaScale);

                const short2* r_row = r_table.ptr(n);
                const int r_row_size = r_sizes[n];

                for (int j = 0; j < r_row_size; ++j)
                {
                    const float2 d = saturate_cast<float2>(r_row[j]);

                    const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);

                    float2 c = make_float2(p.x - dr.x, p.y - dr.y);
                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                  PtrStepSz<short2> r_table, const int* r_sizes,
                                                  PtrStepi hist, int rows, int cols,
                                                  float minAngle, float angleStep, int angleRange,
                                                  float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosRotation_calcHist<<<grid, block>>>(coordList, thetaList,
                                                              r_table, r_sizes,
                                                              hist, rows, cols,
                                                              minAngle, angleStep, angleRange,
                                                              idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
                                                              float4* out, int3* votes, const int maxSize,
                                                              const float minAngle, const float angleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int a = 0; a < angleRange; ++a)
            {
                const float angle = minAngle + a * angleStep;

                const int prevAngleIdx = (a) * (rows + 2);
                const int curAngleIdx = (a + 1) * (rows + 2);
                const int nextAngleIdx = (a + 2) * (rows + 2);

                const int curVotes = hist(curAngleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curAngleIdx + y + 1, x) &&
                    curVotes >= hist(curAngleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curAngleIdx + y, x + 1) &&
                    curVotes >= hist(curAngleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevAngleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextAngleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);

                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, 1.0f, angle);
                        votes[ind] = make_int3(curVotes, 0, curVotes);
                    }
                }
            }
        }

        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
                                                      float minAngle, float angleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosRotation_findPosInHist<<<grid, block>>>(hist, rows, cols, angleRange, out, votes, maxSize, minAngle, angleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full

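        // The full variant (position, scale and rotation), following Guil et al.:
        // edge points are paired, and the pair features (orientation difference,
        // point-pair distance, radii to the reference point) allow rotation and
        // scale to be estimated before positions are voted on. The feature tables
        // for template and image are described by pointers kept in constant memory.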
        struct FeatureTable
        {
            uchar* p1_pos_data;
            size_t p1_pos_step;

            uchar* p1_theta_data;
            size_t p1_theta_step;

            uchar* p2_pos_data;
            size_t p2_pos_step;

            uchar* d12_data;
            size_t d12_step;

            uchar* r1_data;
            size_t r1_step;

            uchar* r2_data;
            size_t r2_step;
        };

        __constant__ FeatureTable c_templFeatures;
        __constant__ FeatureTable c_imageFeatures;

        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
        }
        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
        }

        struct TemplFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
            }
        };
        struct ImageFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
            }
        };

        __device__ float clampAngle(float a)
        {
            float res = a;

            while (res > 2.0f * CV_PI_F)
                res -= 2.0f * CV_PI_F;
            while (res < 0.0f)
                res += 2.0f * CV_PI_F;

            return res;
        }

        __device__ bool angleEq(float a, float b, float eps)
        {
            return (::fabs(clampAngle(a - b)) <= eps);
        }

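        // One block per first point p1: every point p2 whose orientation differs
        // from p1's by xi forms a pair. Pairs are binned by alpha12, the angle of
        // the segment p1->p2 relative to p1's gradient direction; template pairs
        // store the radii r1, r2 to the template center, image pairs store their
        // absolute positions instead.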
        template <class FT, bool isTempl>
        __global__ void GHT_Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                       int* sizes, const int maxSize,
                                                       const float xi, const float angleEpsilon, const float alphaScale,
                                                       const float2 center, const float maxDist)
        {
            const float p1_theta = thetaList[blockIdx.x];
            const unsigned int coord1 = coordList[blockIdx.x];
            float2 p1_pos;
            p1_pos.x = (coord1 & 0xFFFF);
            p1_pos.y = (coord1 >> 16) & 0xFFFF;

            for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
            {
                const float p2_theta = thetaList[i];
                const unsigned int coord2 = coordList[i];
                float2 p2_pos;
                p2_pos.x = (coord2 & 0xFFFF);
                p2_pos.y = (coord2 >> 16) & 0xFFFF;

                if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
                {
                    const float2 d = p1_pos - p2_pos;

                    float alpha12 = clampAngle(::atan2f(d.y, d.x) - p1_theta);
                    float d12 = ::sqrtf(d.x * d.x + d.y * d.y);

                    if (d12 > maxDist)
                        continue;

                    float2 r1 = p1_pos - center;
                    float2 r2 = p2_pos - center;

                    const int n = __float2int_rn(alpha12 * alphaScale);

                    const int ind = ::atomicAdd(sizes + n, 1);

                    if (ind < maxSize)
                    {
                        if (!isTempl)
                        {
                            FT::p1_pos(n)[ind] = p1_pos;
                            FT::p2_pos(n)[ind] = p2_pos;
                        }

                        FT::p1_theta(n)[ind] = p1_theta;

                        FT::d12(n)[ind] = d12;

                        if (isTempl)
                        {
                            FT::r1(n)[ind] = r1;
                            FT::r2(n)[ind] = r2;
                        }
                    }
                }
            }
        }

        template <class FT, bool isTempl>
        void GHT_Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                   int* sizes, int maxSize,
                                                   float xi, float angleEpsilon, int levels,
                                                   float2 center, float maxDist)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float alphaScale = levels / (2.0f * CV_PI_F);

            GHT_Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
                                                                         sizes, maxSize,
                                                                         xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
                                                                         center, maxDist);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            thrust::device_ptr<int> sizesPtr(sizes);
            thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
        }

        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
                                                                           sizes, maxSize,
                                                                           xi, angleEpsilon, levels,
                                                                           center, maxDist);
        }
        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
                                                                            sizes, maxSize,
                                                                            xi, angleEpsilon, levels,
                                                                            center, maxDist);
        }

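        // Orientation histogram: for every feature level, the differences between
        // image and template p1 orientations vote for the global rotation angle;
        // per-block partial histograms are built in shared memory and merged into
        // OHist with global atomics.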
        __global__ void GHT_Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
                                                const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
        {
            extern __shared__ int s_OHist[];
            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
                s_OHist[i] = 0;
            __syncthreads();

            // one block per (template feature, level) pair
            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];

                    const float angle = clampAngle(im_p1_theta - t_p1_theta);

                    if (angle >= minAngle && angle <= maxAngle)
                    {
                        const int n = __float2int_rn((angle - minAngle) * iAngleStep);
                        Emulation::smem::atomicAdd(&s_OHist[n], 1);
                    }
                }
            }
            __syncthreads();

            // merge the per-block histogram into the global one
            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
                ::atomicAdd(OHist + i, s_OHist[i]);
        }

        void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
                                         float minAngle, float maxAngle, float angleStep, int angleRange,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            minAngle *= (CV_PI_F / 180.0f);
            maxAngle *= (CV_PI_F / 180.0f);
            angleStep *= (CV_PI_F / 180.0f);

            // shared-memory histogram of (angleRange + 1) int bins
            const size_t smemSize = (angleRange + 1) * sizeof(int);

            GHT_Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
                                                               minAngle, maxAngle, 1.0f / angleStep, angleRange);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

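        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_calcSHist
        //
        // Second voting stage: with the orientation fixed, feature pairs whose
        // p1 orientations agree (up to angleEpsilon) vote for the ratio of
        // their p1-p2 distances, i.e. the candidate scale.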
        __global__ void GHT_Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
                                                const float angle, const float angleEpsilon,
                                                const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
        {
            extern __shared__ int s_SHist[];
            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
                s_SHist[i] = 0;
            __syncthreads();

            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                // rotate the template orientation by the candidate angle;
                // d12 is a distance and must not be offset by the angle
                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
                const float t_d12 = TemplFeatureTable::d12(level)[tIdx];

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
                    const float im_d12 = ImageFeatureTable::d12(level)[i];

                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                    {
                        const float scale = im_d12 / t_d12;

                        if (scale >= minScale && scale <= maxScale)
                        {
                            const int s = __float2int_rn((scale - minScale) * iScaleStep);
                            Emulation::smem::atomicAdd(&s_SHist[s], 1);
                        }
                    }
                }
            }
            __syncthreads();

            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
                ::atomicAdd(SHist + i, s_SHist[i]);
        }

        void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
                                         float angle, float angleEpsilon,
                                         float minScale, float maxScale, float iScaleStep, int scaleRange,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            angle *= (CV_PI_F / 180.0f);
            angleEpsilon *= (CV_PI_F / 180.0f);

            // shared-memory histogram of (scaleRange + 1) int bins
            const size_t smemSize = (scaleRange + 1) * sizeof(int);

            GHT_Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
                                                               angle, angleEpsilon,
                                                               minScale, maxScale, iScaleStep, scaleRange);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

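        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_calcPHist
        //
        // Third voting stage: with orientation and scale fixed, each matching
        // feature pair reconstructs the reference-point position from both p1
        // and p2; if the two estimates agree, a vote is cast into the 2D
        // position histogram PHist.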
        __global__ void GHT_Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                                const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
                                                const float idp)
        {
            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;

                float2 r1 = TemplFeatureTable::r1(level)[tIdx];
                float2 r2 = TemplFeatureTable::r2(level)[tIdx];

                // scale and rotate the center-relative template vectors
                r1 = r1 * scale;
                r2 = r2 * scale;

                r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
                r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];

                    const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
                    const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];

                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                    {
                        // estimate the reference point from both feature points
                        float2 c1, c2;

                        c1 = im_p1_pos - r1;
                        c1 = c1 * idp;

                        c2 = im_p2_pos - r2;
                        c2 = c2 * idp;

                        // discard pairs whose two estimates disagree by more than one cell
                        if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
                            continue;

                        // PHist keeps a one-cell border, hence the +1 offsets
                        if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
                            ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
                    }
                }
            }
        }

        void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                         float angle, float angleEpsilon, float scale,
                                         float dp,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            angle *= (CV_PI_F / 180.0f);
            angleEpsilon *= (CV_PI_F / 180.0f);

            const float sinVal = ::sinf(angle);
            const float cosVal = ::cosf(angle);

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_calcPHist, cudaFuncCachePreferL1) );

            GHT_Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
                                                     angle, sinVal, cosVal, angleEpsilon, scale,
                                                     1.0f / dp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

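        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_findPosInHist
        //
        // Final stage: non-maximum suppression over the position histogram.
        // Cells whose vote count exceeds the threshold and is a local maximum
        // of their 4-neighbourhood are emitted as (x, y, scale, angle)
        // candidates together with their position/scale/angle vote counts.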
        __global__ void GHT_Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
                                                    const float angle, const int angleVotes, const float scale, const int scaleVotes,
                                                    const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            // the histogram has a one-cell border, hence the +1 offsets
            const int curVotes = hist(y + 1, x + 1);

            // 4-neighbourhood non-maximum suppression (strict on one side,
            // non-strict on the other, to break ties between equal neighbours)
            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);

                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, scale, angle);
                    votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
                }
            }
        }

        int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
                                            float angle, int angleVotes, float scale, int scaleVotes,
                                            float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            // start counting from curSize so that results of successive
            // (scale, angle) sweeps are appended to the same output buffers
            cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
                                                         angle, angleVotes, scale, scaleVotes,
                                                         dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }
    }
}}}