// modules/gpu/src/cuda/hough.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/dynamic_smem.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace hough
    {
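        // Device-global counter used by the kernels below to reserve slots in
        // compacted output lists: each host wrapper zeroes it with cudaMemset
        // before the launch and copies the final count back afterwards.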
        __device__ int g_counter;

        ////////////////////////////////////////////////////////////////////////
        // buildPointList

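        // Compacts the non-zero pixels of src into a flat list of coordinates
        // packed as (y << 16) | x. Each block row first gathers its hits into
        // a per-row shared-memory queue, a single thread then reserves space
        // for the whole block with one atomicAdd on g_counter, and the queues
        // are copied out to the global list cooperatively.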
        template <int PIXELS_PER_THREAD>
        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
        {
            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_qsize[threadIdx.y] = 0;
            __syncthreads();

            if (y < src.rows)
            {
                // fill the queue
                const uchar* srcRow = src.ptr(y);
                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
                {
                    if (srcRow[xx])
                    {
                        const unsigned int val = (y << 16) | xx;
                        const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
                        s_queues[threadIdx.y][qidx] = val;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_qsize[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_qsize[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
                list[gidx] = s_queues[threadIdx.y][i];
        }

        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
        {
            const int PIXELS_PER_THREAD = 16;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // linesAccum

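        // Line accumulation. A line is parameterized as
        // rho = x * cos(theta) + y * sin(theta); one block handles one theta
        // (one accumulator row), its threads iterate over the point list and
        // vote into rho bins. rho is pre-scaled by irho = 1 / rho and shifted
        // so that negative values map to valid bin indices.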
        __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                ::atomicAdd(accumRow + r + 1, 1);
            }
        }

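        // Same accumulation as above, but votes are first collected in a
        // per-block histogram in dynamic shared memory and written out to the
        // global accumulator row once at the end, avoiding contended atomics
        // in global memory.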
        __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                smem[i] = 0;

            __syncthreads();

            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }

            __syncthreads();

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                accumRow[i] = smem[i];
        }

        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
        {
            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(accum.rows - 2);

            size_t smemSize = (accum.cols - 1) * sizeof(int);

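            // Use the shared-memory kernel only when the rho histogram fits
            // into shared memory with some headroom; otherwise vote directly
            // in global memory.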
            if (smemSize < sharedMemPerBlock - 1000)
                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
            else
                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // linesGetResult

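        // Extracts local maxima from the (theta, rho) accumulator: a cell is
        // kept if its vote count exceeds the threshold and it is a maximum
        // within its 4-neighbourhood (strict comparison against two of the
        // neighbours breaks ties between equal cells). Winners are appended
        // to out/votes through g_counter.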
        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
        {
            const int r = blockIdx.x * blockDim.x + threadIdx.x;
            const int n = blockIdx.y * blockDim.y + threadIdx.y;

            if (r >= accum.cols - 2 || n >= accum.rows - 2)
                return;

            const int curVotes = accum(n + 1, r + 1);

            if (curVotes > threshold &&
                curVotes >  accum(n + 1, r) &&
                curVotes >= accum(n + 1, r + 2) &&
                curVotes >  accum(n, r + 1) &&
                curVotes >= accum(n + 2, r + 1))
            {
                const float radius = (r - (numrho - 1) * 0.5f) * rho;
                const float angle = n * theta;

                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float2(radius, angle);
                    votes[ind] = curVotes;
                }
            }
        }

        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );

            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            if (doSort && totalCount > 0)
            {
                thrust::device_ptr<float2> outPtr(out);
                thrust::device_ptr<int> votesPtr(votes);
                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
            }

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumCenters

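        // Circle center voting: every edge point casts votes along its
        // gradient direction, for radii in [minRadius, maxRadius], in both
        // orientations. Positions are stepped in fixed-point arithmetic with
        // SHIFT = 10 fractional bits to avoid per-step float rounding.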
        __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
                                            PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
        {
            const int SHIFT = 10;
            const int ONE = 1 << SHIFT;

            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= count)
                return;

            const unsigned int val = list[tid];

            const int x = (val & 0xFFFF);
            const int y = (val >> 16) & 0xFFFF;

            const int vx = dx(y, x);
            const int vy = dy(y, x);

            if (vx == 0 && vy == 0)
                return;

            const float mag = ::sqrtf(vx * vx + vy * vy);

            const int x0 = __float2int_rn((x * idp) * ONE);
            const int y0 = __float2int_rn((y * idp) * ONE);

            int sx = __float2int_rn((vx * idp) * ONE / mag);
            int sy = __float2int_rn((vy * idp) * ONE / mag);

            // Step from minRadius to maxRadius in both directions of the gradient
            for (int k1 = 0; k1 < 2; ++k1)
            {
                int x1 = x0 + minRadius * sx;
                int y1 = y0 + minRadius * sy;

                for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
                {
                    const int x2 = x1 >> SHIFT;
                    const int y2 = y1 >> SHIFT;

                    if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
                        break;

                    ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
                }

                sx = -sx;
                sy = -sy;
            }
        }

        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
        {
            const dim3 block(256);
            const dim3 grid(divUp(count, block.x));

            cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );

            circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // buildCentersList

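        // Collects candidate circle centers: 2D non-maximum suppression over
        // the center accumulator (4-neighbourhood plus threshold), with the
        // winners packed as (y << 16) | x, like the point list.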
        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < accum.cols - 2 && y < accum.rows - 2)
            {
                const int top = accum(y, x + 1);

                const int left = accum(y + 1, x);
                const int cur = accum(y + 1, x + 1);
                const int right = accum(y + 1, x + 2);

                const int bottom = accum(y + 2, x + 1);

                if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
                {
                    const unsigned int val = (y << 16) | x;
                    const int idx = ::atomicAdd(&g_counter, 1);
                    centers[idx] = val;
                }
            }
        }

        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );

            buildCentersList<<<grid, block>>>(accum, centers, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumRadius

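        // For each candidate center (one block per center), builds a
        // shared-memory histogram over the distances from the center to all
        // edge points, then emits (cx, cy, radius) triples for histogram bins
        // that pass the threshold and are local maxima.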
        __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
                                           float3* circles, const int maxCircles, const float dp,
                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
                smem[i] = 0;
            __syncthreads();

            unsigned int val = centers[blockIdx.x];

            float cx = (val & 0xFFFF);
            float cy = (val >> 16) & 0xFFFF;

            cx = (cx + 0.5f) * dp;
            cy = (cy + 0.5f) * dp;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
                if (rad >= minRadius && rad <= maxRadius)
                {
                    const int r = __float2int_rn(rad - minRadius);

                    Emulation::smem::atomicAdd(&smem[r + 1], 1);
                }
            }

            __syncthreads();

            for (int i = threadIdx.x; i < histSize; i += blockDim.x)
            {
                const int curVotes = smem[i + 1];

                if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
                {
                    const int ind = ::atomicAdd(&g_counter, 1);
                    if (ind < maxCircles)
                        circles[ind] = make_float3(cx, cy, i + minRadius);
                }
            }
        }

        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(centersCount);

            const int histSize = maxRadius - minRadius + 1;
            size_t smemSize = (histSize + 2) * sizeof(int);

            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxCircles);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // Generalized Hough

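        // Generalized Hough transform (Ballard and Guil variants). Edge
        // points are extracted together with their gradient orientation
        // theta = atan2(dy, dx), normalized to [0, 2*pi); the Ballard R-table
        // maps quantized theta to displacements from the template center.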
        template <typename T, int PIXELS_PER_THREAD>
        __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy, unsigned int* coordList, float* thetaList)
        {
            __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
            __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_sizes[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_sizes[threadIdx.y] = 0;
            __syncthreads();

            if (y < edges.rows)
            {
                // fill the queue
                const uchar* edgesRow = edges.ptr(y);
                const T* dxRow = dx.ptr(y);
                const T* dyRow = dy.ptr(y);

                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
                {
                    const T dxVal = dxRow[xx];
                    const T dyVal = dyRow[xx];

                    if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
                    {
                        const unsigned int coord = (y << 16) | xx;

                        float theta = ::atan2f(dyVal, dxVal);
                        if (theta < 0)
                            theta += 2.0f * CV_PI_F;

                        const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);

                        s_coordLists[threadIdx.y][qidx] = coord;
                        s_thetaLists[threadIdx.y][qidx] = theta;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_sizes[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_sizes[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
            {
                coordList[gidx] = s_coordLists[threadIdx.y][i];
                thetaList[gidx] = s_thetaLists[threadIdx.y][i];
            }
        }

        template <typename T>
        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
        {
            const int PIXELS_PER_THREAD = 8;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);

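        // Builds the R-table: for each template edge point, the displacement
        // p - templCenter is appended to the row indexed by its quantized
        // gradient orientation n = round(theta * levels / (2*pi)).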
        __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                    PtrStep<short2> r_table, int* r_sizes, int maxSize,
                                    const short2 templCenter, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const int ind = ::atomicAdd(r_sizes + n, 1);
            if (ind < maxSize)
                r_table(n, ind) = p - templCenter;
        }

        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                             PtrStepSz<short2> r_table, int* r_sizes,
                             short2 templCenter, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float thetaScale = levels / (2.0f * CV_PI_F);

            buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_Pos

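        // Ballard GHT, position only: each image edge point looks up the
        // R-table row for its orientation and votes for candidate centers
        // c = p - r at accumulator resolution dp. Peaks are then found by the
        // same 4-neighbour non-maximum suppression as for lines.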
        __global__ void GHT_Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                 const PtrStep<short2> r_table, const int* r_sizes,
                                                 PtrStepSzi hist,
                                                 const float idp, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                short2 c = p - r_row[j];

                c.x = __float2int_rn(c.x * idp);
                c.y = __float2int_rn(c.y * idp);

                if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
                    ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
            }
        }

        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                          PtrStepSz<short2> r_table, const int* r_sizes,
                                          PtrStepSzi hist,
                                          float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            const int curVotes = hist(y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);

                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
                    votes[ind] = make_int3(curVotes, 0, 0);
                }
            }
        }

        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosScale

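        // Ballard GHT, position + scale: the accumulator is a stack of
        // scaleRange + 2 position histograms, each (rows + 2) x (cols + 2),
        // addressed as hist.ptr((s + 1) * (rows + 2) + y + 1). Votes go to
        // c = p - scale * d for every scale in [minScale, minScale +
        // scaleRange * scaleStep).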
        __global__ void GHT_Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
                                                      PtrStep<short2> r_table, const int* r_sizes,
                                                      PtrStepi hist, const int rows, const int cols,
                                                      const float minScale, const float scaleStep, const int scaleRange,
                                                      const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[blockIdx.x];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                const float2 d = saturate_cast<float2>(r_row[j]);

                for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
                {
                    const float scale = minScale + s * scaleStep;

                    float2 c = p - scale * d;

                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((s + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                               PtrStepSz<short2> r_table, const int* r_sizes,
                                               PtrStepi hist, int rows, int cols,
                                               float minScale, float scaleStep, int scaleRange,
                                               float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosScale_calcHist<<<grid, block>>>(coordList, thetaList,
                                                           r_table, r_sizes,
                                                           hist, rows, cols,
                                                           minScale, scaleStep, scaleRange,
                                                           idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
                                                           float4* out, int3* votes, const int maxSize,
                                                           const float minScale, const float scaleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int s = 0; s < scaleRange; ++s)
            {
                const float scale = minScale + s * scaleStep;

                const int prevScaleIdx = (s) * (rows + 2);
                const int curScaleIdx = (s + 1) * (rows + 2);
                const int nextScaleIdx = (s + 2) * (rows + 2);

                const int curVotes = hist(curScaleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curScaleIdx + y + 1, x) &&
                    curVotes >= hist(curScaleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curScaleIdx + y, x + 1) &&
                    curVotes >= hist(curScaleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevScaleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextScaleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);

                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, scale, 0.0f);
                        votes[ind] = make_int3(curVotes, curVotes, 0);
                    }
                }
            }
        }

        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
                                                   float minScale, float scaleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosScale_findPosInHist<<<grid, block>>>(hist, rows, cols, scaleRange, out, votes, maxSize, minScale, scaleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosRotation

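        // Ballard GHT, position + rotation: analogous to the scale variant,
        // but each slice of the histogram stack corresponds to a candidate
        // rotation. The R-table displacement is rotated by the candidate
        // angle before voting, and the table row is selected from the edge
        // orientation minus that angle.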
        __global__ void GHT_Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
                                                         PtrStep<short2> r_table, const int* r_sizes,
                                                         PtrStepi hist, const int rows, const int cols,
                                                         const float minAngle, const float angleStep, const int angleRange,
                                                         const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float thetaVal = thetaList[blockIdx.x];

            for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
            {
                const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f);
                float sinA, cosA;
                sincosf(angle, &sinA, &cosA);

                float theta = thetaVal - angle;
                if (theta < 0)
                    theta += 2.0f * CV_PI_F;

                const int n = __float2int_rn(theta * thetaScale);

                const short2* r_row = r_table.ptr(n);
                const int r_row_size = r_sizes[n];

                for (int j = 0; j < r_row_size; ++j)
                {
                    const float2 d = saturate_cast<float2>(r_row[j]);

                    const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);

                    float2 c = make_float2(p.x - dr.x, p.y - dr.y);
                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                  PtrStepSz<short2> r_table, const int* r_sizes,
                                                  PtrStepi hist, int rows, int cols,
                                                  float minAngle, float angleStep, int angleRange,
                                                  float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosRotation_calcHist<<<grid, block>>>(coordList, thetaList,
                                                              r_table, r_sizes,
                                                              hist, rows, cols,
                                                              minAngle, angleStep, angleRange,
                                                              idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
                                                              float4* out, int3* votes, const int maxSize,
                                                              const float minAngle, const float angleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int a = 0; a < angleRange; ++a)
            {
                const float angle = minAngle + a * angleStep;

                const int prevAngleIdx = (a) * (rows + 2);
                const int curAngleIdx = (a + 1) * (rows + 2);
                const int nextAngleIdx = (a + 2) * (rows + 2);

                const int curVotes = hist(curAngleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curAngleIdx + y + 1, x) &&
                    curVotes >= hist(curAngleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curAngleIdx + y, x + 1) &&
                    curVotes >= hist(curAngleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevAngleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextAngleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);

                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, 1.0f, angle);
                        votes[ind] = make_int3(curVotes, 0, curVotes);
                    }
                }
            }
        }

        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
                                                      float minAngle, float angleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosRotation_findPosInHist<<<grid, block>>>(hist, rows, cols, angleRange, out, votes, maxSize, minAngle, angleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full

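        // Guil full GHT (position + rotation + scale). Features are pairs of
        // edge points whose orientation difference matches xi; per-pair
        // attributes (positions, orientation, distance d12, displacements r1
        // and r2 from the center) are stored in tables held in constant
        // memory, indexed by the quantized pair angle alpha12. Rotation is
        // then estimated from an orientation-difference histogram (calcOHist)
        // and scale from a distance-ratio histogram (calcSHist).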
        struct FeatureTable
        {
            uchar* p1_pos_data;
            size_t p1_pos_step;

            uchar* p1_theta_data;
            size_t p1_theta_step;

            uchar* p2_pos_data;
            size_t p2_pos_step;

            uchar* d12_data;
            size_t d12_step;

            uchar* r1_data;
            size_t r1_step;

            uchar* r2_data;
            size_t r2_step;
        };

        __constant__ FeatureTable c_templFeatures;
        __constant__ FeatureTable c_imageFeatures;

        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
        }
        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
        }

        struct TemplFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
            }
        };
        struct ImageFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
            }
        };

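        // Angle helpers: clampAngle wraps a value into [0, 2*pi); angleEq
        // treats a and b as equal when the wrapped difference
        // clampAngle(a - b) is at most eps.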
        __device__ float clampAngle(float a)
        {
            float res = a;

            while (res > 2.0f * CV_PI_F)
                res -= 2.0f * CV_PI_F;
            while (res < 0.0f)
                res += 2.0f * CV_PI_F;

            return res;
        }

        __device__ bool angleEq(float a, float b, float eps)
        {
            return (::fabs(clampAngle(a - b)) <= eps);
        }

        template <class FT, bool isTempl>
        __global__ void GHT_Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                       int* sizes, const int maxSize,
                                                       const float xi, const float angleEpsilon, const float alphaScale,
                                                       const float2 center, const float maxDist)
        {
            const float p1_theta = thetaList[blockIdx.x];
            const unsigned int coord1 = coordList[blockIdx.x];
            float2 p1_pos;
            p1_pos.x = (coord1 & 0xFFFF);
            p1_pos.y = (coord1 >> 16) & 0xFFFF;

            for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
            {
                const float p2_theta = thetaList[i];
                const unsigned int coord2 = coordList[i];
                float2 p2_pos;
                p2_pos.x = (coord2 & 0xFFFF);
                p2_pos.y = (coord2 >> 16) & 0xFFFF;

                if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
                {
                    const float2 d = p1_pos - p2_pos;

                    float alpha12 = clampAngle(::atan2(d.y, d.x) - p1_theta);
                    float d12 = ::sqrtf(d.x * d.x + d.y * d.y);

                    if (d12 > maxDist)
                        continue;

                    float2 r1 = p1_pos - center;
                    float2 r2 = p2_pos - center;

                    const int n = __float2int_rn(alpha12 * alphaScale);

                    const int ind = ::atomicAdd(sizes + n, 1);

                    if (ind < maxSize)
                    {
                        if (!isTempl)
                        {
                            FT::p1_pos(n)[ind] = p1_pos;
                            FT::p2_pos(n)[ind] = p2_pos;
                        }

                        FT::p1_theta(n)[ind] = p1_theta;

                        FT::d12(n)[ind] = d12;

                        if (isTempl)
                        {
                            FT::r1(n)[ind] = r1;
                            FT::r2(n)[ind] = r2;
                        }
                    }
                }
            }
        }

        template <class FT, bool isTempl>
        void GHT_Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                   int* sizes, int maxSize,
                                                   float xi, float angleEpsilon, int levels,
                                                   float2 center, float maxDist)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float alphaScale = levels / (2.0f * CV_PI_F);

            GHT_Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
                                                                         sizes, maxSize,
                                                                         xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
                                                                         center, maxDist);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            thrust::device_ptr<int> sizesPtr(sizes);
            thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
        }

        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
                                                                           sizes, maxSize,
                                                                           xi, angleEpsilon, levels,
                                                                           center, maxDist);
        }
        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
                                                                            sizes, maxSize,
                                                                            xi, angleEpsilon, levels,
                                                                            center, maxDist);
        }

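        // Orientation histogram: for matching feature pairs, accumulates the
        // difference between image and template p1 orientations into a
        // per-block shared histogram, then merges into the global OHist; the
        // peak of OHist gives the rotation between template and image.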
1256         __global__ void GHT_Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
1257                                                 const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
1258         {
1259             extern __shared__ int s_OHist[];
1260             for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
1261                 s_OHist[i] = 0;
1262             __syncthreads();
1263
1264             const int tIdx = blockIdx.x;
1265             const int level = blockIdx.y;
1266
1267             const int tSize = templSizes[level];
1268
1269             if (tIdx < tSize)
1270             {
1271                 const int imSize = imageSizes[level];
1272
1273                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];
1274
1275                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
1276                 {
1277                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
1278
1279                     const float angle = clampAngle(im_p1_theta - t_p1_theta);
1280
1281                     if (angle >= minAngle && angle <= maxAngle)
1282                     {
1283                         const int n = __float2int_rn((angle - minAngle) * iAngleStep);
1284                         Emulation::smem::atomicAdd(&s_OHist[n], 1);
1285                     }
1286                 }
1287             }
1288             __syncthreads();
1289
1290             for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
1291                 ::atomicAdd(OHist + i, s_OHist[i]);
1292         }
1293
1294         void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
1295                                          float minAngle, float maxAngle, float angleStep, int angleRange,
1296                                          int levels, int tMaxSize)
1297         {
1298             const dim3 block(256);
1299             const dim3 grid(tMaxSize, levels + 1);
1300
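            // the host-side API works in degrees; convert to radians and invert
            // the step so the kernel can bin with a single multiply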
1301             minAngle *= (CV_PI_F / 180.0f);
1302             maxAngle *= (CV_PI_F / 180.0f);
1303             angleStep *= (CV_PI_F / 180.0f);
1304
1305             const size_t smemSize = (angleRange + 1) * sizeof(int); // s_OHist bins are int
1306
1307             GHT_Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
1308                                                                minAngle, maxAngle, 1.0f / angleStep, angleRange);
1309             cudaSafeCall( cudaGetLastError() );
1310
1311             cudaSafeCall( cudaDeviceSynchronize() );
1312         }
1313
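        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_calcSHist
        //
        // Second voting stage: with the rotation angle fixed, image features
        // whose p1 orientation matches the rotated template orientation within
        // angleEpsilon vote for the scale estimate im_d12 / t_d12, binned over
        // [minScale, maxScale] in a shared-memory histogram before being merged
        // into the global SHist.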
1314         __global__ void GHT_Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
1315                                                 const float angle, const float angleEpsilon,
1316                                                 const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
1317         {
1318             extern __shared__ int s_SHist[];
1319             for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
1320                 s_SHist[i] = 0;
1321             __syncthreads();
1322
1323             const int tIdx = blockIdx.x;
1324             const int level = blockIdx.y;
1325
1326             const int tSize = templSizes[level];
1327
1328             if (tIdx < tSize)
1329             {
1330                 const int imSize = imageSizes[level];
1331
1332                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
1333                 const float t_d12 = TemplFeatureTable::d12(level)[tIdx]; // d12 is a distance, not an angle: do not rotate it
1334
1335                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
1336                 {
1337                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
1338                     const float im_d12 = ImageFeatureTable::d12(level)[i];
1339
1340                     if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
1341                     {
1342                         const float scale = im_d12 / t_d12;
1343
1344                         if (scale >= minScale && scale <= maxScale)
1345                         {
1346                             const int s = __float2int_rn((scale - minScale) * iScaleStep);
1347                             Emulation::smem::atomicAdd(&s_SHist[s], 1);
1348                         }
1349                     }
1350                 }
1351             }
1352             __syncthreads();
1353
1354             for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
1355                 ::atomicAdd(SHist + i, s_SHist[i]);
1356         }
1357
1358         void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
1359                                          float angle, float angleEpsilon,
1360                                          float minScale, float maxScale, float iScaleStep, int scaleRange,
1361                                          int levels, int tMaxSize)
1362         {
1363             const dim3 block(256);
1364             const dim3 grid(tMaxSize, levels + 1);
1365
1366             angle *= (CV_PI_F / 180.0f);
1367             angleEpsilon *= (CV_PI_F / 180.0f);
1368
1369             const size_t smemSize = (scaleRange + 1) * sizeof(int); // s_SHist bins are int
1370
1371             GHT_Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
1372                                                                angle, angleEpsilon,
1373                                                                minScale, maxScale, iScaleStep, scaleRange);
1374             cudaSafeCall( cudaGetLastError() );
1375
1376             cudaSafeCall( cudaDeviceSynchronize() );
1377         }
1378
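        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_calcPHist
        //
        // Third voting stage: with angle and scale fixed, the template
        // displacement vectors r1 and r2 are scaled and rotated, and each
        // matching image feature proposes a reference-point candidate from both
        // feature endpoints (c1 and c2, in dp-sized accumulator cells). A vote
        // is cast only if the two candidates agree to within one cell; the
        // histogram keeps a one-cell border, hence the +1 offsets and the
        // "- 2" bounds.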
1379         __global__ void GHT_Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
1380                                                 const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
1381                                                 const float idp)
1382         {
1383             const int tIdx = blockIdx.x;
1384             const int level = blockIdx.y;
1385
1386             const int tSize = templSizes[level];
1387
1388             if (tIdx < tSize)
1389             {
1390                 const int imSize = imageSizes[level];
1391
1392                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
1393
1394                 float2 r1 = TemplFeatureTable::r1(level)[tIdx];
1395                 float2 r2 = TemplFeatureTable::r2(level)[tIdx];
1396
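                // bring the template displacement vectors into image space:
                // first scale them, then rotate them by the candidate angle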
1397                 r1 = r1 * scale;
1398                 r2 = r2 * scale;
1399
1400                 r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
1401                 r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);
1402
1403                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
1404                 {
1405                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
1406
1407                     const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
1408                     const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];
1409
1410                     if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
1411                     {
1412                         float2 c1, c2;
1413
1414                         c1 = im_p1_pos - r1;
1415                         c1 = c1 * idp;
1416
1417                         c2 = im_p2_pos - r2;
1418                         c2 = c2 * idp;
1419
1420                         if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
1421                             continue;
1422
1423                         if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
1424                             ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
1425                     }
1426                 }
1427             }
1428         }
1429
1430         void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
1431                                          float angle, float angleEpsilon, float scale,
1432                                          float dp,
1433                                          int levels, int tMaxSize)
1434         {
1435             const dim3 block(256);
1436             const dim3 grid(tMaxSize, levels + 1);
1437
1438             angle *= (CV_PI_F / 180.0f);
1439             angleEpsilon *= (CV_PI_F / 180.0f);
1440
1441             const float sinVal = ::sinf(angle);
1442             const float cosVal = ::cosf(angle);
1443
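            // this kernel uses no shared memory, so ask for the larger L1 cache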
1444             cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_calcPHist, cudaFuncCachePreferL1) );
1445
1446             GHT_Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
1447                                                      angle, sinVal, cosVal, angleEpsilon, scale,
1448                                                      1.0f / dp);
1449             cudaSafeCall( cudaGetLastError() );
1450
1451             cudaSafeCall( cudaDeviceSynchronize() );
1452         }
1453
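        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_findPosInHist
        //
        // Extracts local maxima from the bordered position histogram: a cell
        // must exceed the vote threshold and its four neighbours (strictly on
        // two sides, non-strictly on the other two, to break ties between equal
        // cells). Maxima are appended through the global counter g_counter,
        // recording the position (x * dp, y * dp), scale and angle, plus the
        // position, scale and angle vote counts.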
1454         __global__ void GHT_Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
1455                                                     const float angle, const int angleVotes, const float scale, const int scaleVotes,
1456                                                     const float dp, const int threshold)
1457         {
1458             const int x = blockIdx.x * blockDim.x + threadIdx.x;
1459             const int y = blockIdx.y * blockDim.y + threadIdx.y;
1460
1461             if (x >= hist.cols - 2 || y >= hist.rows - 2)
1462                 return;
1463
1464             const int curVotes = hist(y + 1, x + 1);
1465
1466             if (curVotes > threshold &&
1467                 curVotes >  hist(y + 1, x) &&
1468                 curVotes >= hist(y + 1, x + 2) &&
1469                 curVotes >  hist(y, x + 1) &&
1470                 curVotes >= hist(y + 2, x + 1))
1471             {
1472                 const int ind = ::atomicAdd(&g_counter, 1);
1473
1474                 if (ind < maxSize)
1475                 {
1476                     out[ind] = make_float4(x * dp, y * dp, scale, angle);
1477                     votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
1478                 }
1479             }
1480         }
1481
1482         int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
1483                                              float angle, int angleVotes, float scale, int scaleVotes,
1484                                              float dp, int threshold)
1485         {
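            // seed the global counter with curSize so newly found maxima are
            // appended after the candidates already stored in out/votes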
1486             void* counterPtr;
1487             cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
1488
1489             cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );
1490
1491             const dim3 block(32, 8);
1492             const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
1493
1494             cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_findPosInHist, cudaFuncCachePreferL1) );
1495
1496             GHT_Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
1497                                                          angle, angleVotes, scale, scaleVotes,
1498                                                          dp, threshold);
1499             cudaSafeCall( cudaGetLastError() );
1500
1501             cudaSafeCall( cudaDeviceSynchronize() );
1502
1503             int totalCount;
1504             cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
1505
1506             totalCount = ::min(totalCount, maxSize);
1507
1508             return totalCount;
1509         }
1510     }
1511 }}}
1512
1513
1514 #endif /* CUDA_DISABLER */