added Generalized Hough implementation
opencv.git / modules/gpu/src/cuda/hough.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/dynamic_smem.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace hough
    {
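        // Kernels that emit a variable number of items share this device-global
        // counter: the host zeroes it with cudaMemset before a launch, kernels
        // reserve output slots with atomicAdd, and the host copies the final
        // value back to learn how many items were written.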
        __device__ int g_counter;

        ////////////////////////////////////////////////////////////////////////
        // buildPointList

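        // Each row of the block (one warp) first compacts its non-zero pixels
        // into a per-row shared-memory queue. One thread then reserves space for
        // the whole block in the global list with a single atomicAdd on g_counter,
        // and the queues are flushed to that range. This costs one global atomic
        // per block row instead of one per pixel. Points are packed as (y << 16) | x.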
        template <int PIXELS_PER_THREAD>
        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
        {
            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_qsize[threadIdx.y] = 0;
            __syncthreads();

            if (y < src.rows)
            {
                // fill the queue
                const uchar* srcRow = src.ptr(y);
                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
                {
                    if (srcRow[xx])
                    {
                        const unsigned int val = (y << 16) | xx;
                        const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
                        s_queues[threadIdx.y][qidx] = val;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_qsize[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_qsize[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
                list[gidx] = s_queues[threadIdx.y][i];
        }

        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
        {
            const int PIXELS_PER_THREAD = 16;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // linesAccum

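        // Two voting strategies: linesAccumShared accumulates one theta row of
        // the rho histogram in shared memory and writes it out once, while
        // linesAccumGlobal votes directly in global memory. The host side picks
        // the shared variant whenever the row fits (with some slack) into the
        // available shared memory.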
        __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                ::atomicAdd(accumRow + r + 1, 1);
            }
        }

        __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                smem[i] = 0;

            __syncthreads();

            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }

            __syncthreads();

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                accumRow[i] = smem[i];
        }

        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
        {
            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(accum.rows - 2);

            size_t smemSize = (accum.cols - 1) * sizeof(int);

            if (smemSize < sharedMemPerBlock - 1000)
                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
            else
                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // linesGetResult

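        // Non-maximum suppression over the (rho, theta) accumulator: a bin is
        // kept only if it beats its four neighbours; the mixed > / >= comparisons
        // break ties on plateaus so that exactly one of two equal neighbours
        // survives.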
        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
        {
            const int r = blockIdx.x * blockDim.x + threadIdx.x;
            const int n = blockIdx.y * blockDim.y + threadIdx.y;

            if (r >= accum.cols - 2 || n >= accum.rows - 2)
                return;

            const int curVotes = accum(n + 1, r + 1);

            if (curVotes > threshold &&
                curVotes >  accum(n + 1, r) &&
                curVotes >= accum(n + 1, r + 2) &&
                curVotes >  accum(n, r + 1) &&
                curVotes >= accum(n + 2, r + 1))
            {
                const float radius = (r - (numrho - 1) * 0.5f) * rho;
                const float angle = n * theta;

                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float2(radius, angle);
                    votes[ind] = curVotes;
                }
            }
        }

        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );

            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            if (doSort && totalCount > 0)
            {
                thrust::device_ptr<float2> outPtr(out);
                thrust::device_ptr<int> votesPtr(votes);
                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
            }

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumCenters

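        // The gradient direction is walked in fixed-point arithmetic with SHIFT
        // fractional bits, so each radius step is a pair of integer additions;
        // every point between minRadius and maxRadius votes, in both directions
        // along the gradient, in an accumulator with a one-cell guard border.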
        __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
                                            PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
        {
            const int SHIFT = 10;
            const int ONE = 1 << SHIFT;

            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= count)
                return;

            const unsigned int val = list[tid];

            const int x = (val & 0xFFFF);
            const int y = (val >> 16) & 0xFFFF;

            const int vx = dx(y, x);
            const int vy = dy(y, x);

            if (vx == 0 && vy == 0)
                return;

            const float mag = ::sqrtf(vx * vx + vy * vy);

            const int x0 = __float2int_rn((x * idp) * ONE);
            const int y0 = __float2int_rn((y * idp) * ONE);

            int sx = __float2int_rn((vx * idp) * ONE / mag);
            int sy = __float2int_rn((vy * idp) * ONE / mag);

            // Step from minRadius to maxRadius in both directions of the gradient
            for (int k1 = 0; k1 < 2; ++k1)
            {
                int x1 = x0 + minRadius * sx;
                int y1 = y0 + minRadius * sy;

                for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
                {
                    const int x2 = x1 >> SHIFT;
                    const int y2 = y1 >> SHIFT;

                    if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
                        break;

                    ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
                }

                sx = -sx;
                sy = -sy;
            }
        }

        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
        {
            const dim3 block(256);
            const dim3 grid(divUp(count, block.x));

            cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );

            circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // buildCentersList

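        // Non-maximum suppression over the centers accumulator: a cell survives
        // if it exceeds the threshold and beats its 4-neighbourhood (ties again
        // broken by > versus >=); surviving centers are packed into a list
        // through g_counter.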
        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < accum.cols - 2 && y < accum.rows - 2)
            {
                const int top = accum(y, x + 1);

                const int left = accum(y + 1, x);
                const int cur = accum(y + 1, x + 1);
                const int right = accum(y + 1, x + 2);

                const int bottom = accum(y + 2, x + 1);

                if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
                {
                    const unsigned int val = (y << 16) | x;
                    const int idx = ::atomicAdd(&g_counter, 1);
                    centers[idx] = val;
                }
            }
        }

        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );

            buildCentersList<<<grid, block>>>(accum, centers, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumRadius

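        // One block per candidate center: threads build a radius histogram in
        // shared memory over all edge points, then every radius bin that is a
        // local maximum above the vote threshold is emitted as a circle.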
        __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
                                           float3* circles, const int maxCircles, const float dp,
                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
                smem[i] = 0;
            __syncthreads();

            unsigned int val = centers[blockIdx.x];

            float cx = (val & 0xFFFF);
            float cy = (val >> 16) & 0xFFFF;

            cx = (cx + 0.5f) * dp;
            cy = (cy + 0.5f) * dp;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
                if (rad >= minRadius && rad <= maxRadius)
                {
                    const int r = __float2int_rn(rad - minRadius);

                    Emulation::smem::atomicAdd(&smem[r + 1], 1);
                }
            }

            __syncthreads();

            for (int i = threadIdx.x; i < histSize; i += blockDim.x)
            {
                const int curVotes = smem[i + 1];

                if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
                {
                    const int ind = ::atomicAdd(&g_counter, 1);
                    if (ind < maxCircles)
                        circles[ind] = make_float3(cx, cy, i + minRadius);
                }
            }
        }

        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(centersCount);

            const int histSize = maxRadius - minRadius + 1;
            size_t smemSize = (histSize + 2) * sizeof(int);

            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxCircles);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // Generalized Hough

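        // Ballard's generalized Hough transform: edge points of the template are
        // stored in an R-table indexed by quantized gradient orientation; at
        // detection time each image edge point looks up the row that matches its
        // own orientation and votes for candidate reference-point positions.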
        template <typename T, int PIXELS_PER_THREAD>
        __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy, unsigned int* coordList, float* thetaList)
        {
            __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
            __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_sizes[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_sizes[threadIdx.y] = 0;
            __syncthreads();

            if (y < edges.rows)
            {
                // fill the queue
                const uchar* edgesRow = edges.ptr(y);
                const T* dxRow = dx.ptr(y);
                const T* dyRow = dy.ptr(y);

                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
                {
                    const T dxVal = dxRow[xx];
                    const T dyVal = dyRow[xx];

                    if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
                    {
                        const unsigned int coord = (y << 16) | xx;

                        float theta = ::atan2f(dyVal, dxVal);
                        if (theta < 0)
                            theta += 2.0f * CV_PI_F;

                        const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);

                        s_coordLists[threadIdx.y][qidx] = coord;
                        s_thetaLists[threadIdx.y][qidx] = theta;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_sizes[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_sizes[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
            {
                coordList[gidx] = s_coordLists[threadIdx.y][i];
                thetaList[gidx] = s_thetaLists[threadIdx.y][i];
            }
        }

        template <typename T>
        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
        {
            const int PIXELS_PER_THREAD = 8;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);

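        // R-table row n holds the displacements from template edge points whose
        // quantized gradient orientation is n to the template center; r_sizes[n]
        // counts the entries written to row n.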
        __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                    PtrStep<short2> r_table, int* r_sizes, int maxSize,
                                    const short2 templCenter, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const int ind = ::atomicAdd(r_sizes + n, 1);
            if (ind < maxSize)
                r_table(n, ind) = p - templCenter;
        }

        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                             PtrStepSz<short2> r_table, int* r_sizes,
                             short2 templCenter, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float thetaScale = levels / (2.0f * CV_PI_F);

            buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_Pos

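        // Position-only detection: each edge point subtracts every R-table
        // displacement for its orientation bin and votes in a 2D position
        // histogram that carries a one-cell guard border on each side.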
        __global__ void GHT_Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                 const PtrStep<short2> r_table, const int* r_sizes,
                                                 PtrStepSzi hist,
                                                 const float idp, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                short2 c = p - r_row[j];

                c.x = __float2int_rn(c.x * idp);
                c.y = __float2int_rn(c.y * idp);

                if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
                    ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
            }
        }

        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                          PtrStepSz<short2> r_table, const int* r_sizes,
                                          PtrStepSzi hist,
                                          float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            const int curVotes = hist(y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);

                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
                    votes[ind] = make_int3(curVotes, 0, 0);
                }
            }
        }

        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosScale

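        // Position + scale: the histogram is a stack of scaleRange 2D slabs, each
        // (rows + 2) lines tall with guard borders; slab s + 1 collects the votes
        // cast with the R-table displacement scaled by minScale + s * scaleStep.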
        __global__ void GHT_Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
                                                      PtrStep<short2> r_table, const int* r_sizes,
                                                      PtrStepi hist, const int rows, const int cols,
                                                      const float minScale, const float scaleStep, const int scaleRange,
                                                      const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[blockIdx.x];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                const float2 d = saturate_cast<float2>(r_row[j]);

                for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
                {
                    const float scale = minScale + s * scaleStep;

                    float2 c = p - scale * d;

                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((s + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                               PtrStepSz<short2> r_table, const int* r_sizes,
                                               PtrStepi hist, int rows, int cols,
                                               float minScale, float scaleStep, int scaleRange,
                                               float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosScale_calcHist<<<grid, block>>>(coordList, thetaList,
                                                           r_table, r_sizes,
                                                           hist, rows, cols,
                                                           minScale, scaleStep, scaleRange,
                                                           idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
                                                           float4* out, int3* votes, const int maxSize,
                                                           const float minScale, const float scaleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int s = 0; s < scaleRange; ++s)
            {
                const float scale = minScale + s * scaleStep;

                const int prevScaleIdx = (s) * (rows + 2);
                const int curScaleIdx = (s + 1) * (rows + 2);
                const int nextScaleIdx = (s + 2) * (rows + 2);

                const int curVotes = hist(curScaleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curScaleIdx + y + 1, x) &&
                    curVotes >= hist(curScaleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curScaleIdx + y, x + 1) &&
                    curVotes >= hist(curScaleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevScaleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextScaleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);

                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, scale, 0.0f);
                        votes[ind] = make_int3(curVotes, curVotes, 0);
                    }
                }
            }
        }

        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
                                                   float minScale, float scaleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosScale_findPosInHist<<<grid, block>>>(hist, rows, cols, scaleRange, out, votes, maxSize, minScale, scaleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosRotation

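        // Position + rotation: for each candidate angle the edge orientation is
        // shifted back by that angle to select the R-table row, and the stored
        // displacement is rotated by the angle before voting, again into one
        // 2D slab per angle step.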
        __global__ void GHT_Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
                                                         PtrStep<short2> r_table, const int* r_sizes,
                                                         PtrStepi hist, const int rows, const int cols,
                                                         const float minAngle, const float angleStep, const int angleRange,
                                                         const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float thetaVal = thetaList[blockIdx.x];

            for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
            {
                const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f);
                float sinA, cosA;
                sincosf(angle, &sinA, &cosA);

                float theta = thetaVal - angle;
                if (theta < 0)
                    theta += 2.0f * CV_PI_F;

                const int n = __float2int_rn(theta * thetaScale);

                const short2* r_row = r_table.ptr(n);
                const int r_row_size = r_sizes[n];

                for (int j = 0; j < r_row_size; ++j)
                {
                    const float2 d = saturate_cast<float2>(r_row[j]);

                    const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);

                    float2 c = make_float2(p.x - dr.x, p.y - dr.y);
                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                  PtrStepSz<short2> r_table, const int* r_sizes,
                                                  PtrStepi hist, int rows, int cols,
                                                  float minAngle, float angleStep, int angleRange,
                                                  float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosRotation_calcHist<<<grid, block>>>(coordList, thetaList,
                                                              r_table, r_sizes,
                                                              hist, rows, cols,
                                                              minAngle, angleStep, angleRange,
                                                              idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
                                                              float4* out, int3* votes, const int maxSize,
                                                              const float minAngle, const float angleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int a = 0; a < angleRange; ++a)
            {
                const float angle = minAngle + a * angleStep;

                const int prevAngleIdx = (a) * (rows + 2);
                const int curAngleIdx = (a + 1) * (rows + 2);
                const int nextAngleIdx = (a + 2) * (rows + 2);

                const int curVotes = hist(curAngleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curAngleIdx + y + 1, x) &&
                    curVotes >= hist(curAngleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curAngleIdx + y, x + 1) &&
                    curVotes >= hist(curAngleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevAngleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextAngleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);

                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, 1.0f, angle);
                        votes[ind] = make_int3(curVotes, 0, curVotes);
                    }
                }
            }
        }

        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
                                                      float minAngle, float angleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosRotation_findPosInHist<<<grid, block>>>(hist, rows, cols, angleRange, out, votes, maxSize, minAngle, angleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full

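        // The full variant (position, scale and rotation), following Guil et al.:
        // edge points are paired, and the pair features (orientation difference,
        // point-pair distance, radii to the reference point) allow rotation and
        // scale to be estimated before positions are voted on. The feature tables
        // for template and image are described by pointers kept in constant memory.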
        struct FeatureTable
        {
            uchar* p1_pos_data;
            size_t p1_pos_step;

            uchar* p1_theta_data;
            size_t p1_theta_step;

            uchar* p2_pos_data;
            size_t p2_pos_step;

            uchar* d12_data;
            size_t d12_step;

            uchar* r1_data;
            size_t r1_step;

            uchar* r2_data;
            size_t r2_step;
        };

        __constant__ FeatureTable c_templFeatures;
        __constant__ FeatureTable c_imageFeatures;

        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
        }
        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
        }

        struct TemplFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
            }
        };
        struct ImageFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
            }
        };

        __device__ float clampAngle(float a)
        {
            float res = a;

            while (res > 2.0f * CV_PI_F)
                res -= 2.0f * CV_PI_F;
            while (res < 0.0f)
                res += 2.0f * CV_PI_F;

            return res;
        }

        __device__ bool angleEq(float a, float b, float eps)
        {
            return (::fabs(clampAngle(a - b)) <= eps);
        }

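        // One block per first point p1: every point p2 whose orientation differs
        // from p1's by xi forms a pair. Pairs are binned by alpha12, the angle of
        // the segment p1->p2 relative to p1's gradient direction; template pairs
        // store the radii r1, r2 to the template center, image pairs store their
        // absolute positions instead.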
        template <class FT, bool isTempl>
        __global__ void GHT_Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                       int* sizes, const int maxSize,
                                                       const float xi, const float angleEpsilon, const float alphaScale,
                                                       const float2 center, const float maxDist)
        {
            const float p1_theta = thetaList[blockIdx.x];
            const unsigned int coord1 = coordList[blockIdx.x];
            float2 p1_pos;
            p1_pos.x = (coord1 & 0xFFFF);
            p1_pos.y = (coord1 >> 16) & 0xFFFF;

            for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
            {
                const float p2_theta = thetaList[i];
                const unsigned int coord2 = coordList[i];
                float2 p2_pos;
                p2_pos.x = (coord2 & 0xFFFF);
                p2_pos.y = (coord2 >> 16) & 0xFFFF;

                if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
                {
                    const float2 d = p1_pos - p2_pos;

                    float alpha12 = clampAngle(::atan2f(d.y, d.x) - p1_theta);
                    float d12 = ::sqrtf(d.x * d.x + d.y * d.y);

                    if (d12 > maxDist)
                        continue;

                    float2 r1 = p1_pos - center;
                    float2 r2 = p2_pos - center;

                    const int n = __float2int_rn(alpha12 * alphaScale);

                    const int ind = ::atomicAdd(sizes + n, 1);

                    if (ind < maxSize)
                    {
                        if (!isTempl)
                        {
                            FT::p1_pos(n)[ind] = p1_pos;
                            FT::p2_pos(n)[ind] = p2_pos;
                        }

                        FT::p1_theta(n)[ind] = p1_theta;

                        FT::d12(n)[ind] = d12;

                        if (isTempl)
                        {
                            FT::r1(n)[ind] = r1;
                            FT::r2(n)[ind] = r2;
                        }
                    }
                }
            }
        }

        template <class FT, bool isTempl>
        void GHT_Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                   int* sizes, int maxSize,
                                                   float xi, float angleEpsilon, int levels,
                                                   float2 center, float maxDist)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float alphaScale = levels / (2.0f * CV_PI_F);

            GHT_Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
                                                                         sizes, maxSize,
                                                                         xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
                                                                         center, maxDist);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            thrust::device_ptr<int> sizesPtr(sizes);
            thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
        }

        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
                                                                           sizes, maxSize,
                                                                           xi, angleEpsilon, levels,
                                                                           center, maxDist);
        }
        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
                                                                            sizes, maxSize,
                                                                            xi, angleEpsilon, levels,
                                                                            center, maxDist);
        }

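        // Orientation histogram: for every feature level, the differences between
        // image and template p1 orientations vote for the global rotation angle;
        // per-block partial histograms are built in shared memory and merged into
        // OHist with global atomics.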
        __global__ void GHT_Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
                                                const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
        {
            extern __shared__ int s_OHist[];
            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
                s_OHist[i] = 0;
            __syncthreads();

            // one block per (template feature, level) pair
            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];

                    const float angle = clampAngle(im_p1_theta - t_p1_theta);

                    if (angle >= minAngle && angle <= maxAngle)
                    {
                        const int n = __float2int_rn((angle - minAngle) * iAngleStep);
                        Emulation::smem::atomicAdd(&s_OHist[n], 1);
                    }
                }
            }
            __syncthreads();

            // merge the per-block histogram into the global one
            for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
                ::atomicAdd(OHist + i, s_OHist[i]);
        }

        void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
                                         float minAngle, float maxAngle, float angleStep, int angleRange,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            minAngle *= (CV_PI_F / 180.0f);
            maxAngle *= (CV_PI_F / 180.0f);
            angleStep *= (CV_PI_F / 180.0f);

            // shared-memory histogram of (angleRange + 1) int bins
            const size_t smemSize = (angleRange + 1) * sizeof(int);

            GHT_Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
                                                               minAngle, maxAngle, 1.0f / angleStep, angleRange);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

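        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_calcSHist
        //
        // Second voting stage: with the orientation fixed, feature pairs whose
        // p1 orientations agree (up to angleEpsilon) vote for the ratio of
        // their p1-p2 distances, i.e. the candidate scale.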
        __global__ void GHT_Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
                                                const float angle, const float angleEpsilon,
                                                const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
        {
            extern __shared__ int s_SHist[];
            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
                s_SHist[i] = 0;
            __syncthreads();

            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                // rotate the template orientation by the candidate angle;
                // d12 is a distance and must not be offset by the angle
                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
                const float t_d12 = TemplFeatureTable::d12(level)[tIdx];

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
                    const float im_d12 = ImageFeatureTable::d12(level)[i];

                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                    {
                        const float scale = im_d12 / t_d12;

                        if (scale >= minScale && scale <= maxScale)
                        {
                            const int s = __float2int_rn((scale - minScale) * iScaleStep);
                            Emulation::smem::atomicAdd(&s_SHist[s], 1);
                        }
                    }
                }
            }
            __syncthreads();

            for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
                ::atomicAdd(SHist + i, s_SHist[i]);
        }

        void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
                                         float angle, float angleEpsilon,
                                         float minScale, float maxScale, float iScaleStep, int scaleRange,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            angle *= (CV_PI_F / 180.0f);
            angleEpsilon *= (CV_PI_F / 180.0f);

            // shared-memory histogram of (scaleRange + 1) int bins
            const size_t smemSize = (scaleRange + 1) * sizeof(int);

            GHT_Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
                                                               angle, angleEpsilon,
                                                               minScale, maxScale, iScaleStep, scaleRange);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

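        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_calcPHist
        //
        // Third voting stage: with orientation and scale fixed, each matching
        // feature pair reconstructs the reference-point position from both p1
        // and p2; if the two estimates agree, a vote is cast into the 2D
        // position histogram PHist.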
        __global__ void GHT_Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                                const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
                                                const float idp)
        {
            const int tIdx = blockIdx.x;
            const int level = blockIdx.y;

            const int tSize = templSizes[level];

            if (tIdx < tSize)
            {
                const int imSize = imageSizes[level];

                const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;

                float2 r1 = TemplFeatureTable::r1(level)[tIdx];
                float2 r2 = TemplFeatureTable::r2(level)[tIdx];

                // scale and rotate the center-relative template vectors
                r1 = r1 * scale;
                r2 = r2 * scale;

                r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
                r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);

                for (int i = threadIdx.x; i < imSize; i += blockDim.x)
                {
                    const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];

                    const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
                    const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];

                    if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
                    {
                        // estimate the reference point from both feature points
                        float2 c1, c2;

                        c1 = im_p1_pos - r1;
                        c1 = c1 * idp;

                        c2 = im_p2_pos - r2;
                        c2 = c2 * idp;

                        // discard pairs whose two estimates disagree by more than one cell
                        if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
                            continue;

                        // PHist keeps a one-cell border, hence the +1 offsets
                        if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
                            ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
                    }
                }
            }
        }

        void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
                                         float angle, float angleEpsilon, float scale,
                                         float dp,
                                         int levels, int tMaxSize)
        {
            const dim3 block(256);
            const dim3 grid(tMaxSize, levels + 1);

            angle *= (CV_PI_F / 180.0f);
            angleEpsilon *= (CV_PI_F / 180.0f);

            const float sinVal = ::sinf(angle);
            const float cosVal = ::cosf(angle);

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_calcPHist, cudaFuncCachePreferL1) );

            GHT_Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
                                                     angle, sinVal, cosVal, angleEpsilon, scale,
                                                     1.0f / dp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

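        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_findPosInHist
        //
        // Final stage: non-maximum suppression over the position histogram.
        // Cells whose vote count exceeds the threshold and is a local maximum
        // of their 4-neighbourhood are emitted as (x, y, scale, angle)
        // candidates together with their position/scale/angle vote counts.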
        __global__ void GHT_Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
                                                    const float angle, const int angleVotes, const float scale, const int scaleVotes,
                                                    const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            // the histogram has a one-cell border, hence the +1 offsets
            const int curVotes = hist(y + 1, x + 1);

            // 4-neighbourhood non-maximum suppression (strict on one side,
            // non-strict on the other, to break ties between equal neighbours)
            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);

                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, scale, angle);
                    votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
                }
            }
        }

        int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
                                            float angle, int angleVotes, float scale, int scaleVotes,
                                            float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            // start counting from curSize so that results of successive
            // (scale, angle) sweeps are appended to the same output buffers
            cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
                                                         angle, angleVotes, scale, scaleVotes,
                                                         dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }
    }
}}}