// modules/gpu/src/cuda/hough.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/dynamic_smem.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace hough
    {
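        // Device-global counter used by the kernels below to reserve slots in
        // compacted output lists: each host wrapper zeroes it with cudaMemset
        // before the launch and copies the final count back afterwards.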
        __device__ int g_counter;

        ////////////////////////////////////////////////////////////////////////
        // buildPointList

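        // Compacts the non-zero pixels of src into a flat list of coordinates
        // packed as (y << 16) | x. Each block row first gathers its hits into
        // a per-row shared-memory queue, a single thread then reserves space
        // for the whole block with one atomicAdd on g_counter, and the queues
        // are copied out to the global list cooperatively.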
        template <int PIXELS_PER_THREAD>
        __global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
        {
            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_qsize[threadIdx.y] = 0;
            __syncthreads();

            if (y < src.rows)
            {
                // fill the queue
                const uchar* srcRow = src.ptr(y);
                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
                {
                    if (srcRow[xx])
                    {
                        const unsigned int val = (y << 16) | xx;
                        const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
                        s_queues[threadIdx.y][qidx] = val;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_qsize[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_qsize[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
                list[gidx] = s_queues[threadIdx.y][i];
        }

        int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
        {
            const int PIXELS_PER_THREAD = 16;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // linesAccum

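        // Line accumulation. A line is parameterized as
        // rho = x * cos(theta) + y * sin(theta); one block handles one theta
        // (one accumulator row), its threads iterate over the point list and
        // vote into rho bins. rho is pre-scaled by irho = 1 / rho and shifted
        // so that negative values map to valid bin indices.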
        __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                ::atomicAdd(accumRow + r + 1, 1);
            }
        }

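        // Same accumulation as above, but votes are first collected in a
        // per-block histogram in dynamic shared memory and written out to the
        // global accumulator row once at the end, avoiding contended atomics
        // in global memory.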
        __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                smem[i] = 0;

            __syncthreads();

            const int n = blockIdx.x;
            const float ang = n * theta;

            float sinVal;
            float cosVal;
            sincosf(ang, &sinVal, &cosVal);
            sinVal *= irho;
            cosVal *= irho;

            const int shift = (numrho - 1) / 2;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                const unsigned int val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                int r = __float2int_rn(x * cosVal + y * sinVal);
                r += shift;

                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }

            __syncthreads();

            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
                accumRow[i] = smem[i];
        }

        void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
        {
            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(accum.rows - 2);

            size_t smemSize = (accum.cols - 1) * sizeof(int);

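            // Use the shared-memory kernel only when the rho histogram fits
            // into shared memory with some headroom; otherwise vote directly
            // in global memory.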
            if (smemSize < sharedMemPerBlock - 1000)
                linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
            else
                linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // linesGetResult

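        // Extracts local maxima from the (theta, rho) accumulator: a cell is
        // kept if its vote count exceeds the threshold and it is a maximum
        // within its 4-neighbourhood (strict comparison against two of the
        // neighbours breaks ties between equal cells). Winners are appended
        // to out/votes through g_counter.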
        __global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
        {
            const int r = blockIdx.x * blockDim.x + threadIdx.x;
            const int n = blockIdx.y * blockDim.y + threadIdx.y;

            if (r >= accum.cols - 2 || n >= accum.rows - 2)
                return;

            const int curVotes = accum(n + 1, r + 1);

            if (curVotes > threshold &&
                curVotes >  accum(n + 1, r) &&
                curVotes >= accum(n + 1, r + 2) &&
                curVotes >  accum(n, r + 1) &&
                curVotes >= accum(n + 2, r + 1))
            {
                const float radius = (r - (numrho - 1) * 0.5f) * rho;
                const float angle = n * theta;

                const int ind = ::atomicAdd(&g_counter, 1);
                if (ind < maxSize)
                {
                    out[ind] = make_float2(radius, angle);
                    votes[ind] = curVotes;
                }
            }
        }

        int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );

            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            if (doSort && totalCount > 0)
            {
                thrust::device_ptr<float2> outPtr(out);
                thrust::device_ptr<int> votesPtr(votes);
                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
            }

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumCenters

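        // Circle center voting: every edge point casts votes along its
        // gradient direction, for radii in [minRadius, maxRadius], in both
        // orientations. Positions are stepped in fixed-point arithmetic with
        // SHIFT = 10 fractional bits to avoid per-step float rounding.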
        __global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
                                            PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
        {
            const int SHIFT = 10;
            const int ONE = 1 << SHIFT;

            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= count)
                return;

            const unsigned int val = list[tid];

            const int x = (val & 0xFFFF);
            const int y = (val >> 16) & 0xFFFF;

            const int vx = dx(y, x);
            const int vy = dy(y, x);

            if (vx == 0 && vy == 0)
                return;

            const float mag = ::sqrtf(vx * vx + vy * vy);

            const int x0 = __float2int_rn((x * idp) * ONE);
            const int y0 = __float2int_rn((y * idp) * ONE);

            int sx = __float2int_rn((vx * idp) * ONE / mag);
            int sy = __float2int_rn((vy * idp) * ONE / mag);

            // Step from minRadius to maxRadius in both directions of the gradient
            for (int k1 = 0; k1 < 2; ++k1)
            {
                int x1 = x0 + minRadius * sx;
                int y1 = y0 + minRadius * sy;

                for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
                {
                    const int x2 = x1 >> SHIFT;
                    const int y2 = y1 >> SHIFT;

                    if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
                        break;

                    ::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
                }

                sx = -sx;
                sy = -sy;
            }
        }

        void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
        {
            const dim3 block(256);
            const dim3 grid(divUp(count, block.x));

            cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );

            circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // buildCentersList

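        // Collects candidate circle centers: 2D non-maximum suppression over
        // the center accumulator (4-neighbourhood plus threshold), with the
        // winners packed as (y << 16) | x, like the point list.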
        __global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < accum.cols - 2 && y < accum.rows - 2)
            {
                const int top = accum(y, x + 1);

                const int left = accum(y + 1, x);
                const int cur = accum(y + 1, x + 1);
                const int right = accum(y + 1, x + 2);

                const int bottom = accum(y + 2, x + 1);

                if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
                {
                    const unsigned int val = (y << 16) | x;
                    const int idx = ::atomicAdd(&g_counter, 1);
                    centers[idx] = val;
                }
            }
        }

        int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );

            buildCentersList<<<grid, block>>>(accum, centers, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // circlesAccumRadius

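        // For each candidate center (one block per center), builds a
        // shared-memory histogram over the distances from the center to all
        // edge points, then emits (cx, cy, radius) triples for histogram bins
        // that pass the threshold and are local maxima.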
        __global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
                                           float3* circles, const int maxCircles, const float dp,
                                           const int minRadius, const int maxRadius, const int histSize, const int threshold)
        {
            int* smem = DynamicSharedMem<int>();

            for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
                smem[i] = 0;
            __syncthreads();

            unsigned int val = centers[blockIdx.x];

            float cx = (val & 0xFFFF);
            float cy = (val >> 16) & 0xFFFF;

            cx = (cx + 0.5f) * dp;
            cy = (cy + 0.5f) * dp;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
                val = list[i];

                const int x = (val & 0xFFFF);
                const int y = (val >> 16) & 0xFFFF;

                const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
                if (rad >= minRadius && rad <= maxRadius)
                {
                    const int r = __float2int_rn(rad - minRadius);

                    Emulation::smem::atomicAdd(&smem[r + 1], 1);
                }
            }

            __syncthreads();

            for (int i = threadIdx.x; i < histSize; i += blockDim.x)
            {
                const int curVotes = smem[i + 1];

                if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
                {
                    const int ind = ::atomicAdd(&g_counter, 1);
                    if (ind < maxCircles)
                        circles[ind] = make_float3(cx, cy, i + minRadius);
                }
            }
        }

        int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
                                   float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(has20 ? 1024 : 512);
            const dim3 grid(centersCount);

            const int histSize = maxRadius - minRadius + 1;
            size_t smemSize = (histSize + 2) * sizeof(int);

            circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxCircles);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // Generalized Hough

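        // Generalized Hough transform (Ballard and Guil variants). Edge
        // points are extracted together with their gradient orientation
        // theta = atan2(dy, dx), normalized to [0, 2*pi); the Ballard R-table
        // maps quantized theta to displacements from the template center.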
        template <typename T, int PIXELS_PER_THREAD>
        __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy, unsigned int* coordList, float* thetaList)
        {
            __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
            __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_sizes[4];
            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (threadIdx.x == 0)
                s_sizes[threadIdx.y] = 0;
            __syncthreads();

            if (y < edges.rows)
            {
                // fill the queue
                const uchar* edgesRow = edges.ptr(y);
                const T* dxRow = dx.ptr(y);
                const T* dyRow = dy.ptr(y);

                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
                {
                    const T dxVal = dxRow[xx];
                    const T dyVal = dyRow[xx];

                    if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
                    {
                        const unsigned int coord = (y << 16) | xx;

                        float theta = ::atan2f(dyVal, dxVal);
                        if (theta < 0)
                            theta += 2.0f * CV_PI_F;

                        const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);

                        s_coordLists[threadIdx.y][qidx] = coord;
                        s_thetaLists[threadIdx.y][qidx] = theta;
                    }
                }
            }

            __syncthreads();

            // let one thread reserve the space required in the global list
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
                    s_globStart[i] = totalSize;
                    totalSize += s_sizes[i];
                }

                // calculate the offset in the global list
                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_sizes[threadIdx.y];
            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
            for (int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
            {
                coordList[gidx] = s_coordLists[threadIdx.y][i];
                thetaList[gidx] = s_thetaLists[threadIdx.y][i];
            }
        }

        template <typename T>
        int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
        {
            const int PIXELS_PER_THREAD = 8;

            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );

            buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            return totalCount;
        }

        template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
        template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);

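        // Builds the R-table: for each template edge point, the displacement
        // p - templCenter is appended to the row indexed by its quantized
        // gradient orientation n = round(theta * levels / (2*pi)).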
        __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                    PtrStep<short2> r_table, int* r_sizes, int maxSize,
                                    const short2 templCenter, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const int ind = ::atomicAdd(r_sizes + n, 1);
            if (ind < maxSize)
                r_table(n, ind) = p - templCenter;
        }

        void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                             PtrStepSz<short2> r_table, int* r_sizes,
                             short2 templCenter, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float thetaScale = levels / (2.0f * CV_PI_F);

            buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_Pos

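        // Ballard GHT, position only: each image edge point looks up the
        // R-table row for its orientation and votes for candidate centers
        // c = p - r at accumulator resolution dp. Peaks are then found by the
        // same 4-neighbour non-maximum suppression as for lines.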
        __global__ void GHT_Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                 const PtrStep<short2> r_table, const int* r_sizes,
                                                 PtrStepSzi hist,
                                                 const float idp, const float thetaScale)
        {
            const int tid = blockIdx.x * blockDim.x + threadIdx.x;

            if (tid >= pointsCount)
                return;

            const unsigned int coord = coordList[tid];
            short2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[tid];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                short2 c = p - r_row[j];

                c.x = __float2int_rn(c.x * idp);
                c.y = __float2int_rn(c.y * idp);

                if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
                    ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
            }
        }

        void GHT_Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                          PtrStepSz<short2> r_table, const int* r_sizes,
                                          PtrStepSzi hist,
                                          float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(divUp(pointsCount, block.x));

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= hist.cols - 2 || y >= hist.rows - 2)
                return;

            const int curVotes = hist(y + 1, x + 1);

            if (curVotes > threshold &&
                curVotes >  hist(y + 1, x) &&
                curVotes >= hist(y + 1, x + 2) &&
                curVotes >  hist(y, x + 1) &&
                curVotes >= hist(y + 2, x + 1))
            {
                const int ind = ::atomicAdd(&g_counter, 1);

                if (ind < maxSize)
                {
                    out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
                    votes[ind] = make_int3(curVotes, 0, 0);
                }
            }
        }

        int GHT_Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosScale

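        // Ballard GHT, position + scale: the accumulator is a stack of
        // scaleRange + 2 position histograms, each (rows + 2) x (cols + 2),
        // addressed as hist.ptr((s + 1) * (rows + 2) + y + 1). Votes go to
        // c = p - scale * d for every scale in [minScale, minScale +
        // scaleRange * scaleStep).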
        __global__ void GHT_Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
                                                      PtrStep<short2> r_table, const int* r_sizes,
                                                      PtrStepi hist, const int rows, const int cols,
                                                      const float minScale, const float scaleStep, const int scaleRange,
                                                      const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float theta = thetaList[blockIdx.x];
            const int n = __float2int_rn(theta * thetaScale);

            const short2* r_row = r_table.ptr(n);
            const int r_row_size = r_sizes[n];

            for (int j = 0; j < r_row_size; ++j)
            {
                const float2 d = saturate_cast<float2>(r_row[j]);

                for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
                {
                    const float scale = minScale + s * scaleStep;

                    float2 c = p - scale * d;

                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((s + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                               PtrStepSz<short2> r_table, const int* r_sizes,
                                               PtrStepi hist, int rows, int cols,
                                               float minScale, float scaleStep, int scaleRange,
                                               float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosScale_calcHist<<<grid, block>>>(coordList, thetaList,
                                                           r_table, r_sizes,
                                                           hist, rows, cols,
                                                           minScale, scaleStep, scaleRange,
                                                           idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
                                                           float4* out, int3* votes, const int maxSize,
                                                           const float minScale, const float scaleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int s = 0; s < scaleRange; ++s)
            {
                const float scale = minScale + s * scaleStep;

                const int prevScaleIdx = (s) * (rows + 2);
                const int curScaleIdx = (s + 1) * (rows + 2);
                const int nextScaleIdx = (s + 2) * (rows + 2);

                const int curVotes = hist(curScaleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curScaleIdx + y + 1, x) &&
                    curVotes >= hist(curScaleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curScaleIdx + y, x + 1) &&
                    curVotes >= hist(curScaleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevScaleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextScaleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);

                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, scale, 0.0f);
                        votes[ind] = make_int3(curVotes, curVotes, 0);
                    }
                }
            }
        }

        int GHT_Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
                                                   float minScale, float scaleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosScale_findPosInHist<<<grid, block>>>(hist, rows, cols, scaleRange, out, votes, maxSize, minScale, scaleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Ballard_PosRotation

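        // Ballard GHT, position + rotation: analogous to the scale variant,
        // but each slice of the histogram stack corresponds to a candidate
        // rotation. The R-table displacement is rotated by the candidate
        // angle before voting, and the table row is selected from the edge
        // orientation minus that angle.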
        __global__ void GHT_Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
                                                         PtrStep<short2> r_table, const int* r_sizes,
                                                         PtrStepi hist, const int rows, const int cols,
                                                         const float minAngle, const float angleStep, const int angleRange,
                                                         const float idp, const float thetaScale)
        {
            const unsigned int coord = coordList[blockIdx.x];
            float2 p;
            p.x = (coord & 0xFFFF);
            p.y = (coord >> 16) & 0xFFFF;

            const float thetaVal = thetaList[blockIdx.x];

            for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
            {
                const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f);
                float sinA, cosA;
                sincosf(angle, &sinA, &cosA);

                float theta = thetaVal - angle;
                if (theta < 0)
                    theta += 2.0f * CV_PI_F;

                const int n = __float2int_rn(theta * thetaScale);

                const short2* r_row = r_table.ptr(n);
                const int r_row_size = r_sizes[n];

                for (int j = 0; j < r_row_size; ++j)
                {
                    const float2 d = saturate_cast<float2>(r_row[j]);

                    const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);

                    float2 c = make_float2(p.x - dr.x, p.y - dr.y);
                    c.x *= idp;
                    c.y *= idp;

                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
                        ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
                }
            }
        }

        void GHT_Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                  PtrStepSz<short2> r_table, const int* r_sizes,
                                                  PtrStepi hist, int rows, int cols,
                                                  float minAngle, float angleStep, int angleRange,
                                                  float dp, int levels)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float idp = 1.0f / dp;
            const float thetaScale = levels / (2.0f * CV_PI_F);

            GHT_Ballard_PosRotation_calcHist<<<grid, block>>>(coordList, thetaList,
                                                              r_table, r_sizes,
                                                              hist, rows, cols,
                                                              minAngle, angleStep, angleRange,
                                                              idp, thetaScale);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void GHT_Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
                                                              float4* out, int3* votes, const int maxSize,
                                                              const float minAngle, const float angleStep, const float dp, const int threshold)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= cols || y >= rows)
                return;

            for (int a = 0; a < angleRange; ++a)
            {
                const float angle = minAngle + a * angleStep;

                const int prevAngleIdx = (a) * (rows + 2);
                const int curAngleIdx = (a + 1) * (rows + 2);
                const int nextAngleIdx = (a + 2) * (rows + 2);

                const int curVotes = hist(curAngleIdx + y + 1, x + 1);

                if (curVotes > threshold &&
                    curVotes >  hist(curAngleIdx + y + 1, x) &&
                    curVotes >= hist(curAngleIdx + y + 1, x + 2) &&
                    curVotes >  hist(curAngleIdx + y, x + 1) &&
                    curVotes >= hist(curAngleIdx + y + 2, x + 1) &&
                    curVotes >  hist(prevAngleIdx + y + 1, x + 1) &&
                    curVotes >= hist(nextAngleIdx + y + 1, x + 1))
                {
                    const int ind = ::atomicAdd(&g_counter, 1);

                    if (ind < maxSize)
                    {
                        out[ind] = make_float4(x * dp, y * dp, 1.0f, angle);
                        votes[ind] = make_int3(curVotes, 0, curVotes);
                    }
                }
            }
        }

        int GHT_Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
                                                      float minAngle, float angleStep, float dp, int threshold)
        {
            void* counterPtr;
            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(GHT_Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );

            GHT_Ballard_PosRotation_findPosInHist<<<grid, block>>>(hist, rows, cols, angleRange, out, votes, maxSize, minAngle, angleStep, dp, threshold);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int totalCount;
            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

            totalCount = ::min(totalCount, maxSize);

            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full

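        // Guil full GHT (position + rotation + scale). Features are pairs of
        // edge points whose orientation difference matches xi; per-pair
        // attributes (positions, orientation, distance d12, displacements r1
        // and r2 from the center) are stored in tables held in constant
        // memory, indexed by the quantized pair angle alpha12. Rotation is
        // then estimated from an orientation-difference histogram (calcOHist)
        // and scale from a distance-ratio histogram (calcSHist).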
        struct FeatureTable
        {
            uchar* p1_pos_data;
            size_t p1_pos_step;

            uchar* p1_theta_data;
            size_t p1_theta_step;

            uchar* p2_pos_data;
            size_t p2_pos_step;

            uchar* d12_data;
            size_t d12_step;

            uchar* r1_data;
            size_t r1_step;

            uchar* r2_data;
            size_t r2_step;
        };

        __constant__ FeatureTable c_templFeatures;
        __constant__ FeatureTable c_imageFeatures;

        void GHT_Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
        }
        void GHT_Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
        {
            FeatureTable tbl;

            tbl.p1_pos_data = p1_pos.data;
            tbl.p1_pos_step = p1_pos.step;

            tbl.p1_theta_data = p1_theta.data;
            tbl.p1_theta_step = p1_theta.step;

            tbl.p2_pos_data = p2_pos.data;
            tbl.p2_pos_step = p2_pos.step;

            tbl.d12_data = d12.data;
            tbl.d12_step = d12.step;

            tbl.r1_data = r1.data;
            tbl.r1_step = r1.step;

            tbl.r2_data = r2.data;
            tbl.r2_step = r2.step;

            cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
        }

        struct TemplFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
            }
        };
        struct ImageFeatureTable
        {
            static __device__ float2* p1_pos(int n)
            {
                return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
            }
            static __device__ float* p1_theta(int n)
            {
                return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
            }
            static __device__ float2* p2_pos(int n)
            {
                return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
            }

            static __device__ float* d12(int n)
            {
                return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
            }

            static __device__ float2* r1(int n)
            {
                return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
            }
            static __device__ float2* r2(int n)
            {
                return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
            }
        };

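        // Angle helpers: clampAngle wraps a value into [0, 2*pi); angleEq
        // treats a and b as equal when the wrapped difference
        // clampAngle(a - b) is at most eps.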
        __device__ float clampAngle(float a)
        {
            float res = a;

            while (res > 2.0f * CV_PI_F)
                res -= 2.0f * CV_PI_F;
            while (res < 0.0f)
                res += 2.0f * CV_PI_F;

            return res;
        }

        __device__ bool angleEq(float a, float b, float eps)
        {
            return (::fabs(clampAngle(a - b)) <= eps);
        }

        template <class FT, bool isTempl>
        __global__ void GHT_Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
                                                       int* sizes, const int maxSize,
                                                       const float xi, const float angleEpsilon, const float alphaScale,
                                                       const float2 center, const float maxDist)
        {
            const float p1_theta = thetaList[blockIdx.x];
            const unsigned int coord1 = coordList[blockIdx.x];
            float2 p1_pos;
            p1_pos.x = (coord1 & 0xFFFF);
            p1_pos.y = (coord1 >> 16) & 0xFFFF;

            for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
            {
                const float p2_theta = thetaList[i];
                const unsigned int coord2 = coordList[i];
                float2 p2_pos;
                p2_pos.x = (coord2 & 0xFFFF);
                p2_pos.y = (coord2 >> 16) & 0xFFFF;

                if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
                {
                    const float2 d = p1_pos - p2_pos;

                    float alpha12 = clampAngle(::atan2(d.y, d.x) - p1_theta);
                    float d12 = ::sqrtf(d.x * d.x + d.y * d.y);

                    if (d12 > maxDist)
                        continue;

                    float2 r1 = p1_pos - center;
                    float2 r2 = p2_pos - center;

                    const int n = __float2int_rn(alpha12 * alphaScale);

                    const int ind = ::atomicAdd(sizes + n, 1);

                    if (ind < maxSize)
                    {
                        if (!isTempl)
                        {
                            FT::p1_pos(n)[ind] = p1_pos;
                            FT::p2_pos(n)[ind] = p2_pos;
                        }

                        FT::p1_theta(n)[ind] = p1_theta;

                        FT::d12(n)[ind] = d12;

                        if (isTempl)
                        {
                            FT::r1(n)[ind] = r1;
                            FT::r2(n)[ind] = r2;
                        }
                    }
                }
            }
        }

        template <class FT, bool isTempl>
        void GHT_Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                   int* sizes, int maxSize,
                                                   float xi, float angleEpsilon, int levels,
                                                   float2 center, float maxDist)
        {
            const dim3 block(256);
            const dim3 grid(pointsCount);

            const float alphaScale = levels / (2.0f * CV_PI_F);

            GHT_Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
                                                                         sizes, maxSize,
                                                                         xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
                                                                         center, maxDist);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            thrust::device_ptr<int> sizesPtr(sizes);
            thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
        }

        void GHT_Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
                                                                           sizes, maxSize,
                                                                           xi, angleEpsilon, levels,
                                                                           center, maxDist);
        }
        void GHT_Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
                                                     int* sizes, int maxSize,
                                                     float xi, float angleEpsilon, int levels,
                                                     float2 center, float maxDist)
        {
            GHT_Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
                                                                            sizes, maxSize,
                                                                            xi, angleEpsilon, levels,
                                                                            center, maxDist);
        }

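        // Orientation histogram: for matching feature pairs, accumulates the
        // difference between image and template p1 orientations into a
        // per-block shared histogram, then merges into the global OHist; the
        // peak of OHist gives the rotation between template and image.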
1256         __global__ void GHT_Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
1257                                                 const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
1258         {
1259             extern __shared__ int s_OHist[];
1260             for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
1261                 s_OHist[i] = 0;
1262             __syncthreads();
1263
1264             const int tIdx = blockIdx.x;
1265             const int level = blockIdx.y;
1266
1267             const int tSize = templSizes[level];
1268
1269             if (tIdx < tSize)
1270             {
1271                 const int imSize = imageSizes[level];
1272
1273                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];
1274
1275                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
1276                 {
1277                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
1278
1279                     const float angle = clampAngle(im_p1_theta - t_p1_theta);
1280
1281                     if (angle >= minAngle && angle <= maxAngle)
1282                     {
1283                         const int n = __float2int_rn((angle - minAngle) * iAngleStep);
1284                         Emulation::smem::atomicAdd(&s_OHist[n], 1);
1285                     }
1286                 }
1287             }
1288             __syncthreads();
1289
1290             for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
1291                 ::atomicAdd(OHist + i, s_OHist[i]);
1292         }
1293
1294         void GHT_Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
1295                                          float minAngle, float maxAngle, float angleStep, int angleRange,
1296                                          int levels, int tMaxSize)
1297         {
1298             const dim3 block(256);
1299             const dim3 grid(tMaxSize, levels + 1);
1300
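            // the host-side API works in degrees; convert to radians and invert
            // the step so the kernel can bin with a single multiply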
1301             minAngle *= (CV_PI_F / 180.0f);
1302             maxAngle *= (CV_PI_F / 180.0f);
1303             angleStep *= (CV_PI_F / 180.0f);
1304
1305             const size_t smemSize = (angleRange + 1) * sizeof(int); // s_OHist bins are int
1306
1307             GHT_Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
1308                                                                minAngle, maxAngle, 1.0f / angleStep, angleRange);
1309             cudaSafeCall( cudaGetLastError() );
1310
1311             cudaSafeCall( cudaDeviceSynchronize() );
1312         }
1313
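        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_calcSHist
        //
        // Second voting stage: with the rotation angle fixed, image features
        // whose p1 orientation matches the rotated template orientation within
        // angleEpsilon vote for the scale estimate im_d12 / t_d12, binned over
        // [minScale, maxScale] in a shared-memory histogram before being merged
        // into the global SHist.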
1314         __global__ void GHT_Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
1315                                                 const float angle, const float angleEpsilon,
1316                                                 const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
1317         {
1318             extern __shared__ int s_SHist[];
1319             for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
1320                 s_SHist[i] = 0;
1321             __syncthreads();
1322
1323             const int tIdx = blockIdx.x;
1324             const int level = blockIdx.y;
1325
1326             const int tSize = templSizes[level];
1327
1328             if (tIdx < tSize)
1329             {
1330                 const int imSize = imageSizes[level];
1331
1332                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
1333                 const float t_d12 = TemplFeatureTable::d12(level)[tIdx]; // d12 is a distance, not an angle: do not rotate it
1334
1335                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
1336                 {
1337                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
1338                     const float im_d12 = ImageFeatureTable::d12(level)[i];
1339
1340                     if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
1341                     {
1342                         const float scale = im_d12 / t_d12;
1343
1344                         if (scale >= minScale && scale <= maxScale)
1345                         {
1346                             const int s = __float2int_rn((scale - minScale) * iScaleStep);
1347                             Emulation::smem::atomicAdd(&s_SHist[s], 1);
1348                         }
1349                     }
1350                 }
1351             }
1352             __syncthreads();
1353
1354             for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
1355                 ::atomicAdd(SHist + i, s_SHist[i]);
1356         }
1357
1358         void GHT_Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
1359                                          float angle, float angleEpsilon,
1360                                          float minScale, float maxScale, float iScaleStep, int scaleRange,
1361                                          int levels, int tMaxSize)
1362         {
1363             const dim3 block(256);
1364             const dim3 grid(tMaxSize, levels + 1);
1365
1366             angle *= (CV_PI_F / 180.0f);
1367             angleEpsilon *= (CV_PI_F / 180.0f);
1368
1369             const size_t smemSize = (scaleRange + 1) * sizeof(int); // s_SHist bins are int
1370
1371             GHT_Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
1372                                                                angle, angleEpsilon,
1373                                                                minScale, maxScale, iScaleStep, scaleRange);
1374             cudaSafeCall( cudaGetLastError() );
1375
1376             cudaSafeCall( cudaDeviceSynchronize() );
1377         }
1378
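        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_calcPHist
        //
        // Third voting stage: with angle and scale fixed, the template
        // displacement vectors r1 and r2 are scaled and rotated, and each
        // matching image feature proposes a reference-point candidate from both
        // feature endpoints (c1 and c2, in dp-sized accumulator cells). A vote
        // is cast only if the two candidates agree to within one cell; the
        // histogram keeps a one-cell border, hence the +1 offsets and the
        // "- 2" bounds.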
1379         __global__ void GHT_Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
1380                                                 const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
1381                                                 const float idp)
1382         {
1383             const int tIdx = blockIdx.x;
1384             const int level = blockIdx.y;
1385
1386             const int tSize = templSizes[level];
1387
1388             if (tIdx < tSize)
1389             {
1390                 const int imSize = imageSizes[level];
1391
1392                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
1393
1394                 float2 r1 = TemplFeatureTable::r1(level)[tIdx];
1395                 float2 r2 = TemplFeatureTable::r2(level)[tIdx];
1396
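                // bring the template displacement vectors into image space:
                // first scale them, then rotate them by the candidate angle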
1397                 r1 = r1 * scale;
1398                 r2 = r2 * scale;
1399
1400                 r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
1401                 r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);
1402
1403                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
1404                 {
1405                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
1406
1407                     const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
1408                     const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];
1409
1410                     if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
1411                     {
1412                         float2 c1, c2;
1413
1414                         c1 = im_p1_pos - r1;
1415                         c1 = c1 * idp;
1416
1417                         c2 = im_p2_pos - r2;
1418                         c2 = c2 * idp;
1419
1420                         if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
1421                             continue;
1422
1423                         if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
1424                             ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
1425                     }
1426                 }
1427             }
1428         }
1429
1430         void GHT_Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
1431                                          float angle, float angleEpsilon, float scale,
1432                                          float dp,
1433                                          int levels, int tMaxSize)
1434         {
1435             const dim3 block(256);
1436             const dim3 grid(tMaxSize, levels + 1);
1437
1438             angle *= (CV_PI_F / 180.0f);
1439             angleEpsilon *= (CV_PI_F / 180.0f);
1440
1441             const float sinVal = ::sinf(angle);
1442             const float cosVal = ::cosf(angle);
1443
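            // this kernel uses no shared memory, so ask for the larger L1 cache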
1444             cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_calcPHist, cudaFuncCachePreferL1) );
1445
1446             GHT_Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
1447                                                      angle, sinVal, cosVal, angleEpsilon, scale,
1448                                                      1.0f / dp);
1449             cudaSafeCall( cudaGetLastError() );
1450
1451             cudaSafeCall( cudaDeviceSynchronize() );
1452         }
1453
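        ////////////////////////////////////////////////////////////////////////
        // GHT_Guil_Full_findPosInHist
        //
        // Extracts local maxima from the bordered position histogram: a cell
        // must exceed the vote threshold and its four neighbours (strictly on
        // two sides, non-strictly on the other two, to break ties between equal
        // cells). Maxima are appended through the global counter g_counter,
        // recording the position (x * dp, y * dp), scale and angle, plus the
        // position, scale and angle vote counts.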
1454         __global__ void GHT_Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
1455                                                     const float angle, const int angleVotes, const float scale, const int scaleVotes,
1456                                                     const float dp, const int threshold)
1457         {
1458             const int x = blockIdx.x * blockDim.x + threadIdx.x;
1459             const int y = blockIdx.y * blockDim.y + threadIdx.y;
1460
1461             if (x >= hist.cols - 2 || y >= hist.rows - 2)
1462                 return;
1463
1464             const int curVotes = hist(y + 1, x + 1);
1465
1466             if (curVotes > threshold &&
1467                 curVotes >  hist(y + 1, x) &&
1468                 curVotes >= hist(y + 1, x + 2) &&
1469                 curVotes >  hist(y, x + 1) &&
1470                 curVotes >= hist(y + 2, x + 1))
1471             {
1472                 const int ind = ::atomicAdd(&g_counter, 1);
1473
1474                 if (ind < maxSize)
1475                 {
1476                     out[ind] = make_float4(x * dp, y * dp, scale, angle);
1477                     votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
1478                 }
1479             }
1480         }
1481
1482         int GHT_Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
1483                                              float angle, int angleVotes, float scale, int scaleVotes,
1484                                              float dp, int threshold)
1485         {
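            // seed the global counter with curSize so newly found maxima are
            // appended after the candidates already stored in out/votes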
1486             void* counterPtr;
1487             cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
1488
1489             cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );
1490
1491             const dim3 block(32, 8);
1492             const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
1493
1494             cudaSafeCall( cudaFuncSetCacheConfig(GHT_Guil_Full_findPosInHist, cudaFuncCachePreferL1) );
1495
1496             GHT_Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
1497                                                          angle, angleVotes, scale, scaleVotes,
1498                                                          dp, threshold);
1499             cudaSafeCall( cudaGetLastError() );
1500
1501             cudaSafeCall( cudaDeviceSynchronize() );
1502
1503             int totalCount;
1504             cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
1505
1506             totalCount = ::min(totalCount, maxSize);
1507
1508             return totalCount;
1509         }
1510     }
1511 }}}
1512
1513
1514 #endif /* CUDA_DISABLER */