modules/gpu/src/cuda/surf.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 // Copyright (c) 2010, Paul Furgale, Chi Hay Tong
  42 //
  43 // The original code was written by Paul Furgale and Chi Hay Tong
  44 // and later optimized and prepared for integration into OpenCV by Itseez.
  45 //
  46 //M*/
  47
  48 #if !defined CUDA_DISABLER
  49
  50 #include "opencv2/gpu/device/common.hpp"
  51 #include "opencv2/gpu/device/limits.hpp"
  52 #include "opencv2/gpu/device/saturate_cast.hpp"
  53 #include "opencv2/gpu/device/reduce.hpp"
  54 #include "opencv2/gpu/device/utility.hpp"
  55 #include "opencv2/gpu/device/functional.hpp"
  56 #include "opencv2/gpu/device/filters.hpp"
  57
  58 namespace cv { namespace gpu { namespace device
  59 {
  60     namespace surf
  61     {
  62         ////////////////////////////////////////////////////////////////////////
  63         // Global parameters
  64
  65         // The maximum number of features (before subpixel interpolation) that memory is reserved for.
  66         __constant__ int c_max_candidates;
  67         // The maximum number of features that memory is reserved for.
  68         __constant__ int c_max_features;
  69         // The image size.
  70         __constant__ int c_img_rows;
  71         __constant__ int c_img_cols;
  72         // The number of layers.
  73         __constant__ int c_nOctaveLayers;
  74         // The hessian threshold.
  75         __constant__ float c_hessianThreshold;
  76
  77         // The current octave.
  78         __constant__ int c_octave;
  79         // The current layer size.
  80         __constant__ int c_layer_rows;
  81         __constant__ int c_layer_cols;
  82
  83         void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
  84         {
  85             cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );
  86             cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );
  87             cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );
  88             cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );
  89             cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );
  90             cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );
  91         }
  92
  93         void loadOctaveConstants(int octave, int layer_rows, int layer_cols)
  94         {
  95             cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );
  96             cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );
  97             cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
  98         }
  99
 100         ////////////////////////////////////////////////////////////////////////
 101         // Integral image texture
 102
 103         texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
 104         texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
 105         texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
 106
 107         void bindImgTex(PtrStepSzb img)
 108         {
 109             bindTexture(&imgTex, img);
 110         }
 111
 112         size_t bindSumTex(PtrStepSz<uint> sum)
 113         {
 114             size_t offset;
 115             cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
 116             cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
 117             return offset / sizeof(uint);
 118         }
 119         size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
 120         {
 121             size_t offset;
 122             cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
 123             cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
 124             return offset / sizeof(uint);
 125         }
 126
 127         template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
 128         {
 129         #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
 130             typedef double real_t;
 131         #else
 132             typedef float  real_t;
 133         #endif
 134
 135             float ratio = (float)newSize / oldSize;
 136
 137             real_t d = 0;
 138
 139             #pragma unroll
 140             for (int k = 0; k < N; ++k)
 141             {
 142                 int dx1 = __float2int_rn(ratio * src[k][0]);
 143                 int dy1 = __float2int_rn(ratio * src[k][1]);
 144                 int dx2 = __float2int_rn(ratio * src[k][2]);
 145                 int dy2 = __float2int_rn(ratio * src[k][3]);
 146
 147                 real_t t = 0;
 148                 t += tex2D(sumTex, x + dx1, y + dy1);
 149                 t -= tex2D(sumTex, x + dx1, y + dy2);
 150                 t -= tex2D(sumTex, x + dx2, y + dy1);
 151                 t += tex2D(sumTex, x + dx2, y + dy2);
 152
 153                 d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
 154             }
 155
 156             return (float)d;
 157         }
 158
 159         ////////////////////////////////////////////////////////////////////////
 160         // Hessian
 161
 162         __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
 163         __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
 164         __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
 165
 166         __host__ __device__ __forceinline__ int calcSize(int octave, int layer)
 167         {
 168             /* Wavelet size at first layer of first octave. */
 169             const int HAAR_SIZE0 = 9;
 170
 171             /* Wavelet size increment between layers. This should be an even number,
 172              such that the wavelet sizes in an octave are either all even or all odd.
 173              This ensures that when looking for the neighbours of a sample, the layers
 174              above and below are aligned correctly. */
 175             const int HAAR_SIZE_INC = 6;
 176
 177             return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 178         }
 179
 180         __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
 181         {
 182             // Determine the indices
 183             const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
 184             const int blockIdx_y = blockIdx.y % gridDim_y;
 185             const int blockIdx_z = blockIdx.y / gridDim_y;
 186
 187             const int j = threadIdx.x + blockIdx.x * blockDim.x;
 188             const int i = threadIdx.y + blockIdx_y * blockDim.y;
 189             const int layer = blockIdx_z;
 190
 191             const int size = calcSize(c_octave, layer);
 192
 193             const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
 194             const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
 195
 196             // Ignore pixels where some of the kernel is outside the image
 197             const int margin = (size >> 1) >> c_octave;
 198
 199             if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
 200             {
 201                 const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
 202                 const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
 203                 const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
 204
 205                 det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
 206                 trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
 207             }
 208         }
 209
 210         void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
 211             int octave, int nOctaveLayers)
 212         {
 213             const int min_size = calcSize(octave, 0);
 214             const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
 215             const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
 216
 217             dim3 threads(16, 16);
 218
 219             dim3 grid;
 220             grid.x = divUp(max_samples_j, threads.x);
 221             grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
 222
 223             icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
 224             cudaSafeCall( cudaGetLastError() );
 225
 226             cudaSafeCall( cudaDeviceSynchronize() );
 227         }
 228
 229         ////////////////////////////////////////////////////////////////////////
 230         // NONMAX
 231
 232         __constant__ float c_DM[5] = {0, 0, 9, 9, 1};
 233
 234         struct WithMask
 235         {
 236             static __device__ bool check(int sum_i, int sum_j, int size)
 237             {
 238                 float ratio = (float)size / 9.0f;
 239
 240                 float d = 0;
 241
 242                 int dx1 = __float2int_rn(ratio * c_DM[0]);
 243                 int dy1 = __float2int_rn(ratio * c_DM[1]);
 244                 int dx2 = __float2int_rn(ratio * c_DM[2]);
 245                 int dy2 = __float2int_rn(ratio * c_DM[3]);
 246
 247                 float t = 0;
 248                 t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
 249                 t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
 250                 t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
 251                 t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
 252
 253                 d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
 254
 255                 return (d >= 0.5f);
 256             }
 257         };
 258
 259         template <typename Mask>
 260         __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
 261             unsigned int* maxCounter)
 262         {
 263             #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
 264
 265             extern __shared__ float N9[];
 266
 267             // The hidx variables are the indices to the hessian buffer.
 268             const int gridDim_y = gridDim.y / c_nOctaveLayers;
 269             const int blockIdx_y = blockIdx.y % gridDim_y;
 270             const int blockIdx_z = blockIdx.y / gridDim_y;
 271
 272             const int layer = blockIdx_z + 1;
 273
 274             const int size = calcSize(c_octave, layer);
 275
 276             // Ignore pixels without a 3x3x3 neighbourhood in the layer above
 277             const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
 278
 279             const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
 280             const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
 281
 282             // Is this thread within the hessian buffer?
 283             const int zoff = blockDim.x * blockDim.y;
 284             const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
 285             N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
 286             N9[localLin       ] = det.ptr(c_layer_rows * (layer    ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
 287             N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
 288             __syncthreads();
 289
 290             if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
 291             {
 292                 float val0 = N9[localLin];
 293
 294                 if (val0 > c_hessianThreshold)
 295                 {
 296                     // Coordinates for the start of the wavelet in the sum image. There
 297                     // is some integer division involved, so don't try to simplify this
 298                     // (cancel out sampleStep) without checking the result is the same
 299                     const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
 300                     const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
 301
 302                     if (Mask::check(sum_i, sum_j, size))
 303                     {
 304                         // Check to see if we have a max (in its 26 neighbours)
 305                         const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
 306                         &&                   val0 > N9[localLin     - blockDim.x - zoff]
 307                         &&                   val0 > N9[localLin + 1 - blockDim.x - zoff]
 308                         &&                   val0 > N9[localLin - 1              - zoff]
 309                         &&                   val0 > N9[localLin                  - zoff]
 310                         &&                   val0 > N9[localLin + 1              - zoff]
 311                         &&                   val0 > N9[localLin - 1 + blockDim.x - zoff]
 312                         &&                   val0 > N9[localLin     + blockDim.x - zoff]
 313                         &&                   val0 > N9[localLin + 1 + blockDim.x - zoff]
 314
 315                         &&                   val0 > N9[localLin - 1 - blockDim.x]
 316                         &&                   val0 > N9[localLin     - blockDim.x]
 317                         &&                   val0 > N9[localLin + 1 - blockDim.x]
 318                         &&                   val0 > N9[localLin - 1             ]
 319                         &&                   val0 > N9[localLin + 1             ]
 320                         &&                   val0 > N9[localLin - 1 + blockDim.x]
 321                         &&                   val0 > N9[localLin     + blockDim.x]
 322                         &&                   val0 > N9[localLin + 1 + blockDim.x]
 323
 324                         &&                   val0 > N9[localLin - 1 - blockDim.x + zoff]
 325                         &&                   val0 > N9[localLin     - blockDim.x + zoff]
 326                         &&                   val0 > N9[localLin + 1 - blockDim.x + zoff]
 327                         &&                   val0 > N9[localLin - 1              + zoff]
 328                         &&                   val0 > N9[localLin                  + zoff]
 329                         &&                   val0 > N9[localLin + 1              + zoff]
 330                         &&                   val0 > N9[localLin - 1 + blockDim.x + zoff]
 331                         &&                   val0 > N9[localLin     + blockDim.x + zoff]
 332                         &&                   val0 > N9[localLin + 1 + blockDim.x + zoff]
 333                         ;
 334
 335                         if(condmax)
 336                         {
 337                             unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
 338
 339                             if (ind < c_max_candidates)
 340                             {
 341                                 const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
 342
 343                                 maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
 344                             }
 345                         }
 346                     }
 347                 }
 348             }
 349
 350             #endif
 351         }
 352
 353         void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
 354             int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
 355         {
 356             const int layer_rows = img_rows >> octave;
 357             const int layer_cols = img_cols >> octave;
 358
 359             const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
 360
 361             dim3 threads(16, 16);
 362
 363             dim3 grid;
 364             grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
 365             grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
 366
 367             const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
 368
 369             if (use_mask)
 370                 icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
 371             else
 372                 icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
 373
 374             cudaSafeCall( cudaGetLastError() );
 375
 376             cudaSafeCall( cudaDeviceSynchronize() );
 377         }
 378
 379         ////////////////////////////////////////////////////////////////////////
 380         // INTERPOLATION
 381
 382         __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
 383             float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
 384             unsigned int* featureCounter)
 385         {
 386             #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
 387
 388             const int4 maxPos = maxPosBuffer[blockIdx.x];
 389
 390             const int j = maxPos.x - 1 + threadIdx.x;
 391             const int i = maxPos.y - 1 + threadIdx.y;
 392             const int layer = maxPos.z - 1 + threadIdx.z;
 393
 394             __shared__ float N9[3][3][3];
 395
 396             N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
 397             __syncthreads();
 398
 399             if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
 400             {
 401                 __shared__ float dD[3];
 402
 403                 //dx
 404                 dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
 405                 //dy
 406                 dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
 407                 //ds
 408                 dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
 409
 410                 __shared__ float H[3][3];
 411
 412                 //dxx
 413                 H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
 414                 //dxy
 415                 H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
 416                 //dxs
 417                 H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
 418                 //dyx = dxy
 419                 H[1][0] = H[0][1];
 420                 //dyy
 421                 H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
 422                 //dys
 423                 H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
 424                 //dsx = dxs
 425                 H[2][0] = H[0][2];
 426                 //dsy = dys
 427                 H[2][1] = H[1][2];
 428                 //dss
 429                 H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
 430
 431                 __shared__ float x[3];
 432
 433                 if (solve3x3(H, dD, x))
 434                 {
 435                     if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
 436                     {
 437                         // if the step is within the interpolation region, perform it
 438
 439                         const int size = calcSize(c_octave, maxPos.z);
 440
 441                         const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
 442                         const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
 443
 444                         const float center_i = sum_i + (float)(size - 1) / 2;
 445                         const float center_j = sum_j + (float)(size - 1) / 2;
 446
 447                         const float px = center_j + x[0] * (1 << c_octave);
 448                         const float py = center_i + x[1] * (1 << c_octave);
 449
 450                         const int ds = size - calcSize(c_octave, maxPos.z - 1);
 451                         const float psize = roundf(size + x[2] * ds);
 452
 453                         /* The sampling intervals and wavelet sized for selecting an orientation
 454                          and building the keypoint descriptor are defined relative to 's' */
 455                         const float s = psize * 1.2f / 9.0f;
 456
 457                         /* To find the dominant orientation, the gradients in x and y are
 458                          sampled in a circle of radius 6s using wavelets of size 4s.
 459                          We ensure the gradient wavelet size is even to ensure the
 460                          wavelet pattern is balanced and symmetric around its center */
 461                         const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
 462
 463                         // check when grad_wav_size is too big
 464                         if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
 465                         {
 466                             // Get a new feature index.
 467                             unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
 468
 469                             if (ind < c_max_features)
 470                             {
 471                                 featureX[ind] = px;
 472                                 featureY[ind] = py;
 473                                 featureLaplacian[ind] = maxPos.w;
 474                                 featureOctave[ind] = c_octave;
 475                                 featureSize[ind] = psize;
 476                                 featureHessian[ind] = N9[1][1][1];
 477                             }
 478                         } // grad_wav_size check
 479                     } // If the subpixel interpolation worked
 480                 }
 481             } // If this is thread 0.
 482
 483             #endif
 484         }
 485
 486         void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
 487             float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
 488             unsigned int* featureCounter)
 489         {
 490             dim3 threads;
 491             threads.x = 3;
 492             threads.y = 3;
 493             threads.z = 3;
 494
 495             dim3 grid;
 496             grid.x = maxCounter;
 497
 498             icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureOctave, featureSize, featureHessian, featureCounter);
 499             cudaSafeCall( cudaGetLastError() );
 500
 501             cudaSafeCall( cudaDeviceSynchronize() );
 502         }
 503
 504         ////////////////////////////////////////////////////////////////////////
 505         // Orientation
 506
 507         #define ORI_SEARCH_INC 5
 508         #define ORI_WIN        60
 509         #define ORI_SAMPLES    113
 510
 511         __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
 512         __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
 513         __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
 514
 515         __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
 516         __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
 517
 518         __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
 519         {
 520             __shared__ float s_X[128];
 521             __shared__ float s_Y[128];
 522             __shared__ float s_angle[128];
 523
 524             __shared__ float s_sumx[32 * 4];
 525             __shared__ float s_sumy[32 * 4];
 526
 527             /* The sampling intervals and wavelet sized for selecting an orientation
 528              and building the keypoint descriptor are defined relative to 's' */
 529             const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
 530
 531             /* To find the dominant orientation, the gradients in x and y are
 532              sampled in a circle of radius 6s using wavelets of size 4s.
 533              We ensure the gradient wavelet size is even to ensure the
 534              wavelet pattern is balanced and symmetric around its center */
 535             const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
 536
 537             // check when grad_wav_size is too big
 538             if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
 539                 return;
 540
 541             // Calc X, Y, angle and store it to shared memory
 542             const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 543
 544             float X = 0.0f, Y = 0.0f, angle = 0.0f;
 545
 546             if (tid < ORI_SAMPLES)
 547             {
 548                 const float margin = (float)(grad_wav_size - 1) / 2.0f;
 549                 const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
 550                 const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
 551
 552                 if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
 553                     x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
 554                 {
 555                     X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
 556                     Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
 557
 558                     angle = atan2f(Y, X);
 559                     if (angle < 0)
 560                         angle += 2.0f * CV_PI_F;
 561                     angle *= 180.0f / CV_PI_F;
 562                 }
 563             }
 564             s_X[tid] = X;
 565             s_Y[tid] = Y;
 566             s_angle[tid] = angle;
 567             __syncthreads();
 568
 569             float bestx = 0, besty = 0, best_mod = 0;
 570
 571         #if __CUDA_ARCH__ >= 200
 572             #pragma unroll
 573         #endif
 574             for (int i = 0; i < 18; ++i)
 575             {
 576                 const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
 577
 578                 float sumx = 0.0f, sumy = 0.0f;
 579                 int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
 580                 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
 581                 {
 582                     sumx = s_X[threadIdx.x];
 583                     sumy = s_Y[threadIdx.x];
 584                 }
 585                 d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
 586                 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
 587                 {
 588                     sumx += s_X[threadIdx.x + 32];
 589                     sumy += s_Y[threadIdx.x + 32];
 590                 }
 591                 d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
 592                 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
 593                 {
 594                     sumx += s_X[threadIdx.x + 64];
 595                     sumy += s_Y[threadIdx.x + 64];
 596                 }
 597                 d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
 598                 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
 599                 {
 600                     sumx += s_X[threadIdx.x + 96];
 601                     sumy += s_Y[threadIdx.x + 96];
 602                 }
 603
 604                 plus<float> op;
 605                 device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
 606                                    thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
 607
 608                 const float temp_mod = sumx * sumx + sumy * sumy;
 609                 if (temp_mod > best_mod)
 610                 {
 611                     best_mod = temp_mod;
 612                     bestx = sumx;
 613                     besty = sumy;
 614                 }
 615
 616                 __syncthreads();
 617             }
 618
 619             if (threadIdx.x == 0)
 620             {
 621                 s_X[threadIdx.y] = bestx;
 622                 s_Y[threadIdx.y] = besty;
 623                 s_angle[threadIdx.y] = best_mod;
 624             }
 625             __syncthreads();
 626
 627             if (threadIdx.x == 0 && threadIdx.y == 0)
 628             {
 629                 int bestIdx = 0;
 630
 631                 if (s_angle[1] > s_angle[bestIdx])
 632                     bestIdx = 1;
 633                 if (s_angle[2] > s_angle[bestIdx])
 634                     bestIdx = 2;
 635                 if (s_angle[3] > s_angle[bestIdx])
 636                     bestIdx = 3;
 637
 638                 float kp_dir = atan2f(s_Y[bestIdx], s_X[bestIdx]);
 639                 if (kp_dir < 0)
 640                     kp_dir += 2.0f * CV_PI_F;
 641                 kp_dir *= 180.0f / CV_PI_F;
 642
 643                 kp_dir = 360.0f - kp_dir;
 644                 if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
 645                     kp_dir = 0.f;
 646
 647                 featureDir[blockIdx.x] = kp_dir;
 648             }
 649         }
 650
 651         #undef ORI_SEARCH_INC
 652         #undef ORI_WIN
 653         #undef ORI_SAMPLES
 654
 655         void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
 656         {
 657             dim3 threads;
 658             threads.x = 32;
 659             threads.y = 4;
 660
 661             dim3 grid;
 662             grid.x = nFeatures;
 663
 664             icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
 665             cudaSafeCall( cudaGetLastError() );
 666
 667             cudaSafeCall( cudaDeviceSynchronize() );
 668         }
 669
 670         ////////////////////////////////////////////////////////////////////////
 671         // Descriptors
 672
 673         #define PATCH_SZ 20
 674
 675         __constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
 676         {
 677             3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
 678             8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
 679             1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
 680             3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
 681             5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
 682             9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
 683             0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
 684             0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
 685             0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
 686             0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
 687             0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
 688             0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
 689             0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
 690             0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
 691             9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
 692             5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
 693             3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
 694             1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
 695             8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
 696             3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
 697         };
 698
 699         struct WinReader
 700         {
 701             typedef uchar elem_type;
 702
 703             __device__ __forceinline__ uchar operator ()(int i, int j) const
 704             {
 705                 float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
 706                 float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
 707
 708                 return tex2D(imgTex, pixel_x, pixel_y);
 709             }
 710
 711             float centerX;
 712             float centerY;
 713             float win_offset;
 714             float cos_dir;
 715             float sin_dir;
 716             int width;
 717             int height;
 718         };
 719
 720         __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
 721                                    float& dx, float& dy)
 722         {
 723             __shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
 724
 725             dx = dy = 0.0f;
 726
 727             WinReader win;
 728
 729             win.centerX = featureX[blockIdx.x];
 730             win.centerY = featureY[blockIdx.x];
 731
 732             // The sampling intervals and wavelet sized for selecting an orientation
 733             // and building the keypoint descriptor are defined relative to 's'
 734             const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
 735
 736             // Extract a window of pixels around the keypoint of size 20s
 737             const int win_size = (int)((PATCH_SZ + 1) * s);
 738
 739             win.width = win.height = win_size;
 740
 741             // Nearest neighbour version (faster)
 742             win.win_offset = -(win_size - 1.0f) / 2.0f;
 743
 744             float descriptor_dir = 360.0f - featureDir[blockIdx.x];
 745             if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
 746                 descriptor_dir = 0.f;
 747             descriptor_dir *= CV_PI_F / 180.0f;
 748             sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);
 749
 750             const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 751
 752             const int xLoadInd = tid % (PATCH_SZ + 1);
 753             const int yLoadInd = tid / (PATCH_SZ + 1);
 754
 755             if (yLoadInd < (PATCH_SZ + 1))
 756             {
 757                 if (s > 1)
 758                 {
 759                     AreaFilter<WinReader> filter(win, s, s);
 760                     s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
 761                 }
 762                 else
 763                 {
 764                     LinearFilter<WinReader> filter(win);
 765                     s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
 766                 }
 767             }
 768
 769             __syncthreads();
 770
 771             const int xPatchInd = threadIdx.x % 5;
 772             const int yPatchInd = threadIdx.x / 5;
 773
 774             if (yPatchInd < 5)
 775             {
 776                 const int xBlockInd = threadIdx.y % 4;
 777                 const int yBlockInd = threadIdx.y / 4;
 778
 779                 const int xInd = xBlockInd * 5 + xPatchInd;
 780                 const int yInd = yBlockInd * 5 + yPatchInd;
 781
 782                 const float dw = c_DW[yInd * PATCH_SZ + xInd];
 783
 784                 dx = (s_PATCH[yInd    ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd    ]) * dw;
 785                 dy = (s_PATCH[yInd + 1][xInd    ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd    ][xInd + 1]) * dw;
 786             }
 787         }
 788
 789         __global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
 790         {
 791             __shared__ float smem[32 * 16];
 792
 793             float* sRow = smem + threadIdx.y * 32;
 794
 795             float dx, dy;
 796             calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
 797
 798             float dxabs = ::fabsf(dx);
 799             float dyabs = ::fabsf(dy);
 800
 801             plus<float> op;
 802
 803             reduce<32>(sRow, dx, threadIdx.x, op);
 804             reduce<32>(sRow, dy, threadIdx.x, op);
 805             reduce<32>(sRow, dxabs, threadIdx.x, op);
 806             reduce<32>(sRow, dyabs, threadIdx.x, op);
 807
 808             float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;
 809
 810             // write dx, dy, |dx|, |dy|
 811             if (threadIdx.x == 0)
 812                 *descriptors_block = make_float4(dx, dy, dxabs, dyabs);
 813         }
 814
 815         __global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
 816         {
 817             __shared__ float smem[32 * 16];
 818
 819             float* sRow = smem + threadIdx.y * 32;
 820
 821             float dx, dy;
 822             calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
 823
 824             float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
 825
 826             plus<float> op;
 827
 828             float d1 = 0.0f;
 829             float d2 = 0.0f;
 830             float abs1 = 0.0f;
 831             float abs2 = 0.0f;
 832
 833             if (dy >= 0)
 834             {
 835                 d1 = dx;
 836                 abs1 = ::fabsf(dx);
 837             }
 838             else
 839             {
 840                 d2 = dx;
 841                 abs2 = ::fabsf(dx);
 842             }
 843
 844             reduce<32>(sRow, d1, threadIdx.x, op);
 845             reduce<32>(sRow, d2, threadIdx.x, op);
 846             reduce<32>(sRow, abs1, threadIdx.x, op);
 847             reduce<32>(sRow, abs2, threadIdx.x, op);
 848
 849             // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
 850             if (threadIdx.x == 0)
 851                 descriptors_block[0] = make_float4(d1, abs1, d2, abs2);
 852
 853             if (dx >= 0)
 854             {
 855                 d1 = dy;
 856                 abs1 = ::fabsf(dy);
 857                 d2 = 0.0f;
 858                 abs2 = 0.0f;
 859             }
 860             else
 861             {
 862                 d1 = 0.0f;
 863                 abs1 = 0.0f;
 864                 d2 = dy;
 865                 abs2 = ::fabsf(dy);
 866             }
 867
 868             reduce<32>(sRow, d1, threadIdx.x, op);
 869             reduce<32>(sRow, d2, threadIdx.x, op);
 870             reduce<32>(sRow, abs1, threadIdx.x, op);
 871             reduce<32>(sRow, abs2, threadIdx.x, op);
 872
 873             // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
 874             if (threadIdx.x == 0)
 875                 descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
 876         }
 877
 878         template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
 879         {
 880             __shared__ float smem[BLOCK_DIM_X];
 881             __shared__ float s_len;
 882
 883             // no need for thread ID
 884             float* descriptor_base = descriptors.ptr(blockIdx.x);
 885
 886             // read in the unnormalized descriptor values (squared)
 887             const float val = descriptor_base[threadIdx.x];
 888
 889             float len = val * val;
 890             reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());
 891
 892             if (threadIdx.x == 0)
 893                 s_len = ::sqrtf(len);
 894
 895             __syncthreads();
 896
 897             // normalize and store in output
 898             descriptor_base[threadIdx.x] = val / s_len;
 899         }
 900
 901         void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
 902         {
 903             // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
 904
 905             if (descriptors.cols == 64)
 906             {
 907                 compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
 908                 cudaSafeCall( cudaGetLastError() );
 909
 910                 cudaSafeCall( cudaDeviceSynchronize() );
 911
 912                 normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
 913                 cudaSafeCall( cudaGetLastError() );
 914
 915                 cudaSafeCall( cudaDeviceSynchronize() );
 916             }
 917             else
 918             {
 919                 compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
 920                 cudaSafeCall( cudaGetLastError() );
 921
 922                 cudaSafeCall( cudaDeviceSynchronize() );
 923
 924                 normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
 925                 cudaSafeCall( cudaGetLastError() );
 926
 927                 cudaSafeCall( cudaDeviceSynchronize() );
 928             }
 929         }
 930     } // namespace surf
 931 }}} // namespace cv { namespace gpu { namespace device
 932
 933
 934 #endif /* CUDA_DISABLER */