1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
41 // Copyright (c) 2010, Paul Furgale, Chi Hay Tong
43 // The original code was written by Paul Furgale and Chi Hay Tong
44 // and later optimized and prepared for integration into OpenCV by Itseez.
48 #if !defined CUDA_DISABLER
50 #include "opencv2/gpu/device/common.hpp"
51 #include "opencv2/gpu/device/limits.hpp"
52 #include "opencv2/gpu/device/saturate_cast.hpp"
53 #include "opencv2/gpu/device/reduce.hpp"
54 #include "opencv2/gpu/device/utility.hpp"
55 #include "opencv2/gpu/device/functional.hpp"
56 #include "opencv2/gpu/device/filters.hpp"
58 namespace cv { namespace gpu { namespace device
62 ////////////////////////////////////////////////////////////////////////
65 // The maximum number of features (before subpixel interpolation) that memory is reserved for.
66 __constant__ int c_max_candidates;
67 // The maximum number of features that memory is reserved for.
68 __constant__ int c_max_features;
70 __constant__ int c_img_rows;
71 __constant__ int c_img_cols;
72 // The number of layers.
73 __constant__ int c_nOctaveLayers;
74 // The hessian threshold.
75 __constant__ float c_hessianThreshold;
77 // The current octave.
78 __constant__ int c_octave;
79 // The current layer size.
80 __constant__ int c_layer_rows;
81 __constant__ int c_layer_cols;
83 void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
85 cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );
86 cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );
87 cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );
88 cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );
89 cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );
90 cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );
93 void loadOctaveConstants(int octave, int layer_rows, int layer_cols)
95 cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );
96 cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );
97 cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
100 ////////////////////////////////////////////////////////////////////////
101 // Integral image texture
103 texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
104 texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
105 texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
107 void bindImgTex(PtrStepSzb img)
109 bindTexture(&imgTex, img);
112 size_t bindSumTex(PtrStepSz<uint> sum)
115 cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
116 cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
117 return offset / sizeof(uint);
119 size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
122 cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
123 cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
124 return offset / sizeof(uint);
127 template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
129 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
130 typedef double real_t;
132 typedef float real_t;
135 float ratio = (float)newSize / oldSize;
140 for (int k = 0; k < N; ++k)
142 int dx1 = __float2int_rn(ratio * src[k][0]);
143 int dy1 = __float2int_rn(ratio * src[k][1]);
144 int dx2 = __float2int_rn(ratio * src[k][2]);
145 int dy2 = __float2int_rn(ratio * src[k][3]);
148 t += tex2D(sumTex, x + dx1, y + dy1);
149 t -= tex2D(sumTex, x + dx1, y + dy2);
150 t -= tex2D(sumTex, x + dx2, y + dy1);
151 t += tex2D(sumTex, x + dx2, y + dy2);
153 d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
159 ////////////////////////////////////////////////////////////////////////
162 __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
163 __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
164 __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
166 __host__ __device__ __forceinline__ int calcSize(int octave, int layer)
168 /* Wavelet size at first layer of first octave. */
169 const int HAAR_SIZE0 = 9;
171 /* Wavelet size increment between layers. This should be an even number,
172 such that the wavelet sizes in an octave are either all even or all odd.
173 This ensures that when looking for the neighbours of a sample, the layers
174 above and below are aligned correctly. */
175 const int HAAR_SIZE_INC = 6;
177 return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
180 __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
182 // Determine the indices
183 const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
184 const int blockIdx_y = blockIdx.y % gridDim_y;
185 const int blockIdx_z = blockIdx.y / gridDim_y;
187 const int j = threadIdx.x + blockIdx.x * blockDim.x;
188 const int i = threadIdx.y + blockIdx_y * blockDim.y;
189 const int layer = blockIdx_z;
191 const int size = calcSize(c_octave, layer);
193 const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
194 const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
196 // Ignore pixels where some of the kernel is outside the image
197 const int margin = (size >> 1) >> c_octave;
199 if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
201 const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
202 const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
203 const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
205 det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
206 trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
210 void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
211 int octave, int nOctaveLayers)
213 const int min_size = calcSize(octave, 0);
214 const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
215 const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
217 dim3 threads(16, 16);
220 grid.x = divUp(max_samples_j, threads.x);
221 grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
223 icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
224 cudaSafeCall( cudaGetLastError() );
226 cudaSafeCall( cudaDeviceSynchronize() );
229 ////////////////////////////////////////////////////////////////////////
232 __constant__ float c_DM[5] = {0, 0, 9, 9, 1};
236 static __device__ bool check(int sum_i, int sum_j, int size)
238 float ratio = (float)size / 9.0f;
242 int dx1 = __float2int_rn(ratio * c_DM[0]);
243 int dy1 = __float2int_rn(ratio * c_DM[1]);
244 int dx2 = __float2int_rn(ratio * c_DM[2]);
245 int dy2 = __float2int_rn(ratio * c_DM[3]);
248 t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
249 t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
250 t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
251 t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
253 d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
259 template <typename Mask>
260 __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
261 unsigned int* maxCounter)
263 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
265 extern __shared__ float N9[];
267 // The hidx variables are the indices to the hessian buffer.
268 const int gridDim_y = gridDim.y / c_nOctaveLayers;
269 const int blockIdx_y = blockIdx.y % gridDim_y;
270 const int blockIdx_z = blockIdx.y / gridDim_y;
272 const int layer = blockIdx_z + 1;
274 const int size = calcSize(c_octave, layer);
276 // Ignore pixels without a 3x3x3 neighbourhood in the layer above
277 const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
279 const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
280 const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
282 // Is this thread within the hessian buffer?
283 const int zoff = blockDim.x * blockDim.y;
284 const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
285 N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
286 N9[localLin ] = det.ptr(c_layer_rows * (layer ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
287 N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
290 if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
292 float val0 = N9[localLin];
294 if (val0 > c_hessianThreshold)
296 // Coordinates for the start of the wavelet in the sum image. There
297 // is some integer division involved, so don't try to simplify this
298 // (cancel out sampleStep) without checking the result is the same
299 const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
300 const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
302 if (Mask::check(sum_i, sum_j, size))
304 // Check to see if we have a max (in its 26 neighbours)
305 const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
306 && val0 > N9[localLin - blockDim.x - zoff]
307 && val0 > N9[localLin + 1 - blockDim.x - zoff]
308 && val0 > N9[localLin - 1 - zoff]
309 && val0 > N9[localLin - zoff]
310 && val0 > N9[localLin + 1 - zoff]
311 && val0 > N9[localLin - 1 + blockDim.x - zoff]
312 && val0 > N9[localLin + blockDim.x - zoff]
313 && val0 > N9[localLin + 1 + blockDim.x - zoff]
315 && val0 > N9[localLin - 1 - blockDim.x]
316 && val0 > N9[localLin - blockDim.x]
317 && val0 > N9[localLin + 1 - blockDim.x]
318 && val0 > N9[localLin - 1 ]
319 && val0 > N9[localLin + 1 ]
320 && val0 > N9[localLin - 1 + blockDim.x]
321 && val0 > N9[localLin + blockDim.x]
322 && val0 > N9[localLin + 1 + blockDim.x]
324 && val0 > N9[localLin - 1 - blockDim.x + zoff]
325 && val0 > N9[localLin - blockDim.x + zoff]
326 && val0 > N9[localLin + 1 - blockDim.x + zoff]
327 && val0 > N9[localLin - 1 + zoff]
328 && val0 > N9[localLin + zoff]
329 && val0 > N9[localLin + 1 + zoff]
330 && val0 > N9[localLin - 1 + blockDim.x + zoff]
331 && val0 > N9[localLin + blockDim.x + zoff]
332 && val0 > N9[localLin + 1 + blockDim.x + zoff]
337 unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
339 if (ind < c_max_candidates)
341 const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
343 maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
353 void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
354 int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
356 const int layer_rows = img_rows >> octave;
357 const int layer_cols = img_cols >> octave;
359 const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
361 dim3 threads(16, 16);
364 grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
365 grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
367 const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
370 icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
372 icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
374 cudaSafeCall( cudaGetLastError() );
376 cudaSafeCall( cudaDeviceSynchronize() );
379 ////////////////////////////////////////////////////////////////////////
382 __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
383 float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
384 unsigned int* featureCounter)
386 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
388 const int4 maxPos = maxPosBuffer[blockIdx.x];
390 const int j = maxPos.x - 1 + threadIdx.x;
391 const int i = maxPos.y - 1 + threadIdx.y;
392 const int layer = maxPos.z - 1 + threadIdx.z;
394 __shared__ float N9[3][3][3];
396 N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
399 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
401 __shared__ float dD[3];
404 dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
406 dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
408 dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
410 __shared__ float H[3][3];
413 H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
415 H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
417 H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
421 H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
423 H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
429 H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
431 __shared__ float x[3];
433 if (solve3x3(H, dD, x))
435 if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
437 // if the step is within the interpolation region, perform it
439 const int size = calcSize(c_octave, maxPos.z);
441 const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
442 const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
444 const float center_i = sum_i + (float)(size - 1) / 2;
445 const float center_j = sum_j + (float)(size - 1) / 2;
447 const float px = center_j + x[0] * (1 << c_octave);
448 const float py = center_i + x[1] * (1 << c_octave);
450 const int ds = size - calcSize(c_octave, maxPos.z - 1);
451 const float psize = roundf(size + x[2] * ds);
453 /* The sampling intervals and wavelet sized for selecting an orientation
454 and building the keypoint descriptor are defined relative to 's' */
455 const float s = psize * 1.2f / 9.0f;
457 /* To find the dominant orientation, the gradients in x and y are
458 sampled in a circle of radius 6s using wavelets of size 4s.
459 We ensure the gradient wavelet size is even to ensure the
460 wavelet pattern is balanced and symmetric around its center */
461 const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
463 // check when grad_wav_size is too big
464 if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
466 // Get a new feature index.
467 unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
469 if (ind < c_max_features)
473 featureLaplacian[ind] = maxPos.w;
474 featureOctave[ind] = c_octave;
475 featureSize[ind] = psize;
476 featureHessian[ind] = N9[1][1][1];
478 } // grad_wav_size check
479 } // If the subpixel interpolation worked
481 } // If this is thread 0.
486 void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
487 float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
488 unsigned int* featureCounter)
498 icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureOctave, featureSize, featureHessian, featureCounter);
499 cudaSafeCall( cudaGetLastError() );
501 cudaSafeCall( cudaDeviceSynchronize() );
504 ////////////////////////////////////////////////////////////////////////
507 #define ORI_SEARCH_INC 5
509 #define ORI_SAMPLES 113
511 __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
512 __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
513 __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
515 __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
516 __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
518 __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
520 __shared__ float s_X[128];
521 __shared__ float s_Y[128];
522 __shared__ float s_angle[128];
524 __shared__ float s_sumx[32 * 4];
525 __shared__ float s_sumy[32 * 4];
527 /* The sampling intervals and wavelet sized for selecting an orientation
528 and building the keypoint descriptor are defined relative to 's' */
529 const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
531 /* To find the dominant orientation, the gradients in x and y are
532 sampled in a circle of radius 6s using wavelets of size 4s.
533 We ensure the gradient wavelet size is even to ensure the
534 wavelet pattern is balanced and symmetric around its center */
535 const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
537 // check when grad_wav_size is too big
538 if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
541 // Calc X, Y, angle and store it to shared memory
542 const int tid = threadIdx.y * blockDim.x + threadIdx.x;
544 float X = 0.0f, Y = 0.0f, angle = 0.0f;
546 if (tid < ORI_SAMPLES)
548 const float margin = (float)(grad_wav_size - 1) / 2.0f;
549 const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
550 const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
552 if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
553 x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
555 X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
556 Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
558 angle = atan2f(Y, X);
560 angle += 2.0f * CV_PI_F;
561 angle *= 180.0f / CV_PI_F;
566 s_angle[tid] = angle;
569 float bestx = 0, besty = 0, best_mod = 0;
571 #if __CUDA_ARCH__ >= 200
574 for (int i = 0; i < 18; ++i)
576 const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
578 float sumx = 0.0f, sumy = 0.0f;
579 int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
580 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
582 sumx = s_X[threadIdx.x];
583 sumy = s_Y[threadIdx.x];
585 d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
586 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
588 sumx += s_X[threadIdx.x + 32];
589 sumy += s_Y[threadIdx.x + 32];
591 d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
592 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
594 sumx += s_X[threadIdx.x + 64];
595 sumy += s_Y[threadIdx.x + 64];
597 d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
598 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
600 sumx += s_X[threadIdx.x + 96];
601 sumy += s_Y[threadIdx.x + 96];
605 device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
606 thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
608 const float temp_mod = sumx * sumx + sumy * sumy;
609 if (temp_mod > best_mod)
619 if (threadIdx.x == 0)
621 s_X[threadIdx.y] = bestx;
622 s_Y[threadIdx.y] = besty;
623 s_angle[threadIdx.y] = best_mod;
627 if (threadIdx.x == 0 && threadIdx.y == 0)
631 if (s_angle[1] > s_angle[bestIdx])
633 if (s_angle[2] > s_angle[bestIdx])
635 if (s_angle[3] > s_angle[bestIdx])
638 float kp_dir = atan2f(s_Y[bestIdx], s_X[bestIdx]);
640 kp_dir += 2.0f * CV_PI_F;
641 kp_dir *= 180.0f / CV_PI_F;
643 kp_dir = 360.0f - kp_dir;
644 if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
647 featureDir[blockIdx.x] = kp_dir;
651 #undef ORI_SEARCH_INC
655 void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
664 icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
665 cudaSafeCall( cudaGetLastError() );
667 cudaSafeCall( cudaDeviceSynchronize() );
670 ////////////////////////////////////////////////////////////////////////
675 __constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
677 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
678 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
679 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
680 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
681 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
682 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
683 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
684 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
685 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
686 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
687 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
688 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
689 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
690 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
691 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
692 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
693 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
694 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
695 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
696 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
701 typedef uchar elem_type;
703 __device__ __forceinline__ uchar operator ()(int i, int j) const
705 float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
706 float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
708 return tex2D(imgTex, pixel_x, pixel_y);
720 __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
721 float& dx, float& dy)
723 __shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
729 win.centerX = featureX[blockIdx.x];
730 win.centerY = featureY[blockIdx.x];
732 // The sampling intervals and wavelet sized for selecting an orientation
733 // and building the keypoint descriptor are defined relative to 's'
734 const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
736 // Extract a window of pixels around the keypoint of size 20s
737 const int win_size = (int)((PATCH_SZ + 1) * s);
739 win.width = win.height = win_size;
741 // Nearest neighbour version (faster)
742 win.win_offset = -(win_size - 1.0f) / 2.0f;
744 float descriptor_dir = 360.0f - featureDir[blockIdx.x];
745 if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
746 descriptor_dir = 0.f;
747 descriptor_dir *= CV_PI_F / 180.0f;
748 sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);
750 const int tid = threadIdx.y * blockDim.x + threadIdx.x;
752 const int xLoadInd = tid % (PATCH_SZ + 1);
753 const int yLoadInd = tid / (PATCH_SZ + 1);
755 if (yLoadInd < (PATCH_SZ + 1))
759 AreaFilter<WinReader> filter(win, s, s);
760 s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
764 LinearFilter<WinReader> filter(win);
765 s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
771 const int xPatchInd = threadIdx.x % 5;
772 const int yPatchInd = threadIdx.x / 5;
776 const int xBlockInd = threadIdx.y % 4;
777 const int yBlockInd = threadIdx.y / 4;
779 const int xInd = xBlockInd * 5 + xPatchInd;
780 const int yInd = yBlockInd * 5 + yPatchInd;
782 const float dw = c_DW[yInd * PATCH_SZ + xInd];
784 dx = (s_PATCH[yInd ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd ]) * dw;
785 dy = (s_PATCH[yInd + 1][xInd ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd ][xInd + 1]) * dw;
789 __global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
791 __shared__ float smem[32 * 16];
793 float* sRow = smem + threadIdx.y * 32;
796 calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
798 float dxabs = ::fabsf(dx);
799 float dyabs = ::fabsf(dy);
803 reduce<32>(sRow, dx, threadIdx.x, op);
804 reduce<32>(sRow, dy, threadIdx.x, op);
805 reduce<32>(sRow, dxabs, threadIdx.x, op);
806 reduce<32>(sRow, dyabs, threadIdx.x, op);
808 float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;
810 // write dx, dy, |dx|, |dy|
811 if (threadIdx.x == 0)
812 *descriptors_block = make_float4(dx, dy, dxabs, dyabs);
815 __global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
817 __shared__ float smem[32 * 16];
819 float* sRow = smem + threadIdx.y * 32;
822 calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
824 float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
844 reduce<32>(sRow, d1, threadIdx.x, op);
845 reduce<32>(sRow, d2, threadIdx.x, op);
846 reduce<32>(sRow, abs1, threadIdx.x, op);
847 reduce<32>(sRow, abs2, threadIdx.x, op);
849 // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
850 if (threadIdx.x == 0)
851 descriptors_block[0] = make_float4(d1, abs1, d2, abs2);
868 reduce<32>(sRow, d1, threadIdx.x, op);
869 reduce<32>(sRow, d2, threadIdx.x, op);
870 reduce<32>(sRow, abs1, threadIdx.x, op);
871 reduce<32>(sRow, abs2, threadIdx.x, op);
873 // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
874 if (threadIdx.x == 0)
875 descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
878 template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
880 __shared__ float smem[BLOCK_DIM_X];
881 __shared__ float s_len;
883 // no need for thread ID
884 float* descriptor_base = descriptors.ptr(blockIdx.x);
886 // read in the unnormalized descriptor values (squared)
887 const float val = descriptor_base[threadIdx.x];
889 float len = val * val;
890 reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());
892 if (threadIdx.x == 0)
893 s_len = ::sqrtf(len);
897 // normalize and store in output
898 descriptor_base[threadIdx.x] = val / s_len;
901 void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
903 // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
905 if (descriptors.cols == 64)
907 compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
908 cudaSafeCall( cudaGetLastError() );
910 cudaSafeCall( cudaDeviceSynchronize() );
912 normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
913 cudaSafeCall( cudaGetLastError() );
915 cudaSafeCall( cudaDeviceSynchronize() );
919 compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
920 cudaSafeCall( cudaGetLastError() );
922 cudaSafeCall( cudaDeviceSynchronize() );
924 normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
925 cudaSafeCall( cudaGetLastError() );
927 cudaSafeCall( cudaDeviceSynchronize() );
931 }}} // namespace cv { namespace gpu { namespace device
934 #endif /* CUDA_DISABLER */