From 58f691979537d4beeee6a6980393c2b822d24f76 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Thu, 10 Mar 2011 13:53:58 +0000 Subject: [PATCH] made GPU version of SURF more consistent with CPU one --- modules/gpu/include/opencv2/gpu/gpu.hpp | 68 +- modules/gpu/src/cuda/internal_shared.hpp | 12 +- modules/gpu/src/cuda/surf.cu | 1399 +++++++++++++----------------- modules/gpu/src/surf.cpp | 351 ++++---- modules/gpu/test/test_features2d.cpp | 146 ++-- samples/gpu/performance/tests.cpp | 9 +- samples/gpu/surf_keypoint_matcher.cpp | 8 +- 7 files changed, 859 insertions(+), 1134 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 33fa3b4..854b89d 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1537,83 +1537,55 @@ namespace cv }; ////////////////////////////////// SURF ////////////////////////////////////////// - - struct CV_EXPORTS SURFParams_GPU - { - SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f), - l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f), - edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {} - - //! The interest operator threshold - float threshold; - //! The number of octaves to process - int nOctaves; - //! The number of intervals in each octave - int nIntervals; - //! The scale associated with the first interval of the first octave - float initialScale; - - //! mask parameter l_1 - float l1; - //! mask parameter l_2 - float l2; - //! mask parameter l_3 - float l3; - //! mask parameter l_4 - float l4; - //! The amount to scale the edge rejection mask - float edgeScale; - //! The initial sampling step in pixels. - int initialStep; - - //! True, if generate 128-len descriptors, false - 64-len descriptors - bool extended; - - //! max features = featuresRatio * img.size().srea() - float featuresRatio; - }; - class CV_EXPORTS SURF_GPU : public SURFParams_GPU + class CV_EXPORTS SURF_GPU : public CvSURFParams { public: + //! the default constructor + SURF_GPU(); + //! the full constructor taking all the necessary parameters + explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4, + int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f); + //! returns the descriptor size in float's (64 or 128) int descriptorSize() const; //! upload host keypoints to device memory - static void uploadKeypoints(const vector& keypoints, GpuMat& keypointsGPU); + void uploadKeypoints(const vector& keypoints, GpuMat& keypointsGPU); //! download keypoints from device to host memory - static void downloadKeypoints(const GpuMat& keypointsGPU, vector& keypoints); + void downloadKeypoints(const GpuMat& keypointsGPU, vector& keypoints); //! download descriptors from device to host memory - static void downloadDescriptors(const GpuMat& descriptorsGPU, vector& descriptors); + void downloadDescriptors(const GpuMat& descriptorsGPU, vector& descriptors); //! finds the keypoints using fast hessian detector used in SURF //! supports CV_8UC1 images //! keypoints will have 1 row and type CV_32FC(6) //! keypoints.at(1, i) contains i'th keypoint - //! format: (x, y, size, response, angle, octave) + //! format: (x, y, laplacian, size, dir, hessian) void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints); //! finds the keypoints and computes their descriptors. //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false, bool calcOrientation = true); + bool useProvidedKeypoints = false); void operator()(const GpuMat& img, const GpuMat& mask, std::vector& keypoints); void operator()(const GpuMat& img, const GpuMat& mask, std::vector& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false, bool calcOrientation = true); + bool useProvidedKeypoints = false); void operator()(const GpuMat& img, const GpuMat& mask, std::vector& keypoints, std::vector& descriptors, - bool useProvidedKeypoints = false, bool calcOrientation = true); + bool useProvidedKeypoints = false); + + //! max keypoints = keypointsRatio * img.size().area() + float keypointsRatio; - GpuMat sum; - GpuMat sumf; + GpuMat sum, mask1, maskSum, intBuffer; - GpuMat mask1; - GpuMat maskSum; + GpuMat det, trace; - GpuMat hessianBuffer; GpuMat maxPosBuffer; GpuMat featuresBuffer; + GpuMat keypointsBuffer; }; } diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp index e82700e..d2428c6 100644 --- a/modules/gpu/src/cuda/internal_shared.hpp +++ b/modules/gpu/src/cuda/internal_shared.hpp @@ -111,20 +111,20 @@ namespace cv { float x; float y; + float laplacian; float size; - float response; - float angle; - float octave; + float dir; + float hessian; }; enum KeypointLayout { SF_X, SF_Y, + SF_LAPLACIAN, SF_SIZE, - SF_RESPONSE, - SF_ANGLE, - SF_OCTAVE, + SF_DIR, + SF_HESSIAN, SF_FEATURE_STRIDE }; } diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index d8abf1b..bd2a1d9 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -56,267 +56,132 @@ using namespace cv::gpu::device; namespace cv { namespace gpu { namespace surf { //////////////////////////////////////////////////////////////////////// - // Help funcs - - // Wrapper for host reference to pass into kernel - template - class DeviceReference - { - public: - explicit DeviceReference(T& host_val) : d_ptr(0), h_ptr(&host_val) - { - cudaSafeCall( cudaMalloc((void**)&d_ptr, sizeof(T)) ); - cudaSafeCall( cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice) ); - } - - ~DeviceReference() - { - cudaSafeCall( cudaMemcpy(h_ptr, d_ptr, sizeof(T), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaFree(d_ptr) ); - } - - // Casting to device pointer - operator T*() {return d_ptr;} - operator const T*() const {return d_ptr;} - private: - T* d_ptr; - T* h_ptr; - }; - - __device__ void clearLastBit(int* f) - { - *f &= ~0x1; - } - __device__ void clearLastBit(float& f) - { - clearLastBit((int*)&f); - } - - __device__ void setLastBit(int* f) - { - *f |= 0x1; - } - __device__ void setLastBit(float& f) - { - setLastBit((int*)&f); - } - - //////////////////////////////////////////////////////////////////////// // Global parameters // The maximum number of features (before subpixel interpolation) that memory is reserved for. __constant__ int c_max_candidates; // The maximum number of features that memory is reserved for. __constant__ int c_max_features; - // The number of intervals in the octave. - __constant__ int c_nIntervals; - // Mask sizes derived from the mask parameters - __constant__ float c_mask_width; - // Mask sizes derived from the mask parameters - __constant__ float c_mask_height; - // Mask sizes derived from the mask parameters - __constant__ float c_dxy_center_offset; - // Mask sizes derived from the mask parameters - __constant__ float c_dxy_half_width; - // Mask sizes derived from the mask parameters - __constant__ float c_dxy_scale; - // The scale associated with the first interval of the first octave - __constant__ float c_initialScale; - // The interest operator threshold - __constant__ float c_threshold; - - // Ther octave + // The maximum number of keypoints that memory is reserved for. + __constant__ int c_max_keypoints; + // The image size. + __constant__ int c_img_rows; + __constant__ int c_img_cols; + // The number of layers. + __constant__ int c_nOctaveLayers; + // The hessian threshold. + __constant__ float c_hessianThreshold; + + // The current octave. __constant__ int c_octave; - // The width of the octave buffer. - __constant__ int c_x_size; - // The height of the octave buffer. - __constant__ int c_y_size; - // The size of the octave border in pixels. - __constant__ int c_border; - // The step size used in this octave in pixels. - __constant__ int c_step; + // The current layer size. + __constant__ int c_layer_rows; + __constant__ int c_layer_cols; //////////////////////////////////////////////////////////////////////// // Integral image texture - texture sumTex(0, cudaFilterModeLinear, cudaAddressModeClamp); + typedef texture IntTex; - __device__ float iiAreaLookupCDHalfWH(float cx, float cy, float halfWidth, float halfHeight) - { - float result = 0.f; - - result += tex2D(sumTex, cx - halfWidth, cy - halfHeight); - result -= tex2D(sumTex, cx + halfWidth, cy - halfHeight); - result -= tex2D(sumTex, cx - halfWidth, cy + halfHeight); - result += tex2D(sumTex, cx + halfWidth, cy + halfHeight); - - return result; - } - - //////////////////////////////////////////////////////////////////////// - // Hessian + IntTex sumTex(0, cudaFilterModePoint, cudaAddressModeClamp); - __device__ float evalDyy(float x, float y, float t, float mask_width, float mask_height, float fscale) + template + __device__ float icvCalcHaarPattern(const IntTex& tex, const float src[][5], int oldSize, int newSize, int y, int x) { - float Dyy = 0.f; - - Dyy += iiAreaLookupCDHalfWH(x, y, mask_width, mask_height); - Dyy -= t * iiAreaLookupCDHalfWH(x, y, mask_width, fscale); - - Dyy *= 1.0f / (fscale * fscale); + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 + typedef double real_t; + #else + typedef float real_t; + #endif - return Dyy; - } + float ratio = (float)newSize / oldSize; + + real_t d = 0; - __device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale) - { - float Dxx = 0.f; + #pragma unroll + for (int k = 0; k < N; ++k) + { + int dx1 = __float2int_rn(ratio * src[k][0]); + int dy1 = __float2int_rn(ratio * src[k][1]); + int dx2 = __float2int_rn(ratio * src[k][2]); + int dy2 = __float2int_rn(ratio * src[k][3]); - Dxx += iiAreaLookupCDHalfWH(x, y, mask_height, mask_width); - Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale , mask_width); + real_t t = 0; + t += tex2D(tex, x + dx1, y + dy1); + t -= tex2D(tex, x + dx1, y + dy2); + t -= tex2D(tex, x + dx2, y + dy1); + t += tex2D(tex, x + dx2, y + dy2); - Dxx *= 1.0f / (fscale * fscale); + d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1)); + } - return Dxx; + return (float)d; } - __device__ float evalDxy(float x, float y, float fscale) - { - float center_offset = c_dxy_center_offset * fscale; - float half_width = c_dxy_half_width * fscale; - - float Dxy = 0.f; - - Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width); - Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width); - Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width); - Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width); - - Dxy *= 1.0f / (fscale * fscale); + //////////////////////////////////////////////////////////////////////// + // Hessian - return Dxy; - } + __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} }; + __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} }; + __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} }; - __device__ float calcScale(int hidx_z) + __host__ __device__ int calcSize(int octave, int layer) { - float d = (c_initialScale * (1 << c_octave)) / (c_nIntervals - 2); - return c_initialScale * (1 << c_octave) + d * (hidx_z - 1.0f) + 0.5f; - } - - __global__ void fasthessian(PtrStepf hessianBuffer) - { - // Determine the indices in the Hessian buffer - int hidx_x = threadIdx.x + blockIdx.x * blockDim.x; - int hidx_y = threadIdx.y + blockIdx.y * blockDim.y; - int hidx_z = threadIdx.z; - - float fscale = calcScale(hidx_z); + /* Wavelet size at first layer of first octave. */ + const int HAAR_SIZE0 = 9; - // Compute the lookup location of the mask center - float x = hidx_x * c_step + c_border; - float y = hidx_y * c_step + c_border; + /* Wavelet size increment between layers. This should be an even number, + such that the wavelet sizes in an octave are either all even or all odd. + This ensures that when looking for the neighbours of a sample, the layers + above and below are aligned correctly. */ + const int HAAR_SIZE_INC = 6; - // Scale the mask dimensions according to the scale - if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals) - { - float mask_width = c_mask_width * fscale; - float mask_height = c_mask_height * fscale; - - // Compute the filter responses - float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale); - float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale); - float Dxy = evalDxy(x, y, fscale); - - // Combine the responses and store the Laplacian sign - float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy); - - if (Dxx + Dyy > 0.f) - setLastBit(result); - else - clearLastBit(result); - - hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result; - } + return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } - __global__ void fasthessian_old(PtrStepf hessianBuffer) + __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace) { - // Determine the indices in the Hessian buffer - int gridDim_y = gridDim.y / c_nIntervals; - int blockIdx_y = blockIdx.y % gridDim_y; - int blockIdx_z = blockIdx.y / gridDim_y; - - int hidx_x = threadIdx.x + blockIdx.x * blockDim.x; - int hidx_y = threadIdx.y + blockIdx_y * blockDim.y; - int hidx_z = blockIdx_z; + // Determine the indices + const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2); + const int blockIdx_y = blockIdx.y % gridDim_y; + const int blockIdx_z = blockIdx.y / gridDim_y; - float fscale = calcScale(hidx_z); + const int j = threadIdx.x + blockIdx.x * blockDim.x; + const int i = threadIdx.y + blockIdx_y * blockDim.y; + const int layer = blockIdx_z; - // Compute the lookup location of the mask center - float x = hidx_x * c_step + c_border; - float y = hidx_y * c_step + c_border; + const int size = calcSize(c_octave, layer); - // Scale the mask dimensions according to the scale - if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals) - { - float mask_width = c_mask_width * fscale; - float mask_height = c_mask_height * fscale; - - // Compute the filter responses - float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale); - float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale); - float Dxy = evalDxy(x, y, fscale); + const int samples_i = 1 + ((c_img_rows - size) >> c_octave); + const int samples_j = 1 + ((c_img_cols - size) >> c_octave); - // Combine the responses and store the Laplacian sign - float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy); + /* Ignore pixels where some of the kernel is outside the image */ + const int margin = (size >> 1) >> c_octave; - if (Dxx + Dyy > 0.f) - setLastBit(result); - else - clearLastBit(result); + if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) + { + const float dx = icvCalcHaarPattern<3>(sumTex, c_DX , 9, size, i << c_octave, j << c_octave); + const float dy = icvCalcHaarPattern<3>(sumTex, c_DY , 9, size, i << c_octave, j << c_octave); + const float dxy = icvCalcHaarPattern<4>(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave); - hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result; + det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy; + trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy; } } - dim3 calcBlockSize(int nIntervals) - { - int threadsPerBlock = 512; - - dim3 threads; - threads.z = nIntervals; - threadsPerBlock /= nIntervals; - if (threadsPerBlock >= 48) - threads.x = 16; - else - threads.x = 8; - threadsPerBlock /= threads.x; - threads.y = threadsPerBlock; - - return threads; - } - - void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads) + void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers) { - dim3 grid; - grid.x = divUp(x_size, threads.x); - grid.y = divUp(y_size, threads.y); - - fasthessian<<>>(hessianBuffer); - cudaSafeCall( cudaGetLastError() ); + const int min_size = calcSize(octave, 0); + const int max_samples_i = 1 + ((img_rows - min_size) >> octave); + const int max_samples_j = 1 + ((img_cols - min_size) >> octave); - cudaSafeCall( cudaThreadSynchronize() ); - } - - void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld) - { dim3 threads(16, 16); dim3 grid; - grid.x = divUp(x_size, threads.x); - grid.y = divUp(y_size, threads.y) * threadsOld.z; - - fasthessian_old<<>>(hessianBuffer); + grid.x = divUp(max_samples_j, threads.x); + grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2); + + icvCalcLayerDetAndTrace<<>>(det, trace); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); @@ -325,109 +190,113 @@ namespace cv { namespace gpu { namespace surf //////////////////////////////////////////////////////////////////////// // NONMAX - texture maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp); + IntTex maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp); struct WithOutMask { - static __device__ bool check(float, float, float) + static __device__ bool check(int, int, int) { return true; } }; + + __constant__ float c_DM[1][5] = {{0, 0, 9, 9, 1}}; + struct WithMask { - static __device__ bool check(float x, float y, float fscale) + static __device__ bool check(int sum_i, int sum_j, int size) { - float half_width = fscale / 2; - - float result = 0.f; - - result += tex2D(maskSumTex, x - half_width, y - half_width); - result -= tex2D(maskSumTex, x + half_width, y - half_width); - result -= tex2D(maskSumTex, x - half_width, y + half_width); - result += tex2D(maskSumTex, x + half_width, y + half_width); - - result /= (fscale * fscale); - - return (result >= 0.5f); + float mval = icvCalcHaarPattern<1>(maskSumTex, c_DM , 9, size, sum_i, sum_j); + return (mval >= 0.5); } }; template - __global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter) + __global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - extern __shared__ float fh_vals[]; + extern __shared__ float N9[]; // The hidx variables are the indices to the hessian buffer. - int hidx_x = threadIdx.x + blockIdx.x * (blockDim.x - 2); - int hidx_y = threadIdx.y + blockIdx.y * (blockDim.y - 2); - int hidx_z = threadIdx.z; - int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + const int gridDim_y = gridDim.y / c_nOctaveLayers; + const int blockIdx_y = blockIdx.y % gridDim_y; + const int blockIdx_z = blockIdx.y / gridDim_y; - // Is this thread within the hessian buffer? - if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals) - { - fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x]; - } - __syncthreads(); + const int layer = blockIdx_z + 1; + + const int size = calcSize(c_octave, layer); - // Is this location one of the ones being processed for nonmax suppression. - // Blocks overlap by one so we don't process the border threads. - bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1 - && threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < c_y_size - 1 - && threadIdx.z > 0 && threadIdx.z < blockDim.z-1; + /* Ignore pixels without a 3x3x3 neighbourhood in the layer above */ + const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1; - float val = fh_vals[localLin]; + const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1; + const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1; - // Compute the lookup location of the mask center - float x = hidx_x * c_step + c_border; - float y = hidx_y * c_step + c_border; - float fscale = calcScale(hidx_z); + // Is this thread within the hessian buffer? + const int zoff = blockDim.x * blockDim.y; + const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff; + N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + i)[j]; + N9[localLin ] = det.ptr(c_layer_rows * (layer ) + i)[j]; + N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + i)[j]; + __syncthreads(); - if (inBounds2 && val >= c_threshold && Mask::check(x, y, fscale)) + if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1) { - // Check to see if we have a max (in its 26 neighbours) - int zoff = blockDim.x * blockDim.y; - bool condmax = val > fh_vals[localLin + 1] - && val > fh_vals[localLin - 1] - && val > fh_vals[localLin - blockDim.x + 1] - && val > fh_vals[localLin - blockDim.x ] - && val > fh_vals[localLin - blockDim.x - 1] - && val > fh_vals[localLin + blockDim.x + 1] - && val > fh_vals[localLin + blockDim.x ] - && val > fh_vals[localLin + blockDim.x - 1] - - && val > fh_vals[localLin - zoff + 1] - && val > fh_vals[localLin - zoff ] - && val > fh_vals[localLin - zoff - 1] - && val > fh_vals[localLin - zoff - blockDim.x + 1] - && val > fh_vals[localLin - zoff - blockDim.x ] - && val > fh_vals[localLin - zoff - blockDim.x - 1] - && val > fh_vals[localLin - zoff + blockDim.x + 1] - && val > fh_vals[localLin - zoff + blockDim.x ] - && val > fh_vals[localLin - zoff + blockDim.x - 1] - - && val > fh_vals[localLin + zoff + 1] - && val > fh_vals[localLin + zoff ] - && val > fh_vals[localLin + zoff - 1] - && val > fh_vals[localLin + zoff - blockDim.x + 1] - && val > fh_vals[localLin + zoff - blockDim.x ] - && val > fh_vals[localLin + zoff - blockDim.x - 1] - && val > fh_vals[localLin + zoff + blockDim.x + 1] - && val > fh_vals[localLin + zoff + blockDim.x ] - && val > fh_vals[localLin + zoff + blockDim.x - 1] - ; - - if(condmax) + float val0 = N9[localLin]; + + if (val0 > c_hessianThreshold) { - unsigned int i = atomicInc(maxCounter,(unsigned int) -1); + // Coordinates for the start of the wavelet in the sum image. There + // is some integer division involved, so don't try to simplify this + // (cancel out sampleStep) without checking the result is the same + const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; + const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; - if (i < c_max_candidates) + if (Mask::check(sum_i, sum_j, size)) { - int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave}; - maxPosBuffer[i] = f; + // Check to see if we have a max (in its 26 neighbours) + const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff] + && val0 > N9[localLin - blockDim.x - zoff] + && val0 > N9[localLin + 1 - blockDim.x - zoff] + && val0 > N9[localLin - 1 - zoff] + && val0 > N9[localLin - zoff] + && val0 > N9[localLin + 1 - zoff] + && val0 > N9[localLin - 1 + blockDim.x - zoff] + && val0 > N9[localLin + blockDim.x - zoff] + && val0 > N9[localLin + 1 + blockDim.x - zoff] + + && val0 > N9[localLin - 1 - blockDim.x] + && val0 > N9[localLin - blockDim.x] + && val0 > N9[localLin + 1 - blockDim.x] + && val0 > N9[localLin - 1 ] + && val0 > N9[localLin + 1 ] + && val0 > N9[localLin - 1 + blockDim.x] + && val0 > N9[localLin + blockDim.x] + && val0 > N9[localLin + 1 + blockDim.x] + + && val0 > N9[localLin - 1 - blockDim.x + zoff] + && val0 > N9[localLin - blockDim.x + zoff] + && val0 > N9[localLin + 1 - blockDim.x + zoff] + && val0 > N9[localLin - 1 + zoff] + && val0 > N9[localLin + zoff] + && val0 > N9[localLin + 1 + zoff] + && val0 > N9[localLin - 1 + blockDim.x + zoff] + && val0 > N9[localLin + blockDim.x + zoff] + && val0 > N9[localLin + 1 + blockDim.x + zoff] + ; + + if(condmax) + { + unsigned int ind = atomicInc(maxCounter,(unsigned int) -1); + + if (ind < c_max_candidates) + { + const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]); + + maxPosBuffer[ind] = make_int4(j, i, layer, laplacian); + } + } } } } @@ -435,21 +304,26 @@ namespace cv { namespace gpu { namespace surf #endif } - void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, - int x_size, int y_size, bool use_mask, const dim3& threads) + void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, + int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers) { - dim3 grid; - grid.x = divUp(x_size, threads.x - 2); - grid.y = divUp(y_size, threads.y - 2); + const int layer_rows = img_rows >> octave; + const int layer_cols = img_cols >> octave; + + int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1; + + dim3 threads(16, 16); - const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float); + dim3 grid; + grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2); + grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers; - DeviceReference maxCounterWrapper(maxCounter); + const size_t smem_size = threads.x * threads.y * 3 * sizeof(float); if (use_mask) - nonmaxonly<<>>(hessianBuffer, maxPosBuffer, maxCounterWrapper); + icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter); else - nonmaxonly<<>>(hessianBuffer, maxPosBuffer, maxCounterWrapper); + icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter); cudaSafeCall( cudaGetLastError() ); @@ -459,166 +333,117 @@ namespace cv { namespace gpu { namespace surf //////////////////////////////////////////////////////////////////////// // INTERPOLATION - #define MID_IDX 1 - __global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer, - KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter) + __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x; - int hidx_y = maxPosBuffer[blockIdx.x].y - 1 + threadIdx.y; - int hidx_z = maxPosBuffer[blockIdx.x].z - 1 + threadIdx.z; + const int4 maxPos = maxPosBuffer[blockIdx.x]; + + const int j = maxPos.x - 1 + threadIdx.x; + const int i = maxPos.y - 1 + threadIdx.y; + const int layer = maxPos.z - 1 + threadIdx.z; - __shared__ float fh_vals[3][3][3]; + __shared__ float N9[3][3][3]; __shared__ KeyPoint_GPU p; - fh_vals[threadIdx.z][threadIdx.y][threadIdx.x] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x]; + N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j]; __syncthreads(); if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { - __shared__ float H[3][3]; - - //dxx - H[0][0] = fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] - - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] - + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]; + __shared__ float dD[3]; - //dyy - H[1][1] = fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] - - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] - + fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]; + //dx + dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]); + //dy + dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]); + //ds + dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]); - //dss - H[2][2] = fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] - - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] - + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]; + __shared__ float H[3][3]; + //dxx + H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2]; //dxy - H[0][1]= 0.25f* - (fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX + 1] - - fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX + 1] - - fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX - 1] + - fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX - 1]); - + H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]); //dxs - H[0][2]= 0.25f* - (fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX ] - - fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX ] - - fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX ] + - fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX ]); - - //dys - H[1][2]= 0.25f* - (fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX + 1] - - fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX - 1] - - fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX + 1] + - fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX - 1]); - + H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]); //dyx = dxy H[1][0] = H[0][1]; - + //dyy + H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1]; + //dys + H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]); //dsx = dxs H[2][0] = H[0][2]; - //dsy = dys H[2][1] = H[1][2]; + //dss + H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1]; - __shared__ float dD[3]; + float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) + - H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) + + H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]); - //dx - dD[0] = 0.5f*(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] - - fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]); - //dy - dD[1] = 0.5f*(fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] - - fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]); - //ds - dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] - - fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]); - - __shared__ float invdet; - invdet = 1.f / - ( - H[0][0]*H[1][1]*H[2][2] - + H[0][1]*H[1][2]*H[2][0] - + H[0][2]*H[1][0]*H[2][1] - - H[0][0]*H[1][2]*H[2][1] - - H[0][1]*H[1][0]*H[2][2] - - H[0][2]*H[1][1]*H[2][0] - ); - - // // 1-based entries of a 3x3 inverse - // /* [ |a22 a23| |a12 a13| |a12 a13|] */ - // /* [ |a32 a33| -|a32 a33| |a22 a23|] */ - // /* [ ] */ - // /* [ |a21 a23| |a11 a13| |a11 a13|] */ - // /* A^(-1) = [-|a31 a33| |a31 a33| -|a21 a23|] / d */ - // /* [ ] */ - // /* [ |a21 a22| |a11 a12| |a11 a12|] */ - // /* [ |a31 a32| -|a31 a32| |a21 a22|] */ - - __shared__ float Hinv[3][3]; - Hinv[0][0] = invdet*(H[1][1]*H[2][2]-H[1][2]*H[2][1]); - Hinv[0][1] = -invdet*(H[0][1]*H[2][2]-H[0][2]*H[2][1]); - Hinv[0][2] = invdet*(H[0][1]*H[1][2]-H[0][2]*H[1][1]); - - Hinv[1][0] = -invdet*(H[1][0]*H[2][2]-H[1][2]*H[2][0]); - Hinv[1][1] = invdet*(H[0][0]*H[2][2]-H[0][2]*H[2][0]); - Hinv[1][2] = -invdet*(H[0][0]*H[1][2]-H[0][2]*H[1][0]); - - Hinv[2][0] = invdet*(H[1][0]*H[2][1]-H[1][1]*H[2][0]); - Hinv[2][1] = -invdet*(H[0][0]*H[2][1]-H[0][1]*H[2][0]); - Hinv[2][2] = invdet*(H[0][0]*H[1][1]-H[0][1]*H[1][0]); - - __shared__ float x[3]; - - x[0] = -(Hinv[0][0]*(dD[0]) + Hinv[0][1]*(dD[1]) + Hinv[0][2]*(dD[2])); - x[1] = -(Hinv[1][0]*(dD[0]) + Hinv[1][1]*(dD[1]) + Hinv[1][2]*(dD[2])); - x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2])); - - if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f) + if (det != 0.0f) { - // if the step is within the interpolation region, perform it + float invdet = 1.0f / det; - // Get a new feature index. - unsigned int i = atomicInc(featureCounter, (unsigned int)-1); + __shared__ float x[3]; - if (i < c_max_features) + x[0] = invdet * + (dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) - + H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) + + H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2])); + + x[1] = invdet * + (H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) - + dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) + + H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0])); + + x[2] = invdet * + (H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) - + H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) + + dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0])); + + if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f) { - p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border; - p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border; + // if the step is within the interpolation region, perform it - if (x[2] > 0) - { - float a = calcScale(maxPosBuffer[blockIdx.x].z); - float b = calcScale(maxPosBuffer[blockIdx.x].z + 1); + // Get a new feature index. + unsigned int ind = atomicInc(featureCounter, (unsigned int)-1); - p.size = (1.f - x[2]) * a + x[2] * b; - } - else + if (ind < c_max_features) { - float a = calcScale(maxPosBuffer[blockIdx.x].z); - float b = calcScale(maxPosBuffer[blockIdx.x].z - 1); + const int size = calcSize(c_octave, maxPos.z); - p.size = (1.f + x[2]) * a - x[2] * b; - } + const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave; + const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave; + + const float center_i = sum_i + (float)(size - 1) / 2; + const float center_j = sum_j + (float)(size - 1) / 2; - p.octave = c_octave; + p.x = center_j + x[0] * (1 << c_octave); + p.y = center_i + x[1] * (1 << c_octave); - p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX]; + int ds = size - calcSize(c_octave, maxPos.z - 1); + p.size = roundf(size + x[2] * ds); - // Should we split up this transfer over many threads? - featuresBuffer[i] = p; - } - } // If the subpixel interpolation worked + p.laplacian = maxPos.w; + p.dir = 0.0f; + p.hessian = N9[1][1][1]; + + // Should we split up this transfer over many threads? + featuresBuffer[ind] = p; + } + } // If the subpixel interpolation worked + } } // If this is thread 0. #endif } - #undef MID_IDX - void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, - KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter) + void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter) { dim3 threads; threads.x = 3; @@ -628,9 +453,7 @@ namespace cv { namespace gpu { namespace surf dim3 grid; grid.x = maxCounter; - DeviceReference featureCounterWrapper(featureCounter); - - fh_interp_extremum<<>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper); + icvInterpolateKeypoint<<>>(det, maxPosBuffer, featuresBuffer, featureCounter); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); @@ -639,139 +462,217 @@ namespace cv { namespace gpu { namespace surf //////////////////////////////////////////////////////////////////////// // Orientation - // precomputed values for a Gaussian with a standard deviation of 2 - __constant__ float c_gauss1D[13] = - { - 0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f, - 0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f, - 0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f, - 0.002215924206f - }; + #define ORI_SEARCH_INC 5 + #define ORI_WIN 60 + #define ORI_SAMPLES 113 - __global__ void find_orientation(KeyPoint_GPU* features) - { - int tid = threadIdx.y * 17 + threadIdx.x; - int tid2 = numeric_limits_gpu::max(); + __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; + __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; + __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f}; + + __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; + __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; - if (threadIdx.x < 13 && threadIdx.y < 13) - { - tid2 = threadIdx.y * 13 + threadIdx.x; - } + __global__ void icvCalcOrientation(const KeyPoint_GPU* featureBuffer, KeyPoint_GPU* keypoints, unsigned int* keypointCounter) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ float texLookups[17][17]; + __shared__ float s_X[128]; + __shared__ float s_Y[128]; + __shared__ float s_angle[128]; - __shared__ float Edx[13*13]; - __shared__ float Edy[13*13]; - __shared__ float xys[3]; + __shared__ float s_sumx[64 * 4]; + __shared__ float s_sumy[64 * 4]; - // Read my x, y, size. - if (tid < 3) - { - xys[tid] = ((float*)(&features[blockIdx.x]))[tid]; - } + __shared__ float s_feature[6]; + + if (threadIdx.x < 6 && threadIdx.y == 0) + s_feature[threadIdx.x] = ((float*)(&featureBuffer[blockIdx.x]))[threadIdx.x]; __syncthreads(); - // Read all texture locations into memory - // Maybe I should use __mul24 here? - texLookups[threadIdx.x][threadIdx.y] = tex2D(sumTex, xys[SF_X] + ((int)threadIdx.x - 8) * xys[SF_SIZE], - xys[SF_Y] + ((int)threadIdx.y - 8) * xys[SF_SIZE]); - __syncthreads(); + /* The sampling intervals and wavelet sized for selecting an orientation + and building the keypoint descriptor are defined relative to 's' */ + const float s = s_feature[SF_SIZE] * 1.2f / 9.0f; - float dx = 0.f; - float dy = 0.f; + /* To find the dominant orientation, the gradients in x and y are + sampled in a circle of radius 6s using wavelets of size 4s. + We ensure the gradient wavelet size is even to ensure the + wavelet pattern is balanced and symmetric around its center */ + const int grad_wav_size = 2 * __float2int_rn(2.0f * s); - // Computes lookups for all points in a 13x13 lattice. - // - SURF says to only use a circle, but the branching logic would slow it down - // - Gaussian weighting should reduce the effects of the outer points anyway - if (tid2 < 169) + // check when grad_wav_size is too big + if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) { - dx -= texLookups[threadIdx.x ][threadIdx.y ]; - dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ]; - dx -= texLookups[threadIdx.x + 4][threadIdx.y ]; - dx += texLookups[threadIdx.x ][threadIdx.y + 4]; - dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4]; - dx += texLookups[threadIdx.x + 4][threadIdx.y + 4]; - - dy -= texLookups[threadIdx.x ][threadIdx.y ]; - dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2]; - dy -= texLookups[threadIdx.x ][threadIdx.y + 4]; - dy += texLookups[threadIdx.x + 4][threadIdx.y ]; - dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2]; - dy += texLookups[threadIdx.x + 4][threadIdx.y + 4]; - - float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y]; - - Edx[tid2] = dx * g; - Edy[tid2] = dy * g; - } + // Calc X, Y, angle and store it to shared memory + { + const int tid = threadIdx.y * blockDim.x + threadIdx.x; - __syncthreads(); + float X = 0.0f, Y = 0.0f, angle = 0.0f; - // This is a scan to get the summed dx, dy values. - // Gets 128-168 - if (tid < 41) - { - Edx[tid] += Edx[tid + 128]; - } - __syncthreads(); - if (tid < 64) - { - Edx[tid] += Edx[tid + 64]; - } - __syncthreads(); - if (tid < 32) - { - volatile float* smem = Edx; - - smem[tid] += smem[tid + 32]; - smem[tid] += smem[tid + 16]; - smem[tid] += smem[tid + 8]; - smem[tid] += smem[tid + 4]; - smem[tid] += smem[tid + 2]; - smem[tid] += smem[tid + 1]; - } + if (tid < ORI_SAMPLES) + { + const float margin = (float)(grad_wav_size - 1) / 2.0f; + const int x = __float2int_rn(s_feature[SF_X] + c_aptX[tid] * s - margin); + const int y = __float2int_rn(s_feature[SF_Y] + c_aptY[tid] * s - margin); - // Gets 128-168 - if (tid < 41) - { - Edy[tid] += Edy[tid + 128]; - } - __syncthreads(); - if (tid < 64) - { - Edy[tid] += Edy[tid + 64]; - } - __syncthreads(); - if (tid < 32) - { - volatile float* smem = Edy; - - smem[tid] += smem[tid + 32]; - smem[tid] += smem[tid + 16]; - smem[tid] += smem[tid + 8]; - smem[tid] += smem[tid + 4]; - smem[tid] += smem[tid + 2]; - smem[tid] += smem[tid + 1]; - } + if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size)) + { + X = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NX, 4, grad_wav_size, y, x); + Y = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NY, 4, grad_wav_size, y, x); + + angle = atan2f(Y, X); + if (angle < 0) + angle += 2.0f * CV_PI; + angle *= 180.0f / CV_PI; + } + } + if (tid < 128) + { + s_X[tid] = X; + s_Y[tid] = Y; + s_angle[tid] = angle; + } + } + __syncthreads(); - // Thread 0 saves back the result. - if (tid == 0) - { - features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI); + float bestx = 0, besty = 0, best_mod = 0; + + #pragma unroll + for (int i = 0; i < 18; ++i) + { + const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC; + + float sumx = 0.0f, sumy = 0.0f; + int d = abs(__float2int_rn(s_angle[threadIdx.x]) - dir); + if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) + { + sumx = s_X[threadIdx.x]; + sumy = s_Y[threadIdx.x]; + } + d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir); + if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) + { + sumx += s_X[threadIdx.x + 64]; + sumy += s_Y[threadIdx.x + 64]; + } + + float* s_sumx_row = s_sumx + threadIdx.y * 64; + float* s_sumy_row = s_sumy + threadIdx.y * 64; + + s_sumx_row[threadIdx.x] = sumx; + s_sumy_row[threadIdx.x] = sumy; + __syncthreads(); + + if (threadIdx.x < 32) + { + volatile float* v_sumx_row = s_sumx_row; + volatile float* v_sumy_row = s_sumy_row; + + v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32]; + v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32]; + + v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16]; + v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16]; + + v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8]; + v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8]; + + v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4]; + v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4]; + + v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2]; + v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2]; + + v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1]; + v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1]; + } + + const float temp_mod = sumx * sumx + sumy * sumy; + if (temp_mod > best_mod) + { + best_mod = temp_mod; + bestx = sumx; + besty = sumy; + } + __syncthreads(); + } + + if (threadIdx.x == 0) + { + s_X[threadIdx.y] = bestx; + s_Y[threadIdx.y] = besty; + s_angle[threadIdx.y] = best_mod; + } + __syncthreads(); + + if (threadIdx.x < 2 && threadIdx.y == 0) + { + volatile float* v_x = s_X; + volatile float* v_y = s_Y; + volatile float* v_mod = s_angle; + + bestx = v_x[threadIdx.x]; + besty = v_y[threadIdx.x]; + best_mod = v_mod[threadIdx.x]; + + float temp_mod = v_mod[threadIdx.x + 2]; + if (temp_mod > best_mod) + { + v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2]; + v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2]; + v_mod[threadIdx.x] = best_mod = temp_mod; + } + temp_mod = v_mod[threadIdx.x + 1]; + if (temp_mod > best_mod) + { + v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1]; + v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1]; + } + } + + if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0) + { + // Get a new feature index. + unsigned int ind = atomicInc(keypointCounter, (unsigned int)-1); + + if (ind < c_max_keypoints) + { + float kp_dir = atan2f(besty, bestx); + if (kp_dir < 0) + kp_dir += 2.0f * CV_PI; + kp_dir *= 180.0f / CV_PI; + __shared__ KeyPoint_GPU kp; + + kp.x = s_feature[SF_X]; + kp.y = s_feature[SF_Y]; + kp.laplacian = s_feature[SF_LAPLACIAN]; + kp.size = s_feature[SF_SIZE]; + kp.dir = kp_dir; + kp.hessian = s_feature[SF_HESSIAN]; + + keypoints[ind] = kp; + } + } } + + #endif } - void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures) + #undef ORI_SEARCH_INC + #undef ORI_WIN + #undef ORI_SAMPLES + + void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter) { dim3 threads; - threads.x = 17; - threads.y = 17; + threads.x = 64; + threads.y = 4; dim3 grid; grid.x = nFeatures; - find_orientation<<>>(features); + icvCalcOrientation<<>>(featureBuffer, keypoints, keypointCounter); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); @@ -780,117 +681,119 @@ namespace cv { namespace gpu { namespace surf //////////////////////////////////////////////////////////////////////// // Descriptors - // precomputed values for a Gaussian with a standard deviation of 3.3 - // - it appears SURF uses a different value, but not sure what it is - __constant__ float c_3p3gauss1D[20] = - { - 0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f, - 0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f, - 0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f, - 0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f + #define PATCH_SZ 20 + + texture imgTex(0, cudaFilterModePoint, cudaAddressModeClamp); + + __constant__ float c_DW[PATCH_SZ * PATCH_SZ] = + { + 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, + 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, + 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, + 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, + 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, + 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, + 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, + 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, + 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, + 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, + 0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, + 0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, + 0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, + 0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, + 9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, + 5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, + 3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, + 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, + 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, + 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f }; - template - __global__ void normalize_descriptors(PtrStepf descriptors) + __device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2) { - // no need for thread ID - float* descriptor_base = descriptors.ptr(blockIdx.x); + const float centerX = s_pt[SF_X]; + const float centerY = s_pt[SF_Y]; + const float size = s_pt[SF_SIZE]; + const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180); - // read in the unnormalized descriptor values (squared) - __shared__ float sqDesc[BLOCK_DIM_X]; - const float lookup = descriptor_base[threadIdx.x]; - sqDesc[threadIdx.x] = lookup * lookup; - __syncthreads(); + /* The sampling intervals and wavelet sized for selecting an orientation + and building the keypoint descriptor are defined relative to 's' */ + const float s = size * 1.2f / 9.0f; - if (BLOCK_DIM_X >= 128) - { - if (threadIdx.x < 64) - sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64]; - __syncthreads(); - } + /* Extract a window of pixels around the keypoint of size 20s */ + const int win_size = (int)((PATCH_SZ + 1) * s); - // reduction to get total - if (threadIdx.x < 32) - { - volatile float* smem = sqDesc; + float sin_dir; + float cos_dir; + sincosf(descriptor_dir, &sin_dir, &cos_dir); - smem[threadIdx.x] += smem[threadIdx.x + 32]; - smem[threadIdx.x] += smem[threadIdx.x + 16]; - smem[threadIdx.x] += smem[threadIdx.x + 8]; - smem[threadIdx.x] += smem[threadIdx.x + 4]; - smem[threadIdx.x] += smem[threadIdx.x + 2]; - smem[threadIdx.x] += smem[threadIdx.x + 1]; - } + /* Nearest neighbour version (faster) */ + const float win_offset = -(float)(win_size - 1) / 2; - // compute length (square root) - __shared__ float len; - if (threadIdx.x == 0) - { - len = sqrtf(sqDesc[0]); - } - __syncthreads(); + /* Scale the window to size PATCH_SZ so each pixel's size is s. This + makes calculating the gradients with wavelets of size 2s easy */ + const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size; + const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size; - // normalize and store in output - descriptor_base[threadIdx.x] = lookup / len; - } + const int i = __float2int_rd(icoo); + const int j = __float2int_rd(jcoo); - __device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt, - int xIndex, int yIndex, int tid) - { - float sin_theta, cos_theta; - sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta); - - // Compute rotated sampling points - // (clockwise rotation since we are rotating the lattice) - // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel) - const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] - + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]); - const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] - + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]); - - // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy) - // a b c - // d f - // g h i - const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); - const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]); - const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); - const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y); - const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y); - const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); - const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]); - const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); - - // compute axis-aligned HaarX, HaarY - // (could group the additions together into multiplications) - const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular) - const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx - const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy - - // rotate responses (store all dxs then all dys) - // - counterclockwise rotation to rotate back to zero orientation - sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx - sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy + float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; + float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; + + float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo); + + pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir; + pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir; + + res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo); + + pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir; + pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir; + + res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j); + + pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir; + pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir; + + res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j); + + s_PATCH[i2][j2] = (unsigned char)res; } - __device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features) + __device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid) { // get the interest point parameters (x, y, size, response, angle) - __shared__ float ipt[5]; - if (threadIdx.x < 5 && threadIdx.y == 0) + __shared__ float s_pt[5]; + if (tid < 5) { - ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x]; + s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid]; } __syncthreads(); // Compute sampling points // since grids are 2D, need to compute xBlock and yBlock indices - const int xBlock = (threadIdx.y & 3); // threadIdx.y % 4 - const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4) - const int xIndex = (xBlock * 5) + (threadIdx.x % 5); - const int yIndex = (yBlock * 5) + (threadIdx.x / 5); + const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4 + const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4) + const int xIndex = xBlock * blockDim.x + threadIdx.x; + const int yIndex = yBlock * blockDim.y + threadIdx.y; + + calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x); + if (threadIdx.x == 0) + calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5); + if (threadIdx.y == 0) + calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x); + if (threadIdx.x == 0 && threadIdx.y == 0) + calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5); + __syncthreads(); + + const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; + + const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw; + const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw; - calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x); + s_dx_bin[tid] = vx; + s_dy_bin[tid] = vy; } __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, @@ -934,192 +837,8 @@ namespace cv { namespace gpu { namespace surf // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features) { - // 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region) - __shared__ float sdx [16 * 25]; - __shared__ float sdy [16 * 25]; - __shared__ float sdxabs[16 * 25]; - __shared__ float sdyabs[16 * 25]; - - __shared__ float sdesc[64]; - - float* sdx_bin = sdx + (threadIdx.y * 25); - float* sdy_bin = sdy + (threadIdx.y * 25); - float* sdxabs_bin = sdxabs + (threadIdx.y * 25); - float* sdyabs_bin = sdyabs + (threadIdx.y * 25); - - calc_dx_dy(sdx_bin, sdy_bin, features); - __syncthreads(); - - sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array - sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array - __syncthreads(); - - reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x); - __syncthreads(); - - float* sdesc_bin = sdesc + (threadIdx.y << 2); - - // write dx, dy, |dx|, |dy| - if (threadIdx.x == 0) - { - sdesc_bin[0] = sdx_bin[0]; - sdesc_bin[1] = sdy_bin[0]; - sdesc_bin[2] = sdxabs_bin[0]; - sdesc_bin[3] = sdyabs_bin[0]; - } - __syncthreads(); - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - if (tid < 64) - descriptors.ptr(blockIdx.x)[tid] = sdesc[tid]; - } - - // Spawn 16 blocks per interest point - // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location - __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features) - { - // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) - __shared__ float sdx[16 * 25]; - __shared__ float sdy[16 * 25]; - - // sum (reduce) 5x5 area response - __shared__ float sd1[16 * 25]; - __shared__ float sd2[16 * 25]; - __shared__ float sdabs1[16 * 25]; - __shared__ float sdabs2[16 * 25]; - - __shared__ float sdesc[128]; - - float* sdx_bin = sdx + (threadIdx.y * 25); - float* sdy_bin = sdy + (threadIdx.y * 25); - float* sd1_bin = sd1 + (threadIdx.y * 25); - float* sd2_bin = sd2 + (threadIdx.y * 25); - float* sdabs1_bin = sdabs1 + (threadIdx.y * 25); - float* sdabs2_bin = sdabs2 + (threadIdx.y * 25); - - calc_dx_dy(sdx_bin, sdy_bin, features); - __syncthreads(); - - if (sdy_bin[threadIdx.x] >= 0) - { - sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x]; - sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); - sd2_bin[threadIdx.x] = 0; - sdabs2_bin[threadIdx.x] = 0; - } - else - { - sd1_bin[threadIdx.x] = 0; - sdabs1_bin[threadIdx.x] = 0; - sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x]; - sdabs2_bin[threadIdx.x] = fabs(sdx[threadIdx.x]); - } - __syncthreads(); - - reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x); - __syncthreads(); - - float* sdesc_bin = sdesc + (threadIdx.y << 3); - - // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) - if (threadIdx.x == 0) - { - sdesc_bin[0] = sd1_bin[0]; - sdesc_bin[1] = sdabs1_bin[0]; - sdesc_bin[2] = sd2_bin[0]; - sdesc_bin[3] = sdabs2_bin[0]; - } - __syncthreads(); - - if (sdx_bin[threadIdx.x] >= 0) - { - sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x]; - sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); - sd2_bin[threadIdx.x] = 0; - sdabs2_bin[threadIdx.x] = 0; - } - else - { - sd1_bin[threadIdx.x] = 0; - sdabs1_bin[threadIdx.x] = 0; - sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x]; - sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); - } - __syncthreads(); - - reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x); - __syncthreads(); - - // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) - if (threadIdx.x == 0) - { - sdesc_bin[4] = sd1_bin[0]; - sdesc_bin[5] = sdabs1_bin[0]; - sdesc_bin[6] = sd2_bin[0]; - sdesc_bin[7] = sdabs2_bin[0]; - } - __syncthreads(); - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - if (tid < 128) - descriptors.ptr(blockIdx.x)[tid] = sdesc[tid]; - } - - void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures) - { - // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D - - if (descriptors.cols == 64) - { - compute_descriptors64<<>>(descriptors, features); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaThreadSynchronize() ); - - normalize_descriptors<64><<>>(descriptors); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaThreadSynchronize() ); - } - else - { - compute_descriptors128<<>>(descriptors, features); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaThreadSynchronize() ); - - normalize_descriptors<128><<>>(descriptors); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaThreadSynchronize() ); - } - } - - __device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid) - { - // get the interest point parameters (x, y, scale, strength, theta) - __shared__ float ipt[5]; - if (tid < 5) - { - ipt[tid] = ((float*)&features[blockIdx.x])[tid]; - } - __syncthreads(); - - // Compute sampling points - // since grids are 2D, need to compute xBlock and yBlock indices - const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4 - const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4) - const int xIndex = xBlock * blockDim.x + threadIdx.x; - const int yIndex = yBlock * blockDim.y + threadIdx.y; - - calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid); - } - - // Spawn 16 blocks per interest point - // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location - __global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features) - { // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) + __shared__ float s_PATCH[6][6]; __shared__ float sdx[25]; __shared__ float sdy[25]; __shared__ float sdxabs[25]; @@ -1127,7 +846,7 @@ namespace cv { namespace gpu { namespace surf const int tid = threadIdx.y * blockDim.x + threadIdx.x; - calc_dx_dy_old(sdx, sdy, features, tid); + calc_dx_dy(s_PATCH, sdx, sdy, features, tid); __syncthreads(); sdxabs[tid] = fabs(sdx[tid]); // |dx| array @@ -1151,9 +870,10 @@ namespace cv { namespace gpu { namespace surf // Spawn 16 blocks per interest point // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location - __global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features) + __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features) { // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) + __shared__ float s_PATCH[6][6]; __shared__ float sdx[25]; __shared__ float sdy[25]; @@ -1165,7 +885,7 @@ namespace cv { namespace gpu { namespace surf const int tid = threadIdx.y * blockDim.x + threadIdx.x; - calc_dx_dy_old(sdx, sdy, features, tid); + calc_dx_dy(s_PATCH, sdx, sdy, features, tid); __syncthreads(); if (sdy[tid] >= 0) @@ -1184,7 +904,7 @@ namespace cv { namespace gpu { namespace surf } __syncthreads(); - reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid); + reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); __syncthreads(); float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3); @@ -1215,7 +935,7 @@ namespace cv { namespace gpu { namespace surf } __syncthreads(); - reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid); + reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); __syncthreads(); // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) @@ -1228,13 +948,56 @@ namespace cv { namespace gpu { namespace surf } } - void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures) + template __global__ void normalize_descriptors(PtrStepf descriptors) + { + // no need for thread ID + float* descriptor_base = descriptors.ptr(blockIdx.x); + + // read in the unnormalized descriptor values (squared) + __shared__ float sqDesc[BLOCK_DIM_X]; + const float lookup = descriptor_base[threadIdx.x]; + sqDesc[threadIdx.x] = lookup * lookup; + __syncthreads(); + + if (BLOCK_DIM_X >= 128) + { + if (threadIdx.x < 64) + sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64]; + __syncthreads(); + } + + // reduction to get total + if (threadIdx.x < 32) + { + volatile float* smem = sqDesc; + + smem[threadIdx.x] += smem[threadIdx.x + 32]; + smem[threadIdx.x] += smem[threadIdx.x + 16]; + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + } + + // compute length (square root) + __shared__ float len; + if (threadIdx.x == 0) + { + len = sqrtf(sqDesc[0]); + } + __syncthreads(); + + // normalize and store in output + descriptor_base[threadIdx.x] = lookup / len; + } + + void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures) { // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D if (descriptors.cols == 64) { - compute_descriptors64_old<<>>(descriptors, features); + compute_descriptors64<<>>(descriptors, features); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); @@ -1246,7 +1009,7 @@ namespace cv { namespace gpu { namespace surf } else { - compute_descriptors128_old<<>>(descriptors, features); + compute_descriptors128<<>>(descriptors, features); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index c16de76..ffdaabb 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -48,123 +48,93 @@ using namespace std; #if !defined (HAVE_CUDA) +cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); } +cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float) { throw_nogpu(); } int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;} void cv::gpu::SURF_GPU::uploadKeypoints(const vector&, GpuMat&) { throw_nogpu(); } void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector&) { throw_nogpu(); } void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector&) { throw_nogpu(); } void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); } -void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); } +void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); } void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector&) { throw_nogpu(); } -void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector&, GpuMat&, bool, bool) { throw_nogpu(); } -void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector&, vector&, bool, bool) { throw_nogpu(); } +void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector&, GpuMat&, bool) { throw_nogpu(); } +void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector&, vector&, bool) { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ namespace cv { namespace gpu { namespace surf { - dim3 calcBlockSize(int nIntervals); - - void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads); - void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld); - - void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, - int x_size, int y_size, bool use_mask, const dim3& threads); - - void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, - KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter); - - void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures); - + void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers); + + void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, + int img_rows, int img_cols, int octave, bool use_mask, int nLayers); + + void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter); + + void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter); + void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures); - void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures); }}} using namespace cv::gpu::surf; namespace { - class SURF_GPU_Invoker : private SURFParams_GPU + class SURF_GPU_Invoker : private CvSURFParams { public: - SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) : - SURFParams_GPU(surf), + SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) : + CvSURFParams(surf), - sum(surf.sum), sumf(surf.sumf), + sum(surf.sum), mask1(surf.mask1), maskSum(surf.maskSum), intBuffer(surf.intBuffer), det(surf.det), trace(surf.trace), - mask1(surf.mask1), maskSum(surf.maskSum), - - hessianBuffer(surf.hessianBuffer), - maxPosBuffer(surf.maxPosBuffer), - featuresBuffer(surf.featuresBuffer), + maxPosBuffer(surf.maxPosBuffer), featuresBuffer(surf.featuresBuffer), keypointsBuffer(surf.keypointsBuffer), img_cols(img.cols), img_rows(img.rows), - use_mask(!mask.empty()), - - mask_width(0), mask_height(0), - - featureCounter(0), maxCounter(0) + use_mask(!mask.empty()) { CV_Assert(!img.empty() && img.type() == CV_8UC1); CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1)); - CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22); - CV_Assert(DeviceInfo().supports(GLOBAL_ATOMICS)); - - max_features = static_cast(img.size().area() * featuresRatio); - max_candidates = static_cast(1.5 * max_features); - - CV_Assert(max_features > 0); + CV_Assert(nOctaves > 0 && nOctaveLayers > 0); + CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)); - featuresBuffer.create(1, max_features, CV_32FC(6)); - maxPosBuffer.create(1, max_candidates, CV_32SC4); + maxKeypoints = static_cast(img.size().area() * surf.keypointsRatio); + maxFeatures = static_cast(1.5 * maxKeypoints); + maxCandidates = static_cast(1.5 * maxFeatures); - mask_width = l2 * 0.5f; - mask_height = 1.0f + l1; + CV_Assert(maxKeypoints > 0); + + cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 2) * sizeof(unsigned int)) ); + cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 2) * sizeof(unsigned int)) ); - // Dxy gap half-width - float dxy_center_offset = 0.5f * (l4 + l3); - // Dxy squares half-width - float dxy_half_width = 0.5f * l3; + uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates); + uploadConstant("cv::gpu::surf::c_max_features", maxFeatures); + uploadConstant("cv::gpu::surf::c_max_keypoints", maxKeypoints); + uploadConstant("cv::gpu::surf::c_img_rows", img_rows); + uploadConstant("cv::gpu::surf::c_img_cols", img_cols); + uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers); + uploadConstant("cv::gpu::surf::c_hessianThreshold", static_cast(hessianThreshold)); - // rescale edge_scale to fit with the filter dimensions - float dxy_scale = edgeScale * std::pow((2.f + 2.f * l1) * l2 / (4.f * l3 * l3), 2.f); - - // Compute border required such that the filters don't overstep the image boundaries - float smax0 = 2.0f * initialScale + 0.5f; - int border0 = static_cast(std::ceil(smax0 * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f))); - - int width0 = (img_cols - 2 * border0) / initialStep; - int height0 = (img_rows - 2 * border0) / initialStep; - - uploadConstant("cv::gpu::surf::c_max_candidates", max_candidates); - uploadConstant("cv::gpu::surf::c_max_features", max_features); - uploadConstant("cv::gpu::surf::c_nIntervals", nIntervals); - uploadConstant("cv::gpu::surf::c_mask_width", mask_width); - uploadConstant("cv::gpu::surf::c_mask_height", mask_height); - uploadConstant("cv::gpu::surf::c_dxy_center_offset", dxy_center_offset); - uploadConstant("cv::gpu::surf::c_dxy_half_width", dxy_half_width); - uploadConstant("cv::gpu::surf::c_dxy_scale", dxy_scale); - uploadConstant("cv::gpu::surf::c_initialScale", initialScale); - uploadConstant("cv::gpu::surf::c_threshold", threshold); - - hessianBuffer.create(height0 * nIntervals, width0, CV_32F); + bindTexture("cv::gpu::surf::imgTex", (DevMem2D)img); - integral(img, sum); - sum.convertTo(sumf, CV_32F, 1.0 / 255.0); - - bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sumf); + integralBuffered(img, sum, intBuffer); + bindTexture("cv::gpu::surf::sumTex", (DevMem2D_)sum); - if (!mask.empty()) - { + if (use_mask) + { min(mask, 1.0, mask1); - integral(mask1, maskSum); - - bindTexture("cv::gpu::surf::maskSumTex", (DevMem2Di)maskSum); - } + integralBuffered(mask1, maskSum, intBuffer); + + bindTexture("cv::gpu::surf::maskSumTex", (DevMem2D_)maskSum); + } } ~SURF_GPU_Invoker() { + cudaSafeCall( cudaFree(d_counters) ); + + unbindTexture("cv::gpu::surf::imgTex"); unbindTexture("cv::gpu::surf::sumTex"); if (use_mask) unbindTexture("cv::gpu::surf::maskSumTex"); @@ -172,102 +142,115 @@ namespace void detectKeypoints(GpuMat& keypoints) { - typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads); - const fasthessian_t fasthessian = - DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old; + ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, det); + ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, trace); + + ensureSizeIsEnough(1, maxCandidates, CV_32SC4, maxPosBuffer); + ensureSizeIsEnough(1, maxFeatures, CV_32FC(6), featuresBuffer); - dim3 threads = calcBlockSize(nIntervals); - for(int octave = 0; octave < nOctaves; ++octave) + for (int octave = 0; octave < nOctaves; ++octave) { - int step = initialStep * (1 << octave); - - // Compute border required such that the filters don't overstep the image boundaries - float d = (initialScale * (1 << octave)) / (nIntervals - 2); - float smax = initialScale * (1 << octave) + d * (nIntervals - 2.0f) + 0.5f; - int border = static_cast(std::ceil(smax * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f))); - - int x_size = (img_cols - 2 * border) / step; - int y_size = (img_rows - 2 * border) / step; - - if (x_size <= 0 || y_size <= 0) - break; - - uploadConstant("cv::gpu::surf::c_octave", octave); - uploadConstant("cv::gpu::surf::c_x_size", x_size); - uploadConstant("cv::gpu::surf::c_y_size", y_size); - uploadConstant("cv::gpu::surf::c_border", border); - uploadConstant("cv::gpu::surf::c_step", step); - - fasthessian(hessianBuffer, x_size, y_size, threads); - - // Reset the candidate count. - maxCounter = 0; - - nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr(), maxCounter, x_size, y_size, use_mask, threads); - - maxCounter = std::min(maxCounter, static_cast(max_candidates)); + const int layer_rows = img_rows >> octave; + const int layer_cols = img_cols >> octave; + + uploadConstant("cv::gpu::surf::c_octave", octave); + uploadConstant("cv::gpu::surf::c_layer_rows", layer_rows); + uploadConstant("cv::gpu::surf::c_layer_cols", layer_cols); + + icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers); + + icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr(), d_counters + 2 + octave, + img_rows, img_cols, octave, use_mask, nOctaveLayers); + + unsigned int maxCounter; + cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 2 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + maxCounter = std::min(maxCounter, static_cast(maxCandidates)); if (maxCounter > 0) { - fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr(), maxCounter, - featuresBuffer.ptr(), featureCounter); - - featureCounter = std::min(featureCounter, static_cast(max_features)); + icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr(), maxCounter, + featuresBuffer.ptr(), d_counters); } } + unsigned int featureCounter; + cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + featureCounter = std::min(featureCounter, static_cast(maxFeatures)); - if (featureCounter > 0) - featuresBuffer.colRange(0, featureCounter).copyTo(keypoints); - else - keypoints.release(); + findOrientation(featuresBuffer.colRange(0, featureCounter), keypoints); } - void findOrientation(GpuMat& keypoints) + void findOrientation(const GpuMat& features, GpuMat& keypoints) { - if (keypoints.cols > 0) - find_orientation_gpu(keypoints.ptr(), keypoints.cols); + if (features.cols > 0) + { + ensureSizeIsEnough(1, maxKeypoints, CV_32FC(6), keypointsBuffer); + + icvCalcOrientation_gpu(features.ptr(), features.cols, keypointsBuffer.ptr(), + d_counters + 1); + + unsigned int keypointsCounter; + cudaSafeCall( cudaMemcpy(&keypointsCounter, d_counters + 1, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + keypointsCounter = std::min(keypointsCounter, static_cast(maxKeypoints)); + + if (keypointsCounter > 0) + keypointsBuffer.colRange(0, keypointsCounter).copyTo(keypoints); + else + keypoints.release(); + } } void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize) { - typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors, - const KeyPoint_GPU* features, int nFeatures); - - const compute_descriptors_t compute_descriptors = compute_descriptors_gpu_old; - //DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old; - if (keypoints.cols > 0) { descriptors.create(keypoints.cols, descriptorSize, CV_32F); - compute_descriptors(descriptors, keypoints.ptr(), keypoints.cols); + compute_descriptors_gpu(descriptors, keypoints.ptr(), keypoints.cols); } } private: GpuMat& sum; - GpuMat& sumf; - GpuMat& mask1; GpuMat& maskSum; + GpuMat& intBuffer; + + GpuMat& det; + GpuMat& trace; - GpuMat& hessianBuffer; GpuMat& maxPosBuffer; GpuMat& featuresBuffer; + GpuMat& keypointsBuffer; int img_cols, img_rows; bool use_mask; - - float mask_width, mask_height; - unsigned int featureCounter; - unsigned int maxCounter; + int maxCandidates; + int maxFeatures; + int maxKeypoints; - int max_candidates; - int max_features; + unsigned int* d_counters; }; } +cv::gpu::SURF_GPU::SURF_GPU() +{ + hessianThreshold = 100; + extended = 1; + nOctaves = 4; + nOctaveLayers = 2; + keypointsRatio = 0.01f; +} + +cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio) +{ + hessianThreshold = _threshold; + extended = _extended; + nOctaves = _nOctaves; + nOctaveLayers = _nOctaveLayers; + keypointsRatio = _keypointsRatio; +} + int cv::gpu::SURF_GPU::descriptorSize() const { return extended ? 128 : 64; @@ -281,27 +264,64 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector& keypoints, GpuMa { Mat keypointsCPU(1, keypoints.size(), CV_32FC(6)); - const KeyPoint* keypoints_ptr = &keypoints[0]; - KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr(); - for (size_t i = 0; i < keypoints.size(); ++i, ++keypoints_ptr, ++keypointsCPU_ptr) + for (size_t i = 0; i < keypoints.size(); ++i) { - const KeyPoint& kp = *keypoints_ptr; - KeyPoint_GPU& gkp = *keypointsCPU_ptr; + const KeyPoint& kp = keypoints[i]; + KeyPoint_GPU& gkp = keypointsCPU.ptr()[i]; gkp.x = kp.pt.x; gkp.y = kp.pt.y; + gkp.laplacian = 1.0f; + gkp.size = kp.size; - gkp.octave = static_cast(kp.octave); - gkp.angle = kp.angle; - gkp.response = kp.response; + gkp.dir = kp.angle; + gkp.hessian = kp.response; } keypointsGPU.upload(keypointsCPU); } } +namespace +{ + int calcSize(int octave, int layer) + { + /* Wavelet size at first layer of first octave. */ + const int HAAR_SIZE0 = 9; + + /* Wavelet size increment between layers. This should be an even number, + such that the wavelet sizes in an octave are either all even or all odd. + This ensures that when looking for the neighbours of a sample, the layers + above and below are aligned correctly. */ + const int HAAR_SIZE_INC = 6; + + return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; + } + + int getPointOctave(const KeyPoint_GPU& kpt, const CvSURFParams& params) + { + int best_octave = 0; + float min_diff = numeric_limits::max(); + for (int octave = 1; octave < params.nOctaves; ++octave) + { + for (int layer = 0; layer < params.nOctaveLayers; ++layer) + { + float diff = std::abs(kpt.size - (float)calcSize(octave, layer)); + if (min_diff > diff) + { + min_diff = diff; + best_octave = octave; + if (min_diff == 0) + return best_octave; + } + } + } + return best_octave; + } +} + void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector& keypoints) { if (keypointsGPU.empty()) @@ -313,21 +333,23 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector(); - for (int i = 0; i < keypointsGPU.cols; ++i, ++keypoints_ptr, ++keypointsCPU_ptr) + for (int i = 0; i < keypointsGPU.cols; ++i) { - KeyPoint& kp = *keypoints_ptr; - const KeyPoint_GPU& gkp = *keypointsCPU_ptr; + KeyPoint& kp = keypoints[i]; + const KeyPoint_GPU& gkp = keypointsCPU.ptr()[i]; kp.pt.x = gkp.x; kp.pt.y = gkp.y; kp.size = gkp.size; - kp.octave = static_cast(gkp.octave); - kp.angle = gkp.angle; - kp.response = gkp.response; + kp.angle = gkp.dir; + + kp.response = gkp.hessian; + + kp.octave = getPointOctave(gkp, *this); + + kp.class_id = static_cast(gkp.laplacian); } } } @@ -353,23 +375,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat SURF_GPU_Invoker surf(*this, img, mask); surf.detectKeypoints(keypoints); - - surf.findOrientation(keypoints); } } void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints, bool calcOrientation) + bool useProvidedKeypoints) { if (!img.empty()) { SURF_GPU_Invoker surf(*this, img, mask); - + if (!useProvidedKeypoints) surf.detectKeypoints(keypoints); - - if (calcOrientation) - surf.findOrientation(keypoints); + else + { + GpuMat keypointsBuf; + surf.findOrientation(keypoints, keypointsBuf); + keypointsBuf.copyTo(keypoints); + } surf.computeDescriptors(keypoints, descriptors, descriptorSize()); } @@ -385,24 +408,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector } void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector& keypoints, - GpuMat& descriptors, bool useProvidedKeypoints, bool calcOrientation) + GpuMat& descriptors, bool useProvidedKeypoints) { GpuMat keypointsGPU; if (useProvidedKeypoints) uploadKeypoints(keypoints, keypointsGPU); - (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation); + (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints); downloadKeypoints(keypointsGPU, keypoints); } void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector& keypoints, - vector& descriptors, bool useProvidedKeypoints, bool calcOrientation) + vector& descriptors, bool useProvidedKeypoints) { GpuMat descriptorsGPU; - (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation); + (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints); downloadDescriptors(descriptorsGPU, descriptors); } diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp index c0f254e..12f7196 100644 --- a/modules/gpu/test/test_features2d.cpp +++ b/modules/gpu/test/test_features2d.cpp @@ -48,7 +48,6 @@ using namespace std; const string FEATURES2D_DIR = "features2d"; const string IMAGE_FILENAME = "aloe.png"; -const string VALID_FILE_NAME = "surf.xml.gz"; class CV_GPU_SURFTest : public cvtest::BaseTest { @@ -59,17 +58,20 @@ public: protected: bool isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2); + int getValidCount(const vector& keypoints1, const vector& keypoints2, const vector& matches); void compareKeypointSets(const vector& validKeypoints, const vector& calcKeypoints, const Mat& validDescriptors, const Mat& calcDescriptors); - void emptyDataTest(SURF_GPU& fdetector); - void regressionTest(SURF_GPU& fdetector); + void emptyDataTest(); + void accuracyTest(); virtual void run(int); }; -void CV_GPU_SURFTest::emptyDataTest(SURF_GPU& fdetector) +void CV_GPU_SURFTest::emptyDataTest() { + SURF_GPU fdetector; + GpuMat image; vector keypoints; vector descriptors; @@ -114,116 +116,80 @@ bool CV_GPU_SURFTest::isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2) p1.class_id == p2.class_id ); } -void CV_GPU_SURFTest::compareKeypointSets(const vector& validKeypoints, const vector& calcKeypoints, - const Mat& validDescriptors, const Mat& calcDescriptors) +int CV_GPU_SURFTest::getValidCount(const vector& keypoints1, const vector& keypoints2, + const vector& matches) { - if (validKeypoints.size() != calcKeypoints.size()) + int count = 0; + + for (size_t i = 0; i < matches.size(); ++i) { - ts->printf(cvtest::TS::LOG, "Keypoints sizes doesn't equal (validCount = %d, calcCount = %d).\n", - validKeypoints.size(), calcKeypoints.size()); - ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT); - return; + const DMatch& m = matches[i]; + + const KeyPoint& kp1 = keypoints1[m.queryIdx]; + const KeyPoint& kp2 = keypoints2[m.trainIdx]; + + if (isSimilarKeypoints(kp1, kp2)) + ++count; } - if (validDescriptors.size() != calcDescriptors.size()) + + return count; +} + +void CV_GPU_SURFTest::compareKeypointSets(const vector& validKeypoints, const vector& calcKeypoints, + const Mat& validDescriptors, const Mat& calcDescriptors) +{ + BruteForceMatcher< L2 > matcher; + vector matches; + + matcher.match(validDescriptors, calcDescriptors, matches); + + int validCount = getValidCount(validKeypoints, calcKeypoints, matches); + float validRatio = (float)validCount / matches.size(); + + if (validRatio < 0.5f) { - ts->printf(cvtest::TS::LOG, "Descriptors sizes doesn't equal.\n"); - ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT); + ts->printf(cvtest::TS::LOG, "Bad accuracy - %f.\n", validRatio); + ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY ); return; } - for (size_t v = 0; v < validKeypoints.size(); v++) - { - int nearestIdx = -1; - float minDist = std::numeric_limits::max(); - - for (size_t c = 0; c < calcKeypoints.size(); c++) - { - float curDist = (float)norm(calcKeypoints[c].pt - validKeypoints[v].pt); - if (curDist < minDist) - { - minDist = curDist; - nearestIdx = c; - } - } - - assert(minDist >= 0); - if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx])) - { - ts->printf(cvtest::TS::LOG, "Bad keypoints accuracy.\n"); - ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY ); - return; - } - - if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f) - { - ts->printf(cvtest::TS::LOG, "Bad descriptors accuracy.\n"); - ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY ); - return; - } - } } -void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector) +void CV_GPU_SURFTest::accuracyTest() { string imgFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + IMAGE_FILENAME; - string resFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + VALID_FILE_NAME; // Read the test image. - GpuMat image(imread(imgFilename, 0)); + Mat image = imread(imgFilename, 0); if (image.empty()) { ts->printf( cvtest::TS::LOG, "Image %s can not be read.\n", imgFilename.c_str() ); ts->set_failed_test_info( cvtest::TS::FAIL_INVALID_TEST_DATA ); return; } - - FileStorage fs(resFilename, FileStorage::READ); + + Mat mask(image.size(), CV_8UC1, Scalar::all(1)); + mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0)); // Compute keypoints. - GpuMat mask(image.size(), CV_8UC1, Scalar::all(1)); - mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0)); vector calcKeypoints; - GpuMat calcDespcriptors; - fdetector(image, mask, calcKeypoints, calcDespcriptors); - - if (fs.isOpened()) // Compare computed and valid keypoints. - { - // Read validation keypoints set. - vector validKeypoints; - Mat validDespcriptors; - read(fs["keypoints"], validKeypoints); - read(fs["descriptors"], validDespcriptors); - if (validKeypoints.empty() || validDespcriptors.empty()) - { - ts->printf(cvtest::TS::LOG, "Validation file can not be read.\n"); - ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA); - return; - } - - compareKeypointSets(validKeypoints, calcKeypoints, validDespcriptors, calcDespcriptors); - } - else // Write detector parameters and computed keypoints as validation data. - { - fs.open(resFilename, FileStorage::WRITE); - if (!fs.isOpened()) - { - ts->printf(cvtest::TS::LOG, "File %s can not be opened to write.\n", resFilename.c_str()); - ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA); - return; - } - else - { - write(fs, "keypoints", calcKeypoints); - write(fs, "descriptors", (Mat)calcDespcriptors); - } - } + GpuMat calcDescriptors; + SURF_GPU fdetector; fdetector.extended = false; + fdetector(GpuMat(image), GpuMat(mask), calcKeypoints, calcDescriptors); + + // Calc validation keypoints set. + vector validKeypoints; + vector validDescriptors; + SURF fdetector_gold; fdetector_gold.extended = false; + fdetector_gold(image, mask, validKeypoints, validDescriptors); + + compareKeypointSets(validKeypoints, calcKeypoints, + Mat(validKeypoints.size(), fdetector_gold.descriptorSize(), CV_32F, &validDescriptors[0]), calcDescriptors); } void CV_GPU_SURFTest::run( int /*start_from*/ ) { - SURF_GPU fdetector; - - emptyDataTest(fdetector); - regressionTest(fdetector); + emptyDataTest(); + accuracyTest(); } -TEST(SURF, empty_data_and_regression) { CV_GPU_SURFTest test; test.safe_run(); } +TEST(SURF, empty_data_and_accuracy) { CV_GPU_SURFTest test; test.safe_run(); } diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index 34af5ae..18b45f2 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -264,10 +264,11 @@ TEST(SURF) SURF surf; vector keypoints1, keypoints2; + vector descriptors1, descriptors2; CPU_ON; - surf(src1, Mat(), keypoints1); - surf(src2, Mat(), keypoints2); + surf(src1, Mat(), keypoints1, descriptors1); + surf(src2, Mat(), keypoints2, descriptors2); CPU_OFF; gpu::SURF_GPU d_surf; @@ -275,8 +276,8 @@ TEST(SURF) gpu::GpuMat d_descriptors1, d_descriptors2; GPU_ON; - d_surf(d_src1, gpu::GpuMat(), d_keypoints1); - d_surf(d_src2, gpu::GpuMat(), d_keypoints2); + d_surf(d_src1, gpu::GpuMat(), d_keypoints1, d_descriptors1); + d_surf(d_src2, gpu::GpuMat(), d_keypoints2, d_descriptors2); GPU_OFF; } diff --git a/samples/gpu/surf_keypoint_matcher.cpp b/samples/gpu/surf_keypoint_matcher.cpp index b2c9385..eb8da5f 100644 --- a/samples/gpu/surf_keypoint_matcher.cpp +++ b/samples/gpu/surf_keypoint_matcher.cpp @@ -51,10 +51,10 @@ int main(int argc, char* argv[]) vector keypoints1, keypoints2; vector descriptors1, descriptors2; vector matches; - SURF_GPU::downloadKeypoints(keypoints1GPU, keypoints1); - SURF_GPU::downloadKeypoints(keypoints2GPU, keypoints2); - SURF_GPU::downloadDescriptors(descriptors1GPU, descriptors1); - SURF_GPU::downloadDescriptors(descriptors2GPU, descriptors2); + surf.downloadKeypoints(keypoints1GPU, keypoints1); + surf.downloadKeypoints(keypoints2GPU, keypoints2); + surf.downloadDescriptors(descriptors1GPU, descriptors1); + surf.downloadDescriptors(descriptors2GPU, descriptors2); BruteForceMatcher_GPU< L2 >::matchDownload(trainIdx, distance, matches); // drawing the results -- 2.7.4