From 0b19f915bedf758bbfe06faa2a2ac1aeffeeff06 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 23 May 2011 07:59:20 +0000 Subject: [PATCH] minor optimization of SURF_GPU (reduce memory transfers, use structure of arrays instead of array of structures) --- .../gpu/doc/feature_detection_and_description.rst | 20 +-- modules/gpu/include/opencv2/gpu/gpu.hpp | 23 +++- modules/gpu/src/cuda/internal_shared.hpp | 23 +--- modules/gpu/src/cuda/surf.cu | 142 +++++++++---------- modules/gpu/src/surf.cpp | 151 ++++++++++----------- 5 files changed, 168 insertions(+), 191 deletions(-) diff --git a/modules/gpu/doc/feature_detection_and_description.rst b/modules/gpu/doc/feature_detection_and_description.rst index c043987..192f17f 100644 --- a/modules/gpu/doc/feature_detection_and_description.rst +++ b/modules/gpu/doc/feature_detection_and_description.rst @@ -15,6 +15,17 @@ This class is used for extracting Speeded Up Robust Features (SURF) from an imag class SURF_GPU : public CvSURFParams { public: + enum KeypointLayout + { + SF_X = 0, + SF_Y, + SF_LAPLACIAN, + SF_SIZE, + SF_DIR, + SF_HESSIAN, + SF_FEATURE_STRIDE + }; + //! the default constructor SURF_GPU(); //! the full constructor taking all the necessary parameters @@ -67,22 +78,15 @@ This class is used for extracting Speeded Up Robust Features (SURF) from an imag GpuMat det, trace; GpuMat maxPosBuffer; - GpuMat featuresBuffer; - GpuMat keypointsBuffer; }; The class ``SURF_GPU`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8 bit grayscale images are supported. -The class ``SURF_GPU`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``GpuMat`` . The ``keypoints`` matrix is a one-row matrix of the ``CV_32FC6`` type. It contains 6 float values per feature: ``x, y, laplacian, size, dir, hessian`` . The ``descriptors`` matrix is -:math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type. +The class ``SURF_GPU`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``GpuMat`` . The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 6` matrix with the ``CV_32FC1`` type. keypoints.ptr(SF_X)[i] will contain x coordinate of i'th feature. keypoints.ptr(SF_Y)[i] will contain y coordinate of i'th feature. keypoints.ptr(SF_LAPLACIAN)[i] will contain laplacian sign of i'th feature. keypoints.ptr(SF_SIZE)[i] will contain size of i'th feature. keypoints.ptr(SF_DIR)[i] will contain orientation of i'th feature. keypoints.ptr(SF_HESSIAN)[i] will contain response of i'th feature. The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type. The class ``SURF_GPU`` uses some buffers and provides access to it. All buffers can be safely released between function calls. 
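For illustration, a minimal host-side sketch of reading the new structure-of-arrays keypoint layout after downloading the ``GpuMat`` (the variable names ``keypointsGPU`` and ``i`` are placeholders, not part of the API): ::

    // download the SF_FEATURE_STRIDE x nFeatures CV_32FC1 keypoint matrix and read feature i
    cv::Mat keypointsCPU = keypointsGPU;

    float x       = keypointsCPU.ptr<float>(cv::gpu::SURF_GPU::SF_X)[i];
    float y       = keypointsCPU.ptr<float>(cv::gpu::SURF_GPU::SF_Y)[i];
    int   lap     = keypointsCPU.ptr<int>(cv::gpu::SURF_GPU::SF_LAPLACIAN)[i]; // laplacian sign is stored as int
    float size    = keypointsCPU.ptr<float>(cv::gpu::SURF_GPU::SF_SIZE)[i];
    float dir     = keypointsCPU.ptr<float>(cv::gpu::SURF_GPU::SF_DIR)[i];
    float hessian = keypointsCPU.ptr<float>(cv::gpu::SURF_GPU::SF_HESSIAN)[i];

``downloadKeypoints`` performs this conversion into ``std::vector<cv::KeyPoint>`` when the raw layout is not needed.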
-**Note:** - -By default for user provided keypoints the class ``SURF_GPU`` recalculates keypoint's orientation and returns reodered/filtered keypoints array and coresponding decriptors array. - See Also: :c:type:`SURF` .. index:: gpu::BruteForceMatcher_GPU diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 087627e..cbb0f37 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1566,6 +1566,17 @@ namespace cv class CV_EXPORTS SURF_GPU : public CvSURFParams { public: + enum KeypointLayout + { + SF_X = 0, + SF_Y, + SF_LAPLACIAN, + SF_SIZE, + SF_DIR, + SF_HESSIAN, + SF_FEATURE_STRIDE + }; + //! the default constructor SURF_GPU(); //! the full constructor taking all the necessary parameters @@ -1585,9 +1596,13 @@ namespace cv //! finds the keypoints using fast hessian detector used in SURF //! supports CV_8UC1 images - //! keypoints will have 1 row and type CV_32FC(6) - //! keypoints.at(1, i) contains i'th keypoint - //! format: (x, y, laplacian, size, dir, hessian) + //! keypoints will have nFeature cols and 6 rows + //! keypoints.ptr(SF_X)[i] will contain x coordinate of i'th feature + //! keypoints.ptr(SF_Y)[i] will contain y coordinate of i'th feature + //! keypoints.ptr(SF_LAPLACIAN)[i] will contain laplacian sign of i'th feature + //! keypoints.ptr(SF_SIZE)[i] will contain size of i'th feature + //! keypoints.ptr(SF_DIR)[i] will contain orientation of i'th feature + //! keypoints.ptr(SF_HESSIAN)[i] will contain response of i'th feature void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints); //! finds the keypoints and computes their descriptors. //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction @@ -1611,8 +1626,6 @@ namespace cv GpuMat det, trace; GpuMat maxPosBuffer; - GpuMat featuresBuffer; - GpuMat keypointsBuffer; }; } diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp index d2428c6..8c49563 100644 --- a/modules/gpu/src/cuda/internal_shared.hpp +++ b/modules/gpu/src/cuda/internal_shared.hpp @@ -105,28 +105,7 @@ namespace cv const textureReference* tex; cudaSafeCall( cudaGetTextureReference(&tex, name) ); cudaSafeCall( cudaUnbindTexture(tex) ); - } - - struct KeyPoint_GPU - { - float x; - float y; - float laplacian; - float size; - float dir; - float hessian; - }; - - enum KeypointLayout - { - SF_X, - SF_Y, - SF_LAPLACIAN, - SF_SIZE, - SF_DIR, - SF_HESSIAN, - SF_FEATURE_STRIDE - }; + } } } diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index cd61175..834bf43 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -63,8 +63,6 @@ namespace cv { namespace gpu { namespace surf __constant__ int c_max_candidates; // The maximum number of features that memory is reserved for. __constant__ int c_max_features; - // The maximum number of keypoints that memory is reserved for. - __constant__ int c_max_keypoints; // The image size. 
__constant__ int c_img_rows; __constant__ int c_img_cols; @@ -346,7 +344,9 @@ namespace cv { namespace gpu { namespace surf //////////////////////////////////////////////////////////////////////// // INTERPOLATION - __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter) + __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, + float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, + unsigned int* featureCounter) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 @@ -357,7 +357,6 @@ namespace cv { namespace gpu { namespace surf const int layer = maxPos.z - 1 + threadIdx.z; __shared__ float N9[3][3][3]; - __shared__ KeyPoint_GPU p; N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j]; __syncthreads(); @@ -422,33 +421,46 @@ namespace cv { namespace gpu { namespace surf if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f) { // if the step is within the interpolation region, perform it + + const int size = calcSize(c_octave, maxPos.z); - // Get a new feature index. - unsigned int ind = atomicInc(featureCounter, (unsigned int)-1); + const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave; + const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave; + + const float center_i = sum_i + (float)(size - 1) / 2; + const float center_j = sum_j + (float)(size - 1) / 2; - if (ind < c_max_features) - { - const int size = calcSize(c_octave, maxPos.z); + const float px = center_j + x[0] * (1 << c_octave); + const float py = center_i + x[1] * (1 << c_octave); - const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave; - const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave; - - const float center_i = sum_i + (float)(size - 1) / 2; - const float center_j = sum_j + (float)(size - 1) / 2; + const int ds = size - calcSize(c_octave, maxPos.z - 1); + const float psize = roundf(size + x[2] * ds); - p.x = center_j + x[0] * (1 << c_octave); - p.y = center_i + x[1] * (1 << c_octave); + /* The sampling intervals and wavelet sized for selecting an orientation + and building the keypoint descriptor are defined relative to 's' */ + const float s = psize * 1.2f / 9.0f; - int ds = size - calcSize(c_octave, maxPos.z - 1); - p.size = roundf(size + x[2] * ds); + /* To find the dominant orientation, the gradients in x and y are + sampled in a circle of radius 6s using wavelets of size 4s. + We ensure the gradient wavelet size is even to ensure the + wavelet pattern is balanced and symmetric around its center */ + const int grad_wav_size = 2 * __float2int_rn(2.0f * s); - p.laplacian = maxPos.w; - p.dir = 0.0f; - p.hessian = N9[1][1][1]; + // check when grad_wav_size is too big + if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) + { + // Get a new feature index. + unsigned int ind = atomicInc(featureCounter, (unsigned int)-1); - // Should we split up this transfer over many threads? - featuresBuffer[ind] = p; - } + if (ind < c_max_features) + { + featureX[ind] = px; + featureY[ind] = py; + featureLaplacian[ind] = maxPos.w; + featureSize[ind] = psize; + featureHessian[ind] = N9[1][1][1]; + } + } // grad_wav_size check } // If the subpixel interpolation worked } } // If this is thread 0. 
@@ -456,7 +468,9 @@ namespace cv { namespace gpu { namespace surf #endif } - void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter) + void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, + float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, + unsigned int* featureCounter) { dim3 threads; threads.x = 3; @@ -466,7 +480,7 @@ namespace cv { namespace gpu { namespace surf dim3 grid; grid.x = maxCounter; - icvInterpolateKeypoint<<>>(det, maxPosBuffer, featuresBuffer, featureCounter); + icvInterpolateKeypoint<<>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); @@ -486,7 +500,7 @@ namespace cv { namespace gpu { namespace surf __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; - __global__ void icvCalcOrientation(const KeyPoint_GPU* featureBuffer, KeyPoint_GPU* keypoints, unsigned int* keypointCounter) + __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 @@ -497,16 +511,9 @@ namespace cv { namespace gpu { namespace surf __shared__ float s_sumx[64 * 4]; __shared__ float s_sumy[64 * 4]; - __shared__ float s_feature[6]; - - if (threadIdx.x < 6 && threadIdx.y == 0) - s_feature[threadIdx.x] = ((float*)(&featureBuffer[blockIdx.x]))[threadIdx.x]; - __syncthreads(); - - /* The sampling intervals and wavelet sized for selecting an orientation and building the keypoint descriptor are defined relative to 's' */ - const float s = s_feature[SF_SIZE] * 1.2f / 9.0f; + const float s = featureSize[blockIdx.x] * 1.2f / 9.0f; /* To find the dominant orientation, the gradients in x and y are sampled in a circle of radius 6s using wavelets of size 4s. @@ -526,8 +533,8 @@ namespace cv { namespace gpu { namespace surf if (tid < ORI_SAMPLES) { const float margin = (float)(grad_wav_size - 1) / 2.0f; - const int x = __float2int_rn(s_feature[SF_X] + c_aptX[tid] * s - margin); - const int y = __float2int_rn(s_feature[SF_Y] + c_aptY[tid] * s - margin); + const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin); + const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin); if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size)) { @@ -646,26 +653,12 @@ namespace cv { namespace gpu { namespace surf if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0) { - // Get a new feature index. 
- unsigned int ind = atomicInc(keypointCounter, (unsigned int)-1); + float kp_dir = atan2f(besty, bestx); + if (kp_dir < 0) + kp_dir += 2.0f * CV_PI; + kp_dir *= 180.0f / CV_PI; - if (ind < c_max_keypoints) - { - float kp_dir = atan2f(besty, bestx); - if (kp_dir < 0) - kp_dir += 2.0f * CV_PI; - kp_dir *= 180.0f / CV_PI; - __shared__ KeyPoint_GPU kp; - - kp.x = s_feature[SF_X]; - kp.y = s_feature[SF_Y]; - kp.laplacian = s_feature[SF_LAPLACIAN]; - kp.size = s_feature[SF_SIZE]; - kp.dir = kp_dir; - kp.hessian = s_feature[SF_HESSIAN]; - - keypoints[ind] = kp; - } + featureDir[blockIdx.x] = kp_dir; } } @@ -676,7 +669,7 @@ namespace cv { namespace gpu { namespace surf #undef ORI_WIN #undef ORI_SAMPLES - void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter) + void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) { dim3 threads; threads.x = 64; @@ -685,7 +678,7 @@ namespace cv { namespace gpu { namespace surf dim3 grid; grid.x = nFeatures; - icvCalcOrientation<<>>(featureBuffer, keypoints, keypointCounter); + icvCalcOrientation<<>>(featureX, featureY, featureSize, featureDir); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); @@ -748,20 +741,16 @@ namespace cv { namespace gpu { namespace surf return saturate_cast(res); } - __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid) + __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], + const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, + int tid) { __shared__ float s_PATCH[6][6]; - // get the interest point parameters (x, y, size, response, angle) - __shared__ float s_pt[5]; - if (threadIdx.y == 0) - s_pt[threadIdx.x] = ((float*)(&keypoints[blockIdx.x]))[threadIdx.x]; - __syncthreads(); - - const float centerX = s_pt[SF_X]; - const float centerY = s_pt[SF_Y]; - const float size = s_pt[SF_SIZE]; - const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180); + const float centerX = featureX[blockIdx.x]; + const float centerY = featureY[blockIdx.x]; + const float size = featureSize[blockIdx.x]; + const float descriptor_dir = featureDir[blockIdx.x] * (float)(CV_PI / 180); /* The sampling intervals and wavelet sized for selecting an orientation and building the keypoint descriptor are defined relative to 's' */ @@ -838,7 +827,7 @@ namespace cv { namespace gpu { namespace surf } } - __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features) + __global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) { // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) __shared__ float sdx[25]; @@ -848,7 +837,7 @@ namespace cv { namespace gpu { namespace surf const int tid = threadIdx.y * blockDim.x + threadIdx.x; - calc_dx_dy(sdx, sdy, features, tid); + calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid); __syncthreads(); sdxabs[tid] = fabs(sdx[tid]); // |dx| array @@ -870,7 +859,7 @@ namespace cv { namespace gpu { namespace surf } } - __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features) + __global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) { // 2 floats (dx,dy) 
for each thread (5x5 sample points in each sub-region) __shared__ float sdx[25]; @@ -884,7 +873,7 @@ namespace cv { namespace gpu { namespace surf const int tid = threadIdx.y * blockDim.x + threadIdx.x; - calc_dx_dy(sdx, sdy, features, tid); + calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid); __syncthreads(); if (sdy[tid] >= 0) @@ -990,13 +979,14 @@ namespace cv { namespace gpu { namespace surf descriptor_base[threadIdx.x] = lookup / len; } - void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures) + void compute_descriptors_gpu(const DevMem2Df& descriptors, + const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures) { // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D if (descriptors.cols == 64) { - compute_descriptors64<<>>(descriptors, features); + compute_descriptors64<<>>(descriptors, featureX, featureY, featureSize, featureDir); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); @@ -1008,7 +998,7 @@ namespace cv { namespace gpu { namespace surf } else { - compute_descriptors128<<>>(descriptors, features); + compute_descriptors128<<>>(descriptors, featureX, featureY, featureSize, featureDir); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index 49fde58..2ec6b29 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -69,11 +69,14 @@ namespace cv { namespace gpu { namespace surf void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, int img_rows, int img_cols, int octave, bool use_mask, int nLayers); - void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter); + void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, + float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, + unsigned int* featureCounter); - void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter); + void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures); - void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures); + void compute_descriptors_gpu(const DevMem2Df& descriptors, + const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures); }}} using namespace cv::gpu::surf; @@ -88,7 +91,7 @@ namespace sum(surf.sum), mask1(surf.mask1), maskSum(surf.maskSum), intBuffer(surf.intBuffer), det(surf.det), trace(surf.trace), - maxPosBuffer(surf.maxPosBuffer), featuresBuffer(surf.featuresBuffer), keypointsBuffer(surf.keypointsBuffer), + maxPosBuffer(surf.maxPosBuffer), img_cols(img.cols), img_rows(img.rows), @@ -101,18 +104,16 @@ namespace CV_Assert(nOctaves > 0 && nOctaveLayers > 0); CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)); - maxKeypoints = min(static_cast(img.size().area() * surf.keypointsRatio), 65535); - maxFeatures = min(static_cast(1.5 * maxKeypoints), 65535); + maxFeatures = min(static_cast(img.size().area() * surf.keypointsRatio), 65535); maxCandidates = 
min(static_cast(1.5 * maxFeatures), 65535); - CV_Assert(maxKeypoints > 0); + CV_Assert(maxFeatures > 0); - cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 2) * sizeof(unsigned int)) ); - cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 2) * sizeof(unsigned int)) ); + cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 1) * sizeof(unsigned int)) ); + cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 1) * sizeof(unsigned int)) ); uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates); uploadConstant("cv::gpu::surf::c_max_features", maxFeatures); - uploadConstant("cv::gpu::surf::c_max_keypoints", maxKeypoints); uploadConstant("cv::gpu::surf::c_img_rows", img_rows); uploadConstant("cv::gpu::surf::c_img_cols", img_cols); uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers); @@ -148,7 +149,8 @@ namespace ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, trace); ensureSizeIsEnough(1, maxCandidates, CV_32SC4, maxPosBuffer); - ensureSizeIsEnough(1, maxFeatures, CV_32FC(6), featuresBuffer); + ensureSizeIsEnough(SURF_GPU::SF_FEATURE_STRIDE, maxFeatures, CV_32FC1, keypoints); + keypoints.setTo(Scalar::all(0)); for (int octave = 0; octave < nOctaves; ++octave) { @@ -161,60 +163,49 @@ namespace icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers); - icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr(), d_counters + 2 + octave, + icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr(), d_counters + 1 + octave, img_rows, img_cols, octave, use_mask, nOctaveLayers); unsigned int maxCounter; - cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 2 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); maxCounter = std::min(maxCounter, static_cast(maxCandidates)); if (maxCounter > 0) { icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr(), maxCounter, - featuresBuffer.ptr(), d_counters); + keypoints.ptr(SURF_GPU::SF_X), keypoints.ptr(SURF_GPU::SF_Y), + keypoints.ptr(SURF_GPU::SF_LAPLACIAN), keypoints.ptr(SURF_GPU::SF_SIZE), + keypoints.ptr(SURF_GPU::SF_HESSIAN), d_counters); } } unsigned int featureCounter; cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); featureCounter = std::min(featureCounter, static_cast(maxFeatures)); + keypoints.cols = featureCounter; + if (!upright) - findOrientation(featuresBuffer.colRange(0, featureCounter), keypoints); - else - { - if (featureCounter > 0) - featuresBuffer.colRange(0, featureCounter).copyTo(keypoints); - else - keypoints.release(); - } + findOrientation(keypoints); } - void findOrientation(const GpuMat& features, GpuMat& keypoints) + void findOrientation(GpuMat& keypoints) { - if (features.cols > 0) + const int nFeatures = keypoints.cols; + if (nFeatures > 0) { - ensureSizeIsEnough(1, maxKeypoints, CV_32FC(6), keypointsBuffer); - - icvCalcOrientation_gpu(features.ptr(), features.cols, keypointsBuffer.ptr(), - d_counters + 1); - - unsigned int keypointsCounter; - cudaSafeCall( cudaMemcpy(&keypointsCounter, d_counters + 1, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - keypointsCounter = std::min(keypointsCounter, static_cast(maxKeypoints)); - - if (keypointsCounter > 0) - keypointsBuffer.colRange(0, keypointsCounter).copyTo(keypoints); - else - keypoints.release(); + icvCalcOrientation_gpu(keypoints.ptr(SURF_GPU::SF_X), keypoints.ptr(SURF_GPU::SF_Y), + keypoints.ptr(SURF_GPU::SF_SIZE), 
keypoints.ptr(SURF_GPU::SF_DIR), nFeatures); } } void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize) { - if (keypoints.cols > 0) + const int nFeatures = keypoints.cols; + if (nFeatures > 0) { - descriptors.create(keypoints.cols, descriptorSize, CV_32F); - compute_descriptors_gpu(descriptors, keypoints.ptr(), keypoints.cols); + descriptors.create(nFeatures, descriptorSize, CV_32F); + compute_descriptors_gpu(descriptors, keypoints.ptr(SURF_GPU::SF_X), keypoints.ptr(SURF_GPU::SF_Y), + keypoints.ptr(SURF_GPU::SF_SIZE), keypoints.ptr(SURF_GPU::SF_DIR), nFeatures); } } @@ -228,8 +219,6 @@ namespace GpuMat& trace; GpuMat& maxPosBuffer; - GpuMat& featuresBuffer; - GpuMat& keypointsBuffer; int img_cols, img_rows; @@ -239,7 +228,6 @@ namespace int maxCandidates; int maxFeatures; - int maxKeypoints; unsigned int* d_counters; }; @@ -276,22 +264,24 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector& keypoints, GpuMa keypointsGPU.release(); else { - Mat keypointsCPU(1, keypoints.size(), CV_32FC(6)); + Mat keypointsCPU(SURF_GPU::SF_FEATURE_STRIDE, keypoints.size(), CV_32FC1); + + float* kp_x = keypointsCPU.ptr(SURF_GPU::SF_X); + float* kp_y = keypointsCPU.ptr(SURF_GPU::SF_Y); + int* kp_laplacian = keypointsCPU.ptr(SURF_GPU::SF_LAPLACIAN); + float* kp_size = keypointsCPU.ptr(SURF_GPU::SF_SIZE); + float* kp_dir = keypointsCPU.ptr(SURF_GPU::SF_DIR); + float* kp_hessian = keypointsCPU.ptr(SURF_GPU::SF_HESSIAN); - for (size_t i = 0; i < keypoints.size(); ++i) + for (size_t i = 0, size = keypoints.size(); i < size; ++i) { const KeyPoint& kp = keypoints[i]; - KeyPoint_GPU& gkp = keypointsCPU.ptr()[i]; - - gkp.x = kp.pt.x; - gkp.y = kp.pt.y; - - gkp.laplacian = 1.0f; - - gkp.size = kp.size; - - gkp.dir = kp.angle; - gkp.hessian = kp.response; + kp_x[i] = kp.pt.x; + kp_y[i] = kp.pt.y; + kp_size[i] = kp.size; + kp_dir[i] = kp.angle; + kp_hessian[i] = kp.response; + kp_laplacian[i] = 1; } keypointsGPU.upload(keypointsCPU); @@ -314,7 +304,7 @@ namespace return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } - int getPointOctave(const KeyPoint_GPU& kpt, const CvSURFParams& params) + int getPointOctave(float size, const CvSURFParams& params) { int best_octave = 0; float min_diff = numeric_limits::max(); @@ -322,7 +312,7 @@ namespace { for (int layer = 0; layer < params.nOctaveLayers; ++layer) { - float diff = std::abs(kpt.size - (float)calcSize(octave, layer)); + float diff = std::abs(size - (float)calcSize(octave, layer)); if (min_diff > diff) { min_diff = diff; @@ -338,32 +328,35 @@ namespace void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector& keypoints) { - if (keypointsGPU.empty()) + const int nFeatures = keypointsGPU.cols; + + if (nFeatures == 0) keypoints.clear(); else { - CV_Assert(keypointsGPU.type() == CV_32FC(6) && keypointsGPU.isContinuous()); - + CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == SF_FEATURE_STRIDE); + Mat keypointsCPU = keypointsGPU; - keypoints.resize(keypointsGPU.cols); + + keypoints.resize(nFeatures); - for (int i = 0; i < keypointsGPU.cols; ++i) + float* kp_x = keypointsCPU.ptr(SF_X); + float* kp_y = keypointsCPU.ptr(SF_Y); + int* kp_laplacian = keypointsCPU.ptr(SF_LAPLACIAN); + float* kp_size = keypointsCPU.ptr(SF_SIZE); + float* kp_dir = keypointsCPU.ptr(SF_DIR); + float* kp_hessian = keypointsCPU.ptr(SF_HESSIAN); + + for (int i = 0; i < nFeatures; ++i) { KeyPoint& kp = keypoints[i]; - const KeyPoint_GPU& gkp = keypointsCPU.ptr()[i]; - - kp.pt.x = gkp.x; - kp.pt.y = gkp.y; - - kp.size = gkp.size; - 
- kp.angle = gkp.dir; - - kp.response = gkp.hessian; - - kp.octave = getPointOctave(gkp, *this); - - kp.class_id = static_cast(gkp.laplacian); + kp.pt.x = kp_x[i]; + kp.pt.y = kp_y[i]; + kp.class_id = kp_laplacian[i]; + kp.size = kp_size[i]; + kp.angle = kp_dir[i]; + kp.response = kp_hessian[i]; + kp.octave = getPointOctave(kp.size, *this); } } } @@ -403,9 +396,7 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat surf.detectKeypoints(keypoints); else if (!upright) { - GpuMat keypointsBuf; - surf.findOrientation(keypoints, keypointsBuf); - keypointsBuf.copyTo(keypoints); + surf.findOrientation(keypoints); } surf.computeDescriptors(keypoints, descriptors, descriptorSize()); -- 2.7.4
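For context, a minimal usage sketch of the class after this patch (file name and mask argument are illustrative; error handling omitted; assumes the keypoint-plus-descriptor overload of operator() whose implementation is updated above):

    #include <iostream>
    #include <vector>
    #include <opencv2/highgui/highgui.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat img = cv::imread("scene.png", 0);                   // 8-bit grayscale input is required
        cv::gpu::GpuMat d_img(img), d_keypoints, d_descriptors;

        cv::gpu::SURF_GPU surf;
        surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors); // detect keypoints and compute descriptors on the GPU

        std::vector<cv::KeyPoint> keypoints;
        surf.downloadKeypoints(d_keypoints, keypoints);             // convert the SoA GpuMat layout to cv::KeyPoint

        std::cout << keypoints.size() << " keypoints detected" << std::endl;
        return 0;
    }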