made GPU version of SURF more consistent with CPU one
authorVladislav Vinogradov <no@email>
Thu, 10 Mar 2011 13:53:58 +0000 (13:53 +0000)
committerVladislav Vinogradov <no@email>
Thu, 10 Mar 2011 13:53:58 +0000 (13:53 +0000)
modules/gpu/include/opencv2/gpu/gpu.hpp
modules/gpu/src/cuda/internal_shared.hpp
modules/gpu/src/cuda/surf.cu
modules/gpu/src/surf.cpp
modules/gpu/test/test_features2d.cpp
samples/gpu/performance/tests.cpp
samples/gpu/surf_keypoint_matcher.cpp

index 33fa3b4..854b89d 100644 (file)
@@ -1537,83 +1537,55 @@ namespace cv
         };\r
         \r
         ////////////////////////////////// SURF //////////////////////////////////////////\r
-        \r
-        struct CV_EXPORTS SURFParams_GPU \r
-        {\r
-            SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f), \r
-                l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f),\r
-                edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {}\r
-\r
-            //! The interest operator threshold\r
-            float threshold;\r
-            //! The number of octaves to process\r
-            int nOctaves;\r
-            //! The number of intervals in each octave\r
-            int nIntervals;\r
-            //! The scale associated with the first interval of the first octave\r
-            float initialScale;\r
-\r
-            //! mask parameter l_1\r
-            float l1;\r
-            //! mask parameter l_2 \r
-            float l2;\r
-            //! mask parameter l_3\r
-            float l3;\r
-            //! mask parameter l_4\r
-            float l4;\r
-            //! The amount to scale the edge rejection mask\r
-            float edgeScale;\r
-            //! The initial sampling step in pixels.\r
-            int initialStep;\r
-\r
-            //! True, if generate 128-len descriptors, false - 64-len descriptors\r
-            bool extended;\r
-\r
-            //! max features = featuresRatio * img.size().srea()\r
-            float featuresRatio;\r
-        };\r
 \r
-        class CV_EXPORTS SURF_GPU : public SURFParams_GPU\r
+        class CV_EXPORTS SURF_GPU : public CvSURFParams\r
         {\r
         public:\r
+            //! the default constructor\r
+            SURF_GPU();\r
+            //! the full constructor taking all the necessary parameters\r
+            explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,\r
+                 int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f);\r
+\r
             //! returns the descriptor size in float's (64 or 128)\r
             int descriptorSize() const;\r
 \r
             //! upload host keypoints to device memory\r
-            static void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);\r
+            void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);\r
             //! download keypoints from device to host memory\r
-            static void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);\r
+            void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);\r
 \r
             //! download descriptors from device to host memory\r
-            static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);\r
+            void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);\r
             \r
             //! finds the keypoints using fast hessian detector used in SURF\r
             //! supports CV_8UC1 images\r
             //! keypoints will have 1 row and type CV_32FC(6)\r
             //! keypoints.at<float[6]>(1, i) contains i'th keypoint\r
-            //! format: (x, y, size, response, angle, octave)\r
+            //! format: (x, y, laplacian, size, dir, hessian)\r
             void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);\r
             //! finds the keypoints and computes their descriptors. \r
             //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction\r
             void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, \r
-                bool useProvidedKeypoints = false, bool calcOrientation = true);\r
+                bool useProvidedKeypoints = false);\r
         \r
             void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);\r
             void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, \r
-                bool useProvidedKeypoints = false, bool calcOrientation = true);\r
+                bool useProvidedKeypoints = false);\r
             \r
             void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, \r
-                bool useProvidedKeypoints = false, bool calcOrientation = true);\r
+                bool useProvidedKeypoints = false);\r
+\r
+            //! max keypoints = keypointsRatio * img.size().area()\r
+            float keypointsRatio;\r
 \r
-            GpuMat sum;\r
-            GpuMat sumf;\r
+            GpuMat sum, mask1, maskSum, intBuffer;\r
 \r
-            GpuMat mask1;\r
-            GpuMat maskSum;\r
+            GpuMat det, trace;\r
 \r
-            GpuMat hessianBuffer;\r
             GpuMat maxPosBuffer;\r
             GpuMat featuresBuffer;\r
+            GpuMat keypointsBuffer;\r
         };\r
 \r
     }\r
index e82700e..d2428c6 100644 (file)
@@ -111,20 +111,20 @@ namespace cv
         {\r
             float x;\r
             float y;\r
+            float laplacian;\r
             float size;\r
-            float response;\r
-            float angle;\r
-            float octave;\r
+            float dir;\r
+            float hessian;\r
         };\r
 \r
         enum KeypointLayout \r
         {\r
             SF_X,\r
             SF_Y,\r
+            SF_LAPLACIAN,\r
             SF_SIZE,\r
-            SF_RESPONSE,\r
-            SF_ANGLE,\r
-            SF_OCTAVE,\r
+            SF_DIR,\r
+            SF_HESSIAN,\r
             SF_FEATURE_STRIDE\r
         };\r
     }\r
index d8abf1b..bd2a1d9 100644 (file)
@@ -56,267 +56,132 @@ using namespace cv::gpu::device;
 namespace cv { namespace gpu { namespace surf\r
 {\r
     ////////////////////////////////////////////////////////////////////////\r
-    // Help funcs\r
-\r
-    // Wrapper for host reference to pass into kernel\r
-    template <typename T> \r
-    class DeviceReference\r
-    {\r
-    public:\r
-        explicit DeviceReference(T& host_val) : d_ptr(0), h_ptr(&host_val)\r
-        {\r
-            cudaSafeCall( cudaMalloc((void**)&d_ptr, sizeof(T)) );\r
-            cudaSafeCall( cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice) );\r
-        }\r
-\r
-        ~DeviceReference()\r
-        {            \r
-            cudaSafeCall( cudaMemcpy(h_ptr, d_ptr, sizeof(T), cudaMemcpyDeviceToHost) );\r
-            cudaSafeCall( cudaFree(d_ptr) );\r
-        }\r
-\r
-        // Casting to device pointer\r
-        operator T*() {return d_ptr;}\r
-        operator const T*() const {return d_ptr;}\r
-    private:\r
-        T* d_ptr;\r
-        T* h_ptr;\r
-    };\r
-\r
-    __device__ void clearLastBit(int* f)\r
-    {\r
-        *f &= ~0x1;\r
-    }\r
-    __device__ void clearLastBit(float& f)\r
-    {\r
-        clearLastBit((int*)&f);\r
-    }\r
-\r
-    __device__ void setLastBit(int* f)\r
-    {\r
-        *f |= 0x1;\r
-    }\r
-    __device__ void setLastBit(float& f)\r
-    {\r
-        setLastBit((int*)&f);\r
-    }\r
-\r
-    ////////////////////////////////////////////////////////////////////////\r
     // Global parameters\r
 \r
     // The maximum number of features (before subpixel interpolation) that memory is reserved for.\r
     __constant__ int c_max_candidates;\r
     // The maximum number of features that memory is reserved for.\r
     __constant__ int c_max_features;\r
-    // The number of intervals in the octave.\r
-    __constant__ int c_nIntervals;\r
-    // Mask sizes derived from the mask parameters\r
-    __constant__ float c_mask_width;\r
-    // Mask sizes derived from the mask parameters\r
-    __constant__ float c_mask_height;\r
-    // Mask sizes derived from the mask parameters\r
-    __constant__ float c_dxy_center_offset;\r
-    // Mask sizes derived from the mask parameters\r
-    __constant__ float c_dxy_half_width;\r
-    // Mask sizes derived from the mask parameters\r
-    __constant__ float c_dxy_scale;\r
-    // The scale associated with the first interval of the first octave\r
-    __constant__ float c_initialScale;\r
-    // The interest operator threshold\r
-    __constant__ float c_threshold;\r
-\r
-    // Ther octave\r
+    // The maximum number of keypoints that memory is reserved for.\r
+    __constant__ int c_max_keypoints;\r
+    // The image size.\r
+    __constant__ int c_img_rows;\r
+    __constant__ int c_img_cols;\r
+    // The number of layers.\r
+    __constant__ int c_nOctaveLayers;\r
+    // The hessian threshold.\r
+    __constant__ float c_hessianThreshold;\r
+\r
+    // The current octave.\r
     __constant__ int c_octave;\r
-    // The width of the octave buffer.\r
-    __constant__ int c_x_size;\r
-    // The height of the octave buffer.\r
-    __constant__ int c_y_size;\r
-    // The size of the octave border in pixels.\r
-    __constant__ int c_border;\r
-    // The step size used in this octave in pixels.\r
-    __constant__ int c_step;\r
+    // The current layer size.\r
+    __constant__ int c_layer_rows;\r
+    __constant__ int c_layer_cols;\r
 \r
     ////////////////////////////////////////////////////////////////////////\r
     // Integral image texture\r
 \r
-    texture<float, 2, cudaReadModeElementType> sumTex(0, cudaFilterModeLinear, cudaAddressModeClamp);\r
+    typedef texture<unsigned int, 2, cudaReadModeElementType> IntTex;\r
 \r
-    __device__ float iiAreaLookupCDHalfWH(float cx, float cy, float halfWidth, float halfHeight)\r
-    {\r
-        float result = 0.f;\r
-\r
-        result += tex2D(sumTex, cx - halfWidth, cy - halfHeight);\r
-        result -= tex2D(sumTex, cx + halfWidth, cy - halfHeight);\r
-        result -= tex2D(sumTex, cx - halfWidth, cy + halfHeight);\r
-        result += tex2D(sumTex, cx + halfWidth, cy + halfHeight);\r
-\r
-        return result;\r
-    }\r
-\r
-    ////////////////////////////////////////////////////////////////////////\r
-    // Hessian\r
+    IntTex sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
 \r
-    __device__ float evalDyy(float x, float y, float t, float mask_width, float mask_height, float fscale)\r
+    template <int N>\r
+    __device__ float icvCalcHaarPattern(const IntTex& tex, const float src[][5], int oldSize, int newSize, int y, int x)\r
     {\r
-        float Dyy = 0.f;\r
-\r
-        Dyy +=     iiAreaLookupCDHalfWH(x, y, mask_width, mask_height);\r
-        Dyy -= t * iiAreaLookupCDHalfWH(x, y, mask_width, fscale);\r
-\r
-        Dyy *=  1.0f / (fscale * fscale);\r
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200\r
+        typedef double real_t;        \r
+        #else\r
+        typedef float  real_t;\r
+        #endif\r
 \r
-        return Dyy;\r
-    }\r
+        float ratio = (float)newSize / oldSize;\r
+        \r
+        real_t d = 0;\r
 \r
-    __device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale)\r
-    {\r
-        float Dxx = 0.f;\r
+        #pragma unroll\r
+        for (int k = 0; k < N; ++k)\r
+        {\r
+            int dx1 = __float2int_rn(ratio * src[k][0]);\r
+            int dy1 = __float2int_rn(ratio * src[k][1]);\r
+            int dx2 = __float2int_rn(ratio * src[k][2]);\r
+            int dy2 = __float2int_rn(ratio * src[k][3]);\r
 \r
-        Dxx +=     iiAreaLookupCDHalfWH(x, y, mask_height, mask_width);\r
-        Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale     , mask_width);\r
+            real_t t = 0;\r
+            t += tex2D(tex, x + dx1, y + dy1);\r
+            t -= tex2D(tex, x + dx1, y + dy2);\r
+            t -= tex2D(tex, x + dx2, y + dy1);\r
+            t += tex2D(tex, x + dx2, y + dy2);\r
 \r
-        Dxx *=  1.0f / (fscale * fscale);\r
+            d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));\r
+        }\r
 \r
-        return Dxx;\r
+        return (float)d;\r
     }\r
 \r
-    __device__ float evalDxy(float x, float y, float fscale)\r
-    {\r
-        float center_offset =  c_dxy_center_offset  * fscale;\r
-        float half_width    =  c_dxy_half_width     * fscale;\r
-\r
-        float Dxy = 0.f;\r
-\r
-        Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width);\r
-        Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width);\r
-        Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width);\r
-        Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width);\r
-\r
-        Dxy *= 1.0f / (fscale * fscale);\r
+    ////////////////////////////////////////////////////////////////////////\r
+    // Hessian\r
 \r
-        return Dxy;\r
-    }\r
+    __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };\r
+    __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };\r
+    __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };\r
 \r
-    __device__ float calcScale(int hidx_z)\r
+    __host__ __device__ int calcSize(int octave, int layer)\r
     {\r
-        float d = (c_initialScale * (1 << c_octave)) / (c_nIntervals - 2);\r
-        return c_initialScale * (1 << c_octave) + d * (hidx_z - 1.0f) + 0.5f;\r
-    }\r
-    \r
-    __global__ void fasthessian(PtrStepf hessianBuffer)\r
-    {\r
-           // Determine the indices in the Hessian buffer\r
-        int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;\r
-        int hidx_y = threadIdx.y + blockIdx.y * blockDim.y;\r
-        int hidx_z = threadIdx.z;\r
-\r
-        float fscale = calcScale(hidx_z);\r
+        /* Wavelet size at first layer of first octave. */\r
+        const int HAAR_SIZE0 = 9;\r
 \r
-        // Compute the lookup location of the mask center\r
-        float x = hidx_x * c_step + c_border;\r
-        float y = hidx_y * c_step + c_border;\r
+        /* Wavelet size increment between layers. This should be an even number,\r
+         such that the wavelet sizes in an octave are either all even or all odd.\r
+         This ensures that when looking for the neighbours of a sample, the layers\r
+         above and below are aligned correctly. */\r
+        const int HAAR_SIZE_INC = 6;\r
 \r
-        // Scale the mask dimensions according to the scale\r
-        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)\r
-        {\r
-            float mask_width =  c_mask_width  * fscale;\r
-            float mask_height = c_mask_height * fscale;\r
-\r
-            // Compute the filter responses\r
-            float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);\r
-            float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);\r
-            float Dxy = evalDxy(x, y, fscale);\r
-\r
-            // Combine the responses and store the Laplacian sign\r
-            float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);\r
-\r
-            if (Dxx + Dyy > 0.f)\r
-                setLastBit(result);\r
-            else\r
-                clearLastBit(result);\r
-\r
-            hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
-        }\r
+        return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;\r
     }\r
 \r
-    __global__ void fasthessian_old(PtrStepf hessianBuffer)\r
+    __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)\r
     {\r
-           // Determine the indices in the Hessian buffer\r
-        int gridDim_y = gridDim.y / c_nIntervals;\r
-        int blockIdx_y = blockIdx.y % gridDim_y;\r
-        int blockIdx_z = blockIdx.y / gridDim_y;\r
-\r
-        int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;\r
-        int hidx_y = threadIdx.y + blockIdx_y * blockDim.y;\r
-        int hidx_z = blockIdx_z;\r
+        // Determine the indices\r
+        const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);\r
+        const int blockIdx_y = blockIdx.y % gridDim_y;\r
+        const int blockIdx_z = blockIdx.y / gridDim_y;\r
 \r
-        float fscale = calcScale(hidx_z);\r
+        const int j = threadIdx.x + blockIdx.x * blockDim.x;\r
+        const int i = threadIdx.y + blockIdx_y * blockDim.y;\r
+        const int layer = blockIdx_z;\r
 \r
-        // Compute the lookup location of the mask center\r
-        float x = hidx_x * c_step + c_border;\r
-        float y = hidx_y * c_step + c_border;\r
+        const int size = calcSize(c_octave, layer);\r
 \r
-        // Scale the mask dimensions according to the scale\r
-        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)\r
-        {\r
-            float mask_width =  c_mask_width  * fscale;\r
-            float mask_height = c_mask_height * fscale;\r
-\r
-            // Compute the filter responses\r
-            float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);\r
-            float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);\r
-            float Dxy = evalDxy(x, y, fscale);\r
+        const int samples_i = 1 + ((c_img_rows - size) >> c_octave);\r
+        const int samples_j = 1 + ((c_img_cols - size) >> c_octave);\r
 \r
-            // Combine the responses and store the Laplacian sign\r
-            float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);\r
+        /* Ignore pixels where some of the kernel is outside the image */\r
+        const int margin = (size >> 1) >> c_octave;\r
 \r
-            if (Dxx + Dyy > 0.f)\r
-                setLastBit(result);\r
-            else\r
-                clearLastBit(result);\r
+        if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)\r
+        {\r
+            const float dx  = icvCalcHaarPattern<3>(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);\r
+            const float dy  = icvCalcHaarPattern<3>(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);\r
+            const float dxy = icvCalcHaarPattern<4>(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);\r
 \r
-            hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
+            det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;\r
+            trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;\r
         }\r
     }\r
 \r
-    dim3 calcBlockSize(int nIntervals)\r
-    {\r
-        int threadsPerBlock = 512;\r
-        \r
-        dim3 threads;        \r
-        threads.z = nIntervals;\r
-        threadsPerBlock /= nIntervals;\r
-        if (threadsPerBlock >= 48)\r
-            threads.x = 16;\r
-        else\r
-            threads.x = 8;\r
-        threadsPerBlock /= threads.x;\r
-        threads.y = threadsPerBlock;\r
-        \r
-        return threads;\r
-    }\r
-\r
-    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)\r
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)\r
     {\r
-        dim3 grid;\r
-        grid.x = divUp(x_size, threads.x);\r
-        grid.y = divUp(y_size, threads.y);\r
-        \r
-        fasthessian<<<grid, threads>>>(hessianBuffer);\r
-        cudaSafeCall( cudaGetLastError() );\r
+        const int min_size = calcSize(octave, 0);\r
+        const int max_samples_i = 1 + ((img_rows - min_size) >> octave);\r
+        const int max_samples_j = 1 + ((img_cols - min_size) >> octave);\r
 \r
-        cudaSafeCall( cudaThreadSynchronize() );\r
-    }\r
-\r
-    void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)\r
-    {\r
         dim3 threads(16, 16);\r
 \r
         dim3 grid;\r
-        grid.x = divUp(x_size, threads.x);\r
-        grid.y = divUp(y_size, threads.y) * threadsOld.z;\r
-        \r
-        fasthessian_old<<<grid, threads>>>(hessianBuffer);\r
+        grid.x = divUp(max_samples_j, threads.x);\r
+        grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);\r
+\r
+        icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
@@ -325,109 +190,113 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////\r
     // NONMAX\r
     \r
-    texture<int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+    IntTex maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
 \r
     struct WithOutMask\r
     {\r
-        static __device__ bool check(float, float, float)\r
+        static __device__ bool check(int, int, int)\r
         {\r
             return true;\r
         }\r
     };\r
+\r
+    __constant__ float c_DM[1][5] = {{0, 0, 9, 9, 1}};\r
+\r
     struct WithMask\r
     {\r
-        static __device__ bool check(float x, float y, float fscale)\r
+        static __device__ bool check(int sum_i, int sum_j, int size)\r
         {\r
-            float half_width = fscale / 2;\r
-\r
-            float result = 0.f;\r
-\r
-            result += tex2D(maskSumTex, x - half_width, y - half_width);\r
-            result -= tex2D(maskSumTex, x + half_width, y - half_width);\r
-            result -= tex2D(maskSumTex, x - half_width, y + half_width);\r
-            result += tex2D(maskSumTex, x + half_width, y + half_width);\r
-\r
-            result /= (fscale * fscale);\r
-\r
-            return (result >= 0.5f);\r
+            float mval = icvCalcHaarPattern<1>(maskSumTex, c_DM , 9, size, sum_i, sum_j);\r
+            return (mval >= 0.5);\r
         }\r
     };\r
 \r
     template <typename Mask>\r
-    __global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)\r
+    __global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)\r
     {\r
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
 \r
-        extern __shared__ float fh_vals[];\r
+        extern __shared__ float N9[];\r
 \r
         // The hidx variables are the indices to the hessian buffer.\r
-        int hidx_x = threadIdx.x + blockIdx.x * (blockDim.x - 2);\r
-        int hidx_y = threadIdx.y + blockIdx.y * (blockDim.y - 2);\r
-        int hidx_z = threadIdx.z;\r
-        int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;\r
+        const int gridDim_y = gridDim.y / c_nOctaveLayers;\r
+        const int blockIdx_y = blockIdx.y % gridDim_y;\r
+        const int blockIdx_z = blockIdx.y / gridDim_y;\r
 \r
-        // Is this thread within the hessian buffer?\r
-        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)\r
-        {\r
-            fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];\r
-        }\r
-        __syncthreads();\r
+        const int layer = blockIdx_z + 1;\r
+\r
+        const int size = calcSize(c_octave, layer);\r
 \r
-        // Is this location one of the ones being processed for nonmax suppression.\r
-        // Blocks overlap by one so we don't process the border threads.\r
-        bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1 \r
-            &&           threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < c_y_size - 1\r
-            &&           threadIdx.z > 0 && threadIdx.z < blockDim.z-1;\r
+        /* Ignore pixels without a 3x3x3 neighbourhood in the layer above */\r
+        const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;\r
 \r
-        float val = fh_vals[localLin];\r
+        const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;\r
+        const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;\r
 \r
-        // Compute the lookup location of the mask center\r
-        float x = hidx_x * c_step + c_border;\r
-        float y = hidx_y * c_step + c_border;\r
-        float fscale = calcScale(hidx_z);\r
+        // Is this thread within the hessian buffer?\r
+        const int zoff = blockDim.x * blockDim.y;\r
+        const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;\r
+        N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + i)[j];\r
+        N9[localLin       ] = det.ptr(c_layer_rows * (layer    ) + i)[j];\r
+        N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + i)[j];\r
+        __syncthreads();\r
 \r
-        if (inBounds2 && val >= c_threshold && Mask::check(x, y, fscale))\r
+        if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)\r
         {\r
-            // Check to see if we have a max (in its 26 neighbours)\r
-            int zoff = blockDim.x * blockDim.y;\r
-            bool condmax  =  val > fh_vals[localLin                     + 1]\r
-            &&               val > fh_vals[localLin                     - 1]\r
-            &&               val > fh_vals[localLin        - blockDim.x + 1]\r
-            &&               val > fh_vals[localLin        - blockDim.x    ]\r
-            &&               val > fh_vals[localLin        - blockDim.x - 1]\r
-            &&               val > fh_vals[localLin        + blockDim.x + 1]\r
-            &&               val > fh_vals[localLin        + blockDim.x    ]\r
-            &&               val > fh_vals[localLin        + blockDim.x - 1]\r
-\r
-            &&               val > fh_vals[localLin - zoff              + 1]\r
-            &&               val > fh_vals[localLin - zoff                 ]\r
-            &&               val > fh_vals[localLin - zoff              - 1]\r
-            &&               val > fh_vals[localLin - zoff - blockDim.x + 1]\r
-            &&               val > fh_vals[localLin - zoff - blockDim.x    ]\r
-            &&               val > fh_vals[localLin - zoff - blockDim.x - 1]\r
-            &&               val > fh_vals[localLin - zoff + blockDim.x + 1]\r
-            &&               val > fh_vals[localLin - zoff + blockDim.x    ]\r
-            &&               val > fh_vals[localLin - zoff + blockDim.x - 1]\r
-\r
-            &&               val > fh_vals[localLin + zoff              + 1]\r
-            &&               val > fh_vals[localLin + zoff                 ]\r
-            &&               val > fh_vals[localLin + zoff              - 1]\r
-            &&               val > fh_vals[localLin + zoff - blockDim.x + 1]\r
-            &&               val > fh_vals[localLin + zoff - blockDim.x    ]\r
-            &&               val > fh_vals[localLin + zoff - blockDim.x - 1]\r
-            &&               val > fh_vals[localLin + zoff + blockDim.x + 1]\r
-            &&               val > fh_vals[localLin + zoff + blockDim.x    ]\r
-            &&               val > fh_vals[localLin + zoff + blockDim.x - 1]\r
-            ;\r
-\r
-            if(condmax) \r
+            float val0 = N9[localLin];\r
+\r
+            if (val0 > c_hessianThreshold)\r
             {\r
-                unsigned int i = atomicInc(maxCounter,(unsigned int) -1);\r
+                // Coordinates for the start of the wavelet in the sum image. There\r
+                // is some integer division involved, so don't try to simplify this\r
+                // (cancel out sampleStep) without checking the result is the same\r
+                const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;\r
+                const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;\r
 \r
-                if (i < c_max_candidates) \r
+                if (Mask::check(sum_i, sum_j, size))\r
                 {\r
-                    int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave};\r
-                    maxPosBuffer[i] = f;\r
+                    // Check to see if we have a max (in its 26 neighbours)\r
+                    const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]\r
+                    &&                   val0 > N9[localLin     - blockDim.x - zoff]\r
+                    &&                   val0 > N9[localLin + 1 - blockDim.x - zoff]\r
+                    &&                   val0 > N9[localLin - 1              - zoff]\r
+                    &&                   val0 > N9[localLin                  - zoff]\r
+                    &&                   val0 > N9[localLin + 1              - zoff]\r
+                    &&                   val0 > N9[localLin - 1 + blockDim.x - zoff]\r
+                    &&                   val0 > N9[localLin     + blockDim.x - zoff]\r
+                    &&                   val0 > N9[localLin + 1 + blockDim.x - zoff]\r
+\r
+                    &&                   val0 > N9[localLin - 1 - blockDim.x]\r
+                    &&                   val0 > N9[localLin     - blockDim.x]\r
+                    &&                   val0 > N9[localLin + 1 - blockDim.x]\r
+                    &&                   val0 > N9[localLin - 1             ]\r
+                    &&                   val0 > N9[localLin + 1             ]\r
+                    &&                   val0 > N9[localLin - 1 + blockDim.x]\r
+                    &&                   val0 > N9[localLin     + blockDim.x]\r
+                    &&                   val0 > N9[localLin + 1 + blockDim.x]\r
+\r
+                    &&                   val0 > N9[localLin - 1 - blockDim.x + zoff]\r
+                    &&                   val0 > N9[localLin     - blockDim.x + zoff]\r
+                    &&                   val0 > N9[localLin + 1 - blockDim.x + zoff]\r
+                    &&                   val0 > N9[localLin - 1              + zoff]\r
+                    &&                   val0 > N9[localLin                  + zoff]\r
+                    &&                   val0 > N9[localLin + 1              + zoff]\r
+                    &&                   val0 > N9[localLin - 1 + blockDim.x + zoff]\r
+                    &&                   val0 > N9[localLin     + blockDim.x + zoff]\r
+                    &&                   val0 > N9[localLin + 1 + blockDim.x + zoff]\r
+                    ;\r
+\r
+                    if(condmax)\r
+                    {\r
+                        unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);\r
+\r
+                        if (ind < c_max_candidates)\r
+                        {\r
+                            const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);\r
+\r
+                            maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);\r
+                        }\r
+                    }\r
                 }\r
             }\r
         }\r
@@ -435,21 +304,26 @@ namespace cv { namespace gpu { namespace surf
         #endif\r
     }\r
 \r
-    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
-        int x_size, int y_size, bool use_mask, const dim3& threads)\r
+    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
+        int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)\r
     {\r
-        dim3 grid;\r
-        grid.x = divUp(x_size, threads.x - 2);\r
-        grid.y = divUp(y_size, threads.y - 2);\r
+        const int layer_rows = img_rows >> octave;\r
+        const int layer_cols = img_cols >> octave;\r
+\r
+        int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;\r
+\r
+        dim3 threads(16, 16);\r
 \r
-        const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);\r
+        dim3 grid;\r
+        grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);\r
+        grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;\r
 \r
-        DeviceReference<unsigned int> maxCounterWrapper(maxCounter);\r
+        const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);\r
 \r
         if (use_mask)\r
-            nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
+            icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
         else\r
-            nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
+            icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
 \r
         cudaSafeCall( cudaGetLastError() );\r
 \r
@@ -459,166 +333,117 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////\r
     // INTERPOLATION\r
     \r
-    #define MID_IDX 1\r
-    __global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer, \r
-        KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)\r
+    __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)\r
     {\r
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
 \r
-        int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x;\r
-        int hidx_y = maxPosBuffer[blockIdx.x].y - 1 + threadIdx.y;\r
-        int hidx_z = maxPosBuffer[blockIdx.x].z - 1 + threadIdx.z;\r
+        const int4 maxPos = maxPosBuffer[blockIdx.x];\r
+\r
+        const int j = maxPos.x - 1 + threadIdx.x;\r
+        const int i = maxPos.y - 1 + threadIdx.y;\r
+        const int layer = maxPos.z - 1 + threadIdx.z;\r
 \r
-        __shared__ float fh_vals[3][3][3];\r
+        __shared__ float N9[3][3][3];\r
         __shared__ KeyPoint_GPU p;\r
 \r
-        fh_vals[threadIdx.z][threadIdx.y][threadIdx.x] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];\r
+        N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];\r
         __syncthreads();\r
 \r
         if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)\r
         {\r
-            __shared__ float H[3][3];\r
-\r
-            //dxx\r
-            H[0][0] =    fh_vals[MID_IDX    ][MID_IDX + 1][MID_IDX    ] \r
-            -       2.0f*fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX    ]\r
-            +            fh_vals[MID_IDX    ][MID_IDX - 1][MID_IDX    ];\r
+            __shared__ float dD[3];\r
 \r
-            //dyy\r
-            H[1][1] =    fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX + 1]\r
-            -       2.0f*fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX    ]\r
-            +            fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX - 1];\r
+            //dx\r
+            dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);\r
+            //dy\r
+            dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);\r
+            //ds\r
+            dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);\r
 \r
-            //dss\r
-            H[2][2] =    fh_vals[MID_IDX + 1][MID_IDX    ][MID_IDX    ] \r
-            -       2.0f*fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX    ]\r
-            +            fh_vals[MID_IDX - 1][MID_IDX    ][MID_IDX    ];\r
+            __shared__ float H[3][3];\r
 \r
+            //dxx\r
+            H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];\r
             //dxy\r
-            H[0][1]= 0.25f*\r
-                (fh_vals[MID_IDX    ][MID_IDX + 1][MID_IDX + 1] -\r
-                 fh_vals[MID_IDX    ][MID_IDX - 1][MID_IDX + 1] -\r
-                 fh_vals[MID_IDX    ][MID_IDX + 1][MID_IDX - 1] +\r
-                 fh_vals[MID_IDX    ][MID_IDX - 1][MID_IDX - 1]);\r
-\r
+            H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);\r
             //dxs\r
-            H[0][2]= 0.25f*\r
-                (fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX    ] -\r
-                 fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX    ] -\r
-                 fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX    ] +\r
-                 fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX    ]);\r
-\r
-            //dys\r
-            H[1][2]= 0.25f*\r
-                (fh_vals[MID_IDX + 1][MID_IDX    ][MID_IDX + 1] -\r
-                 fh_vals[MID_IDX + 1][MID_IDX    ][MID_IDX - 1] -\r
-                 fh_vals[MID_IDX - 1][MID_IDX    ][MID_IDX + 1] +\r
-                 fh_vals[MID_IDX - 1][MID_IDX    ][MID_IDX - 1]);\r
-\r
+            H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);\r
             //dyx = dxy\r
             H[1][0] = H[0][1];\r
-\r
+            //dyy\r
+            H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];\r
+            //dys\r
+            H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);\r
             //dsx = dxs\r
             H[2][0] = H[0][2];\r
-\r
             //dsy = dys\r
             H[2][1] = H[1][2];\r
+            //dss\r
+            H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];\r
 \r
-            __shared__ float dD[3];\r
+            float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1])\r
+              - H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0])\r
+              + H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]);\r
 \r
-            //dx\r
-            dD[0] = 0.5f*(fh_vals[MID_IDX    ][MID_IDX + 1][MID_IDX    ] -\r
-                fh_vals[MID_IDX    ][MID_IDX - 1][MID_IDX    ]);\r
-            //dy\r
-            dD[1] = 0.5f*(fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX + 1] -\r
-                fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX - 1]);\r
-            //ds\r
-            dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX    ][MID_IDX    ] -\r
-                fh_vals[MID_IDX - 1][MID_IDX    ][MID_IDX    ]);\r
-\r
-            __shared__ float invdet;\r
-            invdet = 1.f /\r
-                (\r
-                H[0][0]*H[1][1]*H[2][2] \r
-                +   H[0][1]*H[1][2]*H[2][0]\r
-                +   H[0][2]*H[1][0]*H[2][1]\r
-                -   H[0][0]*H[1][2]*H[2][1]\r
-                -   H[0][1]*H[1][0]*H[2][2]\r
-                -   H[0][2]*H[1][1]*H[2][0]\r
-                );\r
-\r
-            //   // 1-based entries of a 3x3 inverse\r
-            //   /*             [ |a22 a23|   |a12 a13|  |a12 a13|]     */\r
-            //   /*             [ |a32 a33|  -|a32 a33|  |a22 a23|]     */\r
-            //   /*             [                                 ]     */\r
-            //   /*             [ |a21 a23|   |a11 a13|  |a11 a13|]     */\r
-            //   /*    A^(-1) = [-|a31 a33|   |a31 a33| -|a21 a23|] / d */\r
-            //   /*             [                                 ]     */\r
-            //   /*             [ |a21 a22|   |a11 a12|  |a11 a12|]     */\r
-            //   /*             [ |a31 a32|  -|a31 a32|  |a21 a22|]     */\r
-\r
-            __shared__ float Hinv[3][3];\r
-            Hinv[0][0] =  invdet*(H[1][1]*H[2][2]-H[1][2]*H[2][1]);\r
-            Hinv[0][1] = -invdet*(H[0][1]*H[2][2]-H[0][2]*H[2][1]);\r
-            Hinv[0][2] =  invdet*(H[0][1]*H[1][2]-H[0][2]*H[1][1]);\r
-\r
-            Hinv[1][0] = -invdet*(H[1][0]*H[2][2]-H[1][2]*H[2][0]);\r
-            Hinv[1][1] =  invdet*(H[0][0]*H[2][2]-H[0][2]*H[2][0]);\r
-            Hinv[1][2] = -invdet*(H[0][0]*H[1][2]-H[0][2]*H[1][0]);\r
-\r
-            Hinv[2][0] =  invdet*(H[1][0]*H[2][1]-H[1][1]*H[2][0]);\r
-            Hinv[2][1] = -invdet*(H[0][0]*H[2][1]-H[0][1]*H[2][0]);\r
-            Hinv[2][2] =  invdet*(H[0][0]*H[1][1]-H[0][1]*H[1][0]);\r
-\r
-            __shared__ float x[3];\r
-\r
-            x[0] = -(Hinv[0][0]*(dD[0]) + Hinv[0][1]*(dD[1]) + Hinv[0][2]*(dD[2]));\r
-            x[1] = -(Hinv[1][0]*(dD[0]) + Hinv[1][1]*(dD[1]) + Hinv[1][2]*(dD[2]));\r
-            x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2]));\r
-\r
-            if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f) \r
+            if (det != 0.0f)\r
             {\r
-                // if the step is within the interpolation region, perform it\r
+                float invdet = 1.0f / det;\r
 \r
-                // Get a new feature index.\r
-                unsigned int i = atomicInc(featureCounter, (unsigned int)-1);\r
+                __shared__ float x[3];\r
 \r
-                if (i < c_max_features)\r
+                x[0] = invdet * \r
+                    (dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) -\r
+                     H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) +\r
+                     H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2]));\r
+\r
+                x[1] = invdet * \r
+                    (H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) -\r
+                     dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) +\r
+                     H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0]));\r
+\r
+                x[2] = invdet * \r
+                    (H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) -\r
+                     H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) +\r
+                     dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]));\r
+\r
+                if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)\r
                 {\r
-                    p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border;\r
-                    p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;\r
+                    // if the step is within the interpolation region, perform it\r
 \r
-                    if (x[2] > 0)\r
-                    {\r
-                        float a = calcScale(maxPosBuffer[blockIdx.x].z);\r
-                        float b = calcScale(maxPosBuffer[blockIdx.x].z + 1);\r
+                    // Get a new feature index.\r
+                    unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);\r
 \r
-                        p.size = (1.f - x[2]) * a + x[2] * b;\r
-                    }\r
-                    else\r
+                    if (ind < c_max_features)\r
                     {\r
-                        float a = calcScale(maxPosBuffer[blockIdx.x].z);\r
-                        float b = calcScale(maxPosBuffer[blockIdx.x].z - 1);\r
+                        const int size = calcSize(c_octave, maxPos.z);\r
 \r
-                        p.size = (1.f + x[2]) * a - x[2] * b;\r
-                    }\r
+                        const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;\r
+                        const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;\r
+                        \r
+                        const float center_i = sum_i + (float)(size - 1) / 2;\r
+                        const float center_j = sum_j + (float)(size - 1) / 2;\r
 \r
-                    p.octave = c_octave;\r
+                        p.x = center_j + x[0] * (1 << c_octave);\r
+                        p.y = center_i + x[1] * (1 << c_octave);\r
 \r
-                    p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX];\r
+                        int ds = size - calcSize(c_octave, maxPos.z - 1);\r
+                        p.size = roundf(size + x[2] * ds);\r
 \r
-                    // Should we split up this transfer over many threads?\r
-                    featuresBuffer[i] = p;\r
-                }\r
-            } // If the subpixel interpolation worked\r
+                        p.laplacian = maxPos.w;\r
+                        p.dir = 0.0f;\r
+                        p.hessian = N9[1][1][1];\r
+\r
+                        // Should we split up this transfer over many threads?\r
+                        featuresBuffer[ind] = p;\r
+                    }\r
+                } // If the subpixel interpolation worked\r
+            }\r
         } // If this is thread 0.\r
 \r
         #endif\r
     }\r
-    #undef MID_IDX\r
 \r
-    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, \r
-        KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter)\r
+    void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)\r
     {\r
         dim3 threads;\r
         threads.x = 3;\r
@@ -628,9 +453,7 @@ namespace cv { namespace gpu { namespace surf
         dim3 grid;\r
         grid.x = maxCounter;\r
 \r
-        DeviceReference<unsigned int> featureCounterWrapper(featureCounter);\r
-\r
-        fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);\r
+        icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featuresBuffer, featureCounter);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
@@ -639,139 +462,217 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////\r
     // Orientation\r
 \r
-    // precomputed values for a Gaussian with a standard deviation of 2\r
-    __constant__ float c_gauss1D[13] = \r
-    {\r
-        0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f, \r
-        0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f, \r
-        0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f, \r
-        0.002215924206f\r
-    };\r
+    #define ORI_SEARCH_INC 5\r
+    #define ORI_WIN        60\r
+    #define ORI_SAMPLES    113\r
 \r
-    __global__ void find_orientation(KeyPoint_GPU* features)\r
-    {\r
-        int tid = threadIdx.y * 17 + threadIdx.x;\r
-        int tid2 = numeric_limits_gpu<int>::max();\r
+    __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};\r
+    __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};\r
+    __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};\r
+    \r
+    __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};\r
+    __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};\r
 \r
-        if (threadIdx.x < 13 && threadIdx.y < 13) \r
-        {\r
-            tid2 = threadIdx.y * 13 + threadIdx.x;\r
-        }\r
+    __global__ void icvCalcOrientation(const KeyPoint_GPU* featureBuffer, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)\r
+    {        \r
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
 \r
-        __shared__ float texLookups[17][17];\r
+        __shared__ float s_X[128];\r
+        __shared__ float s_Y[128];\r
+        __shared__ float s_angle[128];\r
 \r
-        __shared__ float Edx[13*13];\r
-        __shared__ float Edy[13*13];\r
-        __shared__ float xys[3];\r
+        __shared__ float s_sumx[64 * 4];\r
+        __shared__ float s_sumy[64 * 4];\r
 \r
-        // Read my x, y, size.\r
-        if (tid < 3)\r
-        {\r
-            xys[tid] = ((float*)(&features[blockIdx.x]))[tid];\r
-        }\r
+        __shared__ float s_feature[6];\r
+\r
+        if (threadIdx.x < 6 && threadIdx.y == 0)\r
+            s_feature[threadIdx.x] = ((float*)(&featureBuffer[blockIdx.x]))[threadIdx.x];\r
         __syncthreads();\r
 \r
-        // Read all texture locations into memory\r
-        // Maybe I should use __mul24 here?\r
-        texLookups[threadIdx.x][threadIdx.y] = tex2D(sumTex, xys[SF_X] + ((int)threadIdx.x - 8) * xys[SF_SIZE], \r
-                  xys[SF_Y] + ((int)threadIdx.y - 8) * xys[SF_SIZE]);\r
 \r
-        __syncthreads();\r
+        /* The sampling intervals and wavelet sized for selecting an orientation\r
+         and building the keypoint descriptor are defined relative to 's' */\r
+        const float s = s_feature[SF_SIZE] * 1.2f / 9.0f;\r
 \r
-        float dx = 0.f;\r
-        float dy = 0.f;\r
+        /* To find the dominant orientation, the gradients in x and y are\r
+         sampled in a circle of radius 6s using wavelets of size 4s.\r
+         We ensure the gradient wavelet size is even to ensure the\r
+         wavelet pattern is balanced and symmetric around its center */\r
+        const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
 \r
-        // Computes lookups for all points in a 13x13 lattice.\r
-        // - SURF says to only use a circle, but the branching logic would slow it down\r
-        // - Gaussian weighting should reduce the effects of the outer points anyway\r
-        if (tid2 < 169)\r
+        // check when grad_wav_size is too big\r
+        if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
         {\r
-            dx -=     texLookups[threadIdx.x    ][threadIdx.y    ];\r
-            dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y    ];\r
-            dx -=     texLookups[threadIdx.x + 4][threadIdx.y    ];\r
-            dx +=     texLookups[threadIdx.x    ][threadIdx.y + 4];\r
-            dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];\r
-            dx +=     texLookups[threadIdx.x + 4][threadIdx.y + 4];\r
-\r
-            dy -=     texLookups[threadIdx.x    ][threadIdx.y    ];\r
-            dy += 2.f*texLookups[threadIdx.x    ][threadIdx.y + 2];\r
-            dy -=     texLookups[threadIdx.x    ][threadIdx.y + 4];\r
-            dy +=     texLookups[threadIdx.x + 4][threadIdx.y    ];\r
-            dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];\r
-            dy +=     texLookups[threadIdx.x + 4][threadIdx.y + 4];\r
-\r
-            float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y];\r
-\r
-            Edx[tid2] = dx * g;\r
-            Edy[tid2] = dy * g;\r
-        }\r
+            // Calc X, Y, angle and store it to shared memory\r
+            {\r
+                const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
 \r
-        __syncthreads();\r
+                float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
 \r
-        // This is a scan to get the summed dx, dy values.\r
-        // Gets 128-168\r
-        if (tid < 41)\r
-        {\r
-            Edx[tid] += Edx[tid + 128];\r
-        } \r
-        __syncthreads();\r
-        if (tid < 64)\r
-        {\r
-            Edx[tid] += Edx[tid + 64];\r
-        }\r
-        __syncthreads();\r
-        if (tid < 32)\r
-        {\r
-            volatile float* smem = Edx;\r
-\r
-            smem[tid] += smem[tid + 32];\r
-            smem[tid] += smem[tid + 16];\r
-            smem[tid] += smem[tid + 8];\r
-            smem[tid] += smem[tid + 4];\r
-            smem[tid] += smem[tid + 2];\r
-            smem[tid] += smem[tid + 1];\r
-        }\r
+                if (tid < ORI_SAMPLES)\r
+                {\r
+                    const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
+                    const int x = __float2int_rn(s_feature[SF_X] + c_aptX[tid] * s - margin);\r
+                    const int y = __float2int_rn(s_feature[SF_Y] + c_aptY[tid] * s - margin);\r
 \r
-        // Gets 128-168\r
-        if (tid < 41) \r
-        {\r
-            Edy[tid] += Edy[tid + 128];\r
-        }\r
-        __syncthreads();\r
-        if (tid < 64)\r
-        {\r
-            Edy[tid] += Edy[tid + 64];\r
-        }\r
-        __syncthreads();\r
-        if (tid < 32)\r
-        {\r
-            volatile float* smem = Edy;\r
-\r
-            smem[tid] += smem[tid + 32];\r
-            smem[tid] += smem[tid + 16];\r
-            smem[tid] += smem[tid + 8];\r
-            smem[tid] += smem[tid + 4];\r
-            smem[tid] += smem[tid + 2];\r
-            smem[tid] += smem[tid + 1];\r
-        }\r
+                    if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
+                    {\r
+                        X = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NX, 4, grad_wav_size, y, x);\r
+                        Y = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NY, 4, grad_wav_size, y, x);\r
+                    \r
+                        angle = atan2f(Y, X);\r
+                        if (angle < 0)\r
+                            angle += 2.0f * CV_PI;\r
+                        angle *= 180.0f / CV_PI;\r
+                    }\r
+                }\r
+                if (tid < 128)\r
+                {\r
+                    s_X[tid] = X;\r
+                    s_Y[tid] = Y;\r
+                    s_angle[tid] = angle;\r
+                }\r
+            }\r
+            __syncthreads();\r
 \r
-        // Thread 0 saves back the result.\r
-        if (tid == 0)\r
-        {\r
-            features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI);\r
+            float bestx = 0, besty = 0, best_mod = 0;\r
+\r
+            #pragma unroll\r
+            for (int i = 0; i < 18; ++i)\r
+            {\r
+                const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;\r
+\r
+                float sumx = 0.0f, sumy = 0.0f;\r
+                int d = abs(__float2int_rn(s_angle[threadIdx.x]) - dir);\r
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+                {\r
+                    sumx = s_X[threadIdx.x];\r
+                    sumy = s_Y[threadIdx.x];\r
+                }\r
+                d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);\r
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+                {\r
+                    sumx += s_X[threadIdx.x + 64];\r
+                    sumy += s_Y[threadIdx.x + 64];\r
+                }\r
+\r
+                float* s_sumx_row = s_sumx + threadIdx.y * 64;\r
+                float* s_sumy_row = s_sumy + threadIdx.y * 64;\r
+\r
+                s_sumx_row[threadIdx.x] = sumx;\r
+                s_sumy_row[threadIdx.x] = sumy;\r
+                __syncthreads();\r
+\r
+                if (threadIdx.x < 32)\r
+                {\r
+                    volatile float* v_sumx_row = s_sumx_row;\r
+                    volatile float* v_sumy_row = s_sumy_row;\r
+\r
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];\r
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];\r
+\r
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];\r
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];\r
+\r
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];\r
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];\r
+\r
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];\r
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];\r
+\r
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];\r
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];\r
+\r
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];\r
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];\r
+                }\r
+\r
+                const float temp_mod = sumx * sumx + sumy * sumy;\r
+                if (temp_mod > best_mod)\r
+                {\r
+                    best_mod = temp_mod;\r
+                    bestx = sumx;\r
+                    besty = sumy;\r
+                }\r
+                __syncthreads();\r
+            }\r
+\r
+            if (threadIdx.x == 0)\r
+            {\r
+                s_X[threadIdx.y] = bestx;\r
+                s_Y[threadIdx.y] = besty;\r
+                s_angle[threadIdx.y] = best_mod;\r
+            }\r
+            __syncthreads();\r
+\r
+            if (threadIdx.x < 2 && threadIdx.y == 0)\r
+            {\r
+                volatile float* v_x = s_X;\r
+                volatile float* v_y = s_Y;\r
+                volatile float* v_mod = s_angle;\r
+\r
+                bestx = v_x[threadIdx.x];\r
+                besty = v_y[threadIdx.x];\r
+                best_mod = v_mod[threadIdx.x];\r
+\r
+                float temp_mod = v_mod[threadIdx.x + 2];\r
+                if (temp_mod > best_mod)\r
+                {\r
+                    v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];\r
+                    v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];\r
+                    v_mod[threadIdx.x] = best_mod = temp_mod;\r
+                }\r
+                temp_mod = v_mod[threadIdx.x + 1];\r
+                if (temp_mod > best_mod)\r
+                {\r
+                    v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];\r
+                    v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];\r
+                }\r
+            }\r
+\r
+            if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)\r
+            {\r
+                // Get a new feature index.\r
+                unsigned int ind = atomicInc(keypointCounter, (unsigned int)-1);\r
+\r
+                if (ind < c_max_keypoints)\r
+                {\r
+                    float kp_dir = atan2f(besty, bestx);\r
+                    if (kp_dir < 0)\r
+                        kp_dir += 2.0f * CV_PI;\r
+                    kp_dir *= 180.0f / CV_PI;\r
+                    __shared__ KeyPoint_GPU kp;\r
+\r
+                    kp.x = s_feature[SF_X];\r
+                    kp.y = s_feature[SF_Y];\r
+                    kp.laplacian = s_feature[SF_LAPLACIAN];\r
+                    kp.size = s_feature[SF_SIZE];\r
+                    kp.dir = kp_dir;\r
+                    kp.hessian = s_feature[SF_HESSIAN];\r
+\r
+                    keypoints[ind] = kp;\r
+                }\r
+            }\r
         }\r
+\r
+        #endif\r
     }\r
 \r
-    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures) \r
+    #undef ORI_SEARCH_INC\r
+    #undef ORI_WIN\r
+    #undef ORI_SAMPLES\r
+\r
+    void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter) \r
     {\r
         dim3 threads;\r
-        threads.x = 17;\r
-        threads.y = 17;\r
+        threads.x = 64;\r
+        threads.y = 4;\r
 \r
         dim3 grid;\r
         grid.x = nFeatures;\r
 \r
-        find_orientation<<<grid, threads>>>(features);\r
+        icvCalcOrientation<<<grid, threads>>>(featureBuffer, keypoints, keypointCounter);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
@@ -780,117 +681,119 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////\r
     // Descriptors\r
 \r
-    // precomputed values for a Gaussian with a standard deviation of 3.3\r
-    // - it appears SURF uses a different value, but not sure what it is\r
-    __constant__ float c_3p3gauss1D[20] =\r
-    {\r
-        0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,\r
-        0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,\r
-        0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,\r
-        0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f\r
+    #define PATCH_SZ 20\r
+\r
+    texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+\r
+    __constant__ float c_DW[PATCH_SZ * PATCH_SZ] = \r
+    {\r
+        3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, \r
+        8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
+        1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
+        3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
+        5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
+        9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
+        0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
+        0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
+        0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
+        0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
+        0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
+        0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
+        0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
+        0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
+        9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
+        5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
+        3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
+        1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
+        8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
+        3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f\r
     };\r
 \r
-    template <int BLOCK_DIM_X>\r
-    __global__ void normalize_descriptors(PtrStepf descriptors)\r
+    __device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2)\r
     {\r
-        // no need for thread ID\r
-        float* descriptor_base = descriptors.ptr(blockIdx.x);\r
+        const float centerX = s_pt[SF_X];\r
+        const float centerY = s_pt[SF_Y];\r
+        const float size = s_pt[SF_SIZE];\r
+        const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);\r
 \r
-        // read in the unnormalized descriptor values (squared)\r
-        __shared__ float sqDesc[BLOCK_DIM_X];\r
-        const float lookup = descriptor_base[threadIdx.x];\r
-        sqDesc[threadIdx.x] = lookup * lookup;\r
-        __syncthreads();\r
+        /* The sampling intervals and wavelet sizes for selecting an orientation\r
+         and building the keypoint descriptor are defined relative to 's' */\r
+        const float s = size * 1.2f / 9.0f;\r
 \r
-        if (BLOCK_DIM_X >= 128)\r
-        {\r
-            if (threadIdx.x < 64)\r
-                sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];\r
-            __syncthreads();\r
-        }\r
+        /* Extract a window of pixels around the keypoint of size 20s */\r
+        const int win_size = (int)((PATCH_SZ + 1) * s);\r
 \r
-        // reduction to get total\r
-        if (threadIdx.x < 32)\r
-        {\r
-            volatile float* smem = sqDesc;\r
+        float sin_dir;\r
+        float cos_dir;\r
+        sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
 \r
-            smem[threadIdx.x] += smem[threadIdx.x + 32];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 16];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 8];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 4];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 2];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 1];\r
-        }\r
+        /* Nearest neighbour version (faster) */\r
+        const float win_offset = -(float)(win_size - 1) / 2; \r
 \r
-        // compute length (square root)\r
-        __shared__ float len;\r
-        if (threadIdx.x == 0)\r
-        {\r
-            len = sqrtf(sqDesc[0]);\r
-        }\r
-        __syncthreads();\r
+        /* Scale the window to size PATCH_SZ so each pixel's size is s. This\r
+           makes calculating the gradients with wavelets of size 2s easy */\r
+        const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;\r
+        const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size;\r
 \r
-        // normalize and store in output\r
-        descriptor_base[threadIdx.x] = lookup / len;\r
-    }\r
+        const int i = __float2int_rd(icoo);\r
+        const int j = __float2int_rd(jcoo);\r
 \r
-    __device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt,\r
-                               int xIndex, int yIndex, int tid)\r
-    {\r
-        float sin_theta, cos_theta;\r
-        sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);\r
-\r
-        // Compute rotated sampling points\r
-        // (clockwise rotation since we are rotating the lattice)\r
-        // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)\r
-        const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]\r
-            + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);\r
-        const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]\r
-            + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);\r
-\r
-        // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)\r
-        // a b c\r
-        // d   f\r
-        // g h i\r
-        const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);\r
-        const float b = tex2D(sumTex, sample_x,                sample_y - ipt[SF_SIZE]);\r
-        const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);\r
-        const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);\r
-        const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);\r
-        const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);\r
-        const float h = tex2D(sumTex, sample_x,                sample_y + ipt[SF_SIZE]);\r
-        const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);\r
-\r
-        // compute axis-aligned HaarX, HaarY\r
-        // (could group the additions together into multiplications)\r
-        const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)\r
-        const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i));            // unrotated dx\r
-        const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i));            // unrotated dy\r
-\r
-        // rotate responses (store all dxs then all dys)\r
-        // - counterclockwise rotation to rotate back to zero orientation\r
-        sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx\r
-        sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy\r
+        float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
+        float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
+\r
+        float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo);\r
+\r
+        pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir;\r
+        pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir;\r
+\r
+        res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo);\r
+\r
+        pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir;\r
+        pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir;\r
+\r
+        res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j);\r
+        \r
+        pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir;\r
+        pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir;\r
+\r
+        res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j);\r
+\r
+        s_PATCH[i2][j2] = (unsigned char)res;\r
     }\r
 \r
-    __device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)\r
+    __device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)\r
     {\r
         // get the interest point parameters (x, y, size, response, angle)\r
-        __shared__ float ipt[5];\r
-        if (threadIdx.x < 5 && threadIdx.y == 0)\r
+        __shared__ float s_pt[5];\r
+        if (tid < 5)\r
         {\r
-            ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];\r
+            s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid];\r
         }\r
         __syncthreads();\r
 \r
         // Compute sampling points\r
         // since grids are 2D, need to compute xBlock and yBlock indices\r
-        const int xBlock = (threadIdx.y & 3);  // threadIdx.y % 4\r
-        const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4)\r
-        const int xIndex = (xBlock * 5) + (threadIdx.x % 5);\r
-        const int yIndex = (yBlock * 5) + (threadIdx.x / 5);\r
+        const int xBlock = (blockIdx.y & 3);  // blockIdx.y % 4\r
+        const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
+        const int xIndex = xBlock * blockDim.x + threadIdx.x;\r
+        const int yIndex = yBlock * blockDim.y + threadIdx.y;\r
+\r
+        calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x);\r
+        if (threadIdx.x == 0)\r
+            calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5);\r
+        if (threadIdx.y == 0)\r
+            calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x);\r
+        if (threadIdx.x == 0 && threadIdx.y == 0)\r
+            calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5);\r
+        __syncthreads();\r
+\r
+        const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
+\r
+        const float vx = (s_PATCH[threadIdx.y    ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x    ]) * dw;\r
+        const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x    ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y    ][threadIdx.x + 1]) * dw;\r
 \r
-        calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x);\r
+        s_dx_bin[tid] = vx;\r
+        s_dy_bin[tid] = vy;\r
     }\r
 \r
     __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,\r
@@ -934,192 +837,8 @@ namespace cv { namespace gpu { namespace surf
     // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location\r
     __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)\r
     {\r
-        // 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region)\r
-        __shared__ float sdx   [16 * 25];\r
-        __shared__ float sdy   [16 * 25];\r
-        __shared__ float sdxabs[16 * 25];\r
-        __shared__ float sdyabs[16 * 25];\r
-\r
-        __shared__ float sdesc[64];\r
-\r
-        float* sdx_bin    = sdx    + (threadIdx.y * 25);\r
-        float* sdy_bin    = sdy    + (threadIdx.y * 25);\r
-        float* sdxabs_bin = sdxabs + (threadIdx.y * 25);\r
-        float* sdyabs_bin = sdyabs + (threadIdx.y * 25);\r
-\r
-        calc_dx_dy(sdx_bin, sdy_bin, features);\r
-        __syncthreads();\r
-\r
-        sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array\r
-        sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array\r
-        __syncthreads();\r
-\r
-        reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x);\r
-        __syncthreads();\r
-\r
-        float* sdesc_bin = sdesc + (threadIdx.y << 2);\r
-\r
-        // write dx, dy, |dx|, |dy|\r
-        if (threadIdx.x == 0)\r
-        {\r
-            sdesc_bin[0] = sdx_bin[0];\r
-            sdesc_bin[1] = sdy_bin[0];\r
-            sdesc_bin[2] = sdxabs_bin[0];\r
-            sdesc_bin[3] = sdyabs_bin[0];\r
-        }\r
-        __syncthreads();\r
-\r
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-        if (tid < 64)\r
-            descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];\r
-    }\r
-\r
-    // Spawn 16 blocks per interest point\r
-    // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location\r
-    __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)\r
-    {\r
-        // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
-        __shared__ float sdx[16 * 25];\r
-        __shared__ float sdy[16 * 25];\r
-\r
-        // sum (reduce) 5x5 area response\r
-        __shared__ float sd1[16 * 25];\r
-        __shared__ float sd2[16 * 25];\r
-        __shared__ float sdabs1[16 * 25];\r
-        __shared__ float sdabs2[16 * 25];\r
-\r
-        __shared__ float sdesc[128];\r
-\r
-        float* sdx_bin    = sdx    + (threadIdx.y * 25);\r
-        float* sdy_bin    = sdy    + (threadIdx.y * 25);\r
-        float* sd1_bin    = sd1    + (threadIdx.y * 25);\r
-        float* sd2_bin    = sd2    + (threadIdx.y * 25);\r
-        float* sdabs1_bin = sdabs1 + (threadIdx.y * 25);\r
-        float* sdabs2_bin = sdabs2 + (threadIdx.y * 25);\r
-\r
-        calc_dx_dy(sdx_bin, sdy_bin, features);\r
-        __syncthreads();\r
-\r
-        if (sdy_bin[threadIdx.x] >= 0)\r
-        {\r
-            sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x];\r
-            sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);\r
-            sd2_bin[threadIdx.x] = 0;\r
-            sdabs2_bin[threadIdx.x] = 0;\r
-        }\r
-        else\r
-        {\r
-            sd1_bin[threadIdx.x] = 0;\r
-            sdabs1_bin[threadIdx.x] = 0;\r
-            sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x];\r
-            sdabs2_bin[threadIdx.x] = fabs(sdx[threadIdx.x]);\r
-        }\r
-        __syncthreads();\r
-\r
-        reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);\r
-        __syncthreads();\r
-\r
-        float* sdesc_bin = sdesc + (threadIdx.y << 3);\r
-\r
-        // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
-        if (threadIdx.x == 0)\r
-        {\r
-                sdesc_bin[0] = sd1_bin[0];\r
-                sdesc_bin[1] = sdabs1_bin[0];\r
-                sdesc_bin[2] = sd2_bin[0];\r
-                sdesc_bin[3] = sdabs2_bin[0];\r
-        }\r
-        __syncthreads();\r
-\r
-        if (sdx_bin[threadIdx.x] >= 0)\r
-        {\r
-            sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x];\r
-            sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);\r
-            sd2_bin[threadIdx.x] = 0;\r
-            sdabs2_bin[threadIdx.x] = 0;\r
-        }\r
-        else\r
-        {\r
-            sd1_bin[threadIdx.x] = 0;\r
-            sdabs1_bin[threadIdx.x] = 0;\r
-            sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x];\r
-            sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);\r
-        }\r
-        __syncthreads();\r
-\r
-        reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);\r
-        __syncthreads();\r
-\r
-        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
-        if (threadIdx.x == 0)\r
-        {\r
-                sdesc_bin[4] = sd1_bin[0];\r
-                sdesc_bin[5] = sdabs1_bin[0];\r
-                sdesc_bin[6] = sd2_bin[0];\r
-                sdesc_bin[7] = sdabs2_bin[0];\r
-        }\r
-        __syncthreads();\r
-\r
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-        if (tid < 128)\r
-            descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];\r
-    }\r
-\r
-    void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)\r
-    {\r
-        // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D\r
-        \r
-        if (descriptors.cols == 64)\r
-        {\r
-            compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);\r
-            cudaSafeCall( cudaGetLastError() );\r
-\r
-            cudaSafeCall( cudaThreadSynchronize() );\r
-\r
-            normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);\r
-            cudaSafeCall( cudaGetLastError() );\r
-\r
-            cudaSafeCall( cudaThreadSynchronize() );\r
-        }\r
-        else\r
-        {\r
-            compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);\r
-            cudaSafeCall( cudaGetLastError() );\r
-\r
-            cudaSafeCall( cudaThreadSynchronize() );\r
-\r
-            normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);\r
-            cudaSafeCall( cudaGetLastError() );\r
-\r
-            cudaSafeCall( cudaThreadSynchronize() );\r
-        }\r
-    }\r
-\r
-    __device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid)\r
-    {\r
-        // get the interest point parameters (x, y, scale, strength, theta)\r
-        __shared__ float ipt[5];\r
-        if (tid < 5)\r
-        {\r
-            ipt[tid] = ((float*)&features[blockIdx.x])[tid];\r
-        }\r
-        __syncthreads();\r
-\r
-        // Compute sampling points\r
-        // since grids are 2D, need to compute xBlock and yBlock indices\r
-        const int xBlock = (blockIdx.y & 3);  // blockIdx.y % 4\r
-        const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
-        const int xIndex = xBlock * blockDim.x + threadIdx.x;\r
-        const int yIndex = yBlock * blockDim.y + threadIdx.y;\r
-\r
-        calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid);\r
-    }\r
-\r
-    // Spawn 16 blocks per interest point\r
-    // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location\r
-    __global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)\r
-    {\r
         // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
+        __shared__ float s_PATCH[6][6];\r
         __shared__ float sdx[25];\r
         __shared__ float sdy[25];\r
         __shared__ float sdxabs[25];\r
@@ -1127,7 +846,7 @@ namespace cv { namespace gpu { namespace surf
 \r
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
 \r
-        calc_dx_dy_old(sdx, sdy, features, tid);\r
+        calc_dx_dy(s_PATCH, sdx, sdy, features, tid);\r
         __syncthreads();\r
 \r
         sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
@@ -1151,9 +870,10 @@ namespace cv { namespace gpu { namespace surf
 \r
     // Spawn 16 blocks per interest point\r
     // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location\r
-    __global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)\r
+    __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)\r
     {\r
         // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
+        __shared__ float s_PATCH[6][6];\r
         __shared__ float sdx[25];\r
         __shared__ float sdy[25];\r
 \r
@@ -1165,7 +885,7 @@ namespace cv { namespace gpu { namespace surf
 \r
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
 \r
-        calc_dx_dy_old(sdx, sdy, features, tid);\r
+        calc_dx_dy(s_PATCH, sdx, sdy, features, tid);\r
         __syncthreads();\r
 \r
         if (sdy[tid] >= 0)\r
@@ -1184,7 +904,7 @@ namespace cv { namespace gpu { namespace surf
         }\r
         __syncthreads();\r
 \r
-        reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid);\r
+        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
         __syncthreads();\r
 \r
         float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
@@ -1215,7 +935,7 @@ namespace cv { namespace gpu { namespace surf
         }\r
         __syncthreads();\r
 \r
-        reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid);\r
+        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
         __syncthreads();\r
 \r
         // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
@@ -1228,13 +948,56 @@ namespace cv { namespace gpu { namespace surf
         }\r
     }\r
 \r
-    void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)\r
+    template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)\r
+    {\r
+        // no need for thread ID\r
+        float* descriptor_base = descriptors.ptr(blockIdx.x);\r
+\r
+        // read in the unnormalized descriptor values (squared)\r
+        __shared__ float sqDesc[BLOCK_DIM_X];\r
+        const float lookup = descriptor_base[threadIdx.x];\r
+        sqDesc[threadIdx.x] = lookup * lookup;\r
+        __syncthreads();\r
+\r
+        if (BLOCK_DIM_X >= 128)\r
+        {\r
+            if (threadIdx.x < 64)\r
+                sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];\r
+            __syncthreads();\r
+        }\r
+\r
+        // reduction to get total\r
+        if (threadIdx.x < 32)\r
+        {\r
+            volatile float* smem = sqDesc;\r
+\r
+            smem[threadIdx.x] += smem[threadIdx.x + 32];\r
+            smem[threadIdx.x] += smem[threadIdx.x + 16];\r
+            smem[threadIdx.x] += smem[threadIdx.x + 8];\r
+            smem[threadIdx.x] += smem[threadIdx.x + 4];\r
+            smem[threadIdx.x] += smem[threadIdx.x + 2];\r
+            smem[threadIdx.x] += smem[threadIdx.x + 1];\r
+        }\r
+\r
+        // compute length (square root)\r
+        __shared__ float len;\r
+        if (threadIdx.x == 0)\r
+        {\r
+            len = sqrtf(sqDesc[0]);\r
+        }\r
+        __syncthreads();\r
+\r
+        // normalize and store in output\r
+        descriptor_base[threadIdx.x] = lookup / len;\r
+    }\r
+\r
+    void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)\r
     {\r
         // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D\r
         \r
         if (descriptors.cols == 64)\r
         {\r
-            compute_descriptors64_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);\r
+            compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);\r
             cudaSafeCall( cudaGetLastError() );\r
 \r
             cudaSafeCall( cudaThreadSynchronize() );\r
@@ -1246,7 +1009,7 @@ namespace cv { namespace gpu { namespace surf
         }\r
         else\r
         {\r
-            compute_descriptors128_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);            \r
+            compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);            \r
             cudaSafeCall( cudaGetLastError() );\r
 \r
             cudaSafeCall( cudaThreadSynchronize() );\r
index c16de76..ffdaabb 100644 (file)
@@ -48,123 +48,93 @@ using namespace std;
 \r
 #if !defined (HAVE_CUDA)\r
 \r
+cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); }\r
+cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float) { throw_nogpu(); }\r
 int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}\r
 void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }\r
 void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }\r
 void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }\r
 void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }\r
 void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }\r
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool) { throw_nogpu(); }\r
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool) { throw_nogpu(); }\r
 \r
 #else /* !defined (HAVE_CUDA) */\r
 \r
 namespace cv { namespace gpu { namespace surf\r
 {\r
-    dim3 calcBlockSize(int nIntervals);\r
-    \r
-    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);\r
-    void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld);\r
-    \r
-    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
-        int x_size, int y_size, bool use_mask, const dim3& threads);\r
-    \r
-    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, \r
-        KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);\r
-    \r
-    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);\r
-    \r
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);\r
+\r
+    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
+        int img_rows, int img_cols, int octave, bool use_mask, int nLayers);\r
+\r
+    void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter);\r
+\r
+    void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter);\r
+\r
     void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);\r
-    void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);\r
 }}}\r
 \r
 using namespace cv::gpu::surf;\r
 \r
 namespace\r
 {\r
-    class SURF_GPU_Invoker : private SURFParams_GPU\r
+    class SURF_GPU_Invoker : private CvSURFParams\r
     {\r
     public:\r
-        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) : \r
-            SURFParams_GPU(surf),\r
+        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :\r
+            CvSURFParams(surf),\r
 \r
-            sum(surf.sum), sumf(surf.sumf),\r
+            sum(surf.sum), mask1(surf.mask1), maskSum(surf.maskSum), intBuffer(surf.intBuffer), det(surf.det), trace(surf.trace),\r
 \r
-            mask1(surf.mask1), maskSum(surf.maskSum),\r
-\r
-            hessianBuffer(surf.hessianBuffer), \r
-            maxPosBuffer(surf.maxPosBuffer), \r
-            featuresBuffer(surf.featuresBuffer),\r
+            maxPosBuffer(surf.maxPosBuffer), featuresBuffer(surf.featuresBuffer), keypointsBuffer(surf.keypointsBuffer),\r
 \r
             img_cols(img.cols), img_rows(img.rows),\r
 \r
-            use_mask(!mask.empty()),\r
-\r
-            mask_width(0), mask_height(0),\r
-\r
-            featureCounter(0), maxCounter(0)\r
+            use_mask(!mask.empty())\r
         {\r
             CV_Assert(!img.empty() && img.type() == CV_8UC1);\r
             CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));\r
-            CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);\r
-            CV_Assert(DeviceInfo().supports(GLOBAL_ATOMICS));\r
-\r
-            max_features = static_cast<int>(img.size().area() * featuresRatio);\r
-            max_candidates = static_cast<int>(1.5 * max_features);\r
-\r
-            CV_Assert(max_features > 0);\r
+            CV_Assert(nOctaves > 0 && nOctaveLayers > 0);\r
+            CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));\r
 \r
-            featuresBuffer.create(1, max_features, CV_32FC(6));\r
-            maxPosBuffer.create(1, max_candidates, CV_32SC4);\r
+            maxKeypoints = static_cast<int>(img.size().area() * surf.keypointsRatio);\r
+            maxFeatures = static_cast<int>(1.5 * maxKeypoints);\r
+            maxCandidates = static_cast<int>(1.5 * maxFeatures);\r
 \r
-            mask_width = l2 * 0.5f;\r
-            mask_height = 1.0f + l1;\r
+            CV_Assert(maxKeypoints > 0);\r
+            \r
+            cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 2) * sizeof(unsigned int)) );\r
+            cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 2) * sizeof(unsigned int)) );\r
 \r
-            // Dxy gap half-width\r
-            float dxy_center_offset = 0.5f * (l4 + l3);\r
-            // Dxy squares half-width\r
-            float dxy_half_width = 0.5f * l3;\r
+            uploadConstant("cv::gpu::surf::c_max_candidates",    maxCandidates);\r
+            uploadConstant("cv::gpu::surf::c_max_features",      maxFeatures);\r
+            uploadConstant("cv::gpu::surf::c_max_keypoints",     maxKeypoints);\r
+            uploadConstant("cv::gpu::surf::c_img_rows",          img_rows);\r
+            uploadConstant("cv::gpu::surf::c_img_cols",          img_cols);\r
+            uploadConstant("cv::gpu::surf::c_nOctaveLayers",     nOctaveLayers);\r
+            uploadConstant("cv::gpu::surf::c_hessianThreshold",  static_cast<float>(hessianThreshold));\r
 \r
-            // rescale edge_scale to fit with the filter dimensions\r
-            float dxy_scale = edgeScale * std::pow((2.f + 2.f * l1) * l2 / (4.f * l3 * l3), 2.f);\r
-            \r
-            // Compute border required such that the filters don't overstep the image boundaries               \r
-               float smax0 = 2.0f * initialScale + 0.5f;\r
-            int border0 = static_cast<int>(std::ceil(smax0 * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));\r
-\r
-            int width0 = (img_cols - 2 * border0) / initialStep;\r
-            int height0 = (img_rows - 2 * border0) / initialStep;\r
-\r
-            uploadConstant("cv::gpu::surf::c_max_candidates",    max_candidates);\r
-            uploadConstant("cv::gpu::surf::c_max_features",      max_features);\r
-            uploadConstant("cv::gpu::surf::c_nIntervals",        nIntervals);\r
-            uploadConstant("cv::gpu::surf::c_mask_width",        mask_width);\r
-            uploadConstant("cv::gpu::surf::c_mask_height",       mask_height);\r
-            uploadConstant("cv::gpu::surf::c_dxy_center_offset", dxy_center_offset);\r
-            uploadConstant("cv::gpu::surf::c_dxy_half_width",    dxy_half_width);\r
-            uploadConstant("cv::gpu::surf::c_dxy_scale",         dxy_scale);\r
-            uploadConstant("cv::gpu::surf::c_initialScale",      initialScale);\r
-            uploadConstant("cv::gpu::surf::c_threshold",         threshold);\r
-            \r
-            hessianBuffer.create(height0 * nIntervals, width0, CV_32F);\r
+            bindTexture("cv::gpu::surf::imgTex", (DevMem2D)img);\r
 \r
-            integral(img, sum);\r
-            sum.convertTo(sumf, CV_32F, 1.0 / 255.0);\r
-            \r
-            bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sumf);\r
+            integralBuffered(img, sum, intBuffer);\r
+            bindTexture("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);\r
 \r
-            if (!mask.empty())\r
-                   {\r
+            if (use_mask)\r
+            {\r
                 min(mask, 1.0, mask1);\r
-                integral(mask1, maskSum);\r
-            \r
-                bindTexture("cv::gpu::surf::maskSumTex", (DevMem2Di)maskSum);\r
-                   }\r
+                integralBuffered(mask1, maskSum, intBuffer);\r
+\r
+                bindTexture("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);\r
+            }\r
         }\r
 \r
         ~SURF_GPU_Invoker()\r
         {\r
+            cudaSafeCall( cudaFree(d_counters) );\r
+\r
+            unbindTexture("cv::gpu::surf::imgTex");\r
             unbindTexture("cv::gpu::surf::sumTex");\r
             if (use_mask)\r
                 unbindTexture("cv::gpu::surf::maskSumTex");\r
@@ -172,102 +142,115 @@ namespace
 \r
         void detectKeypoints(GpuMat& keypoints)\r
         {\r
-            typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);\r
-            const fasthessian_t fasthessian = \r
-                DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;\r
+            ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, det);\r
+            ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, trace);\r
+            \r
+            ensureSizeIsEnough(1, maxCandidates, CV_32SC4, maxPosBuffer);\r
+            ensureSizeIsEnough(1, maxFeatures, CV_32FC(6), featuresBuffer);\r
 \r
-            dim3 threads = calcBlockSize(nIntervals);\r
-            for(int octave = 0; octave < nOctaves; ++octave)\r
+            for (int octave = 0; octave < nOctaves; ++octave)\r
             {\r
-                int step = initialStep * (1 << octave);\r
-\r
-                // Compute border required such that the filters don't overstep the image boundaries\r
-                float d = (initialScale * (1 << octave)) / (nIntervals - 2);\r
-                   float smax = initialScale * (1 << octave) + d * (nIntervals - 2.0f) + 0.5f;\r
-                int border = static_cast<int>(std::ceil(smax * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));\r
-                \r
-                int x_size = (img_cols - 2 * border) / step;\r
-                int y_size = (img_rows - 2 * border) / step;\r
-                \r
-                if (x_size <= 0 || y_size <= 0)\r
-                    break;\r
-\r
-                uploadConstant("cv::gpu::surf::c_octave", octave);\r
-                uploadConstant("cv::gpu::surf::c_x_size", x_size);\r
-                uploadConstant("cv::gpu::surf::c_y_size", y_size);\r
-                uploadConstant("cv::gpu::surf::c_border", border);\r
-                uploadConstant("cv::gpu::surf::c_step",   step);\r
-\r
-                fasthessian(hessianBuffer, x_size, y_size, threads);\r
-\r
-                // Reset the candidate count.\r
-                maxCounter = 0;\r
-\r
-                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads); \r
-                \r
-                maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));\r
+                const int layer_rows = img_rows >> octave;\r
+                const int layer_cols = img_cols >> octave;\r
+\r
+                uploadConstant("cv::gpu::surf::c_octave",     octave);\r
+                uploadConstant("cv::gpu::surf::c_layer_rows", layer_rows);\r
+                uploadConstant("cv::gpu::surf::c_layer_cols", layer_cols);\r
+\r
+                icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);\r
+\r
+                icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), d_counters + 2 + octave,\r
+                    img_rows, img_cols, octave, use_mask, nOctaveLayers);\r
+\r
+                unsigned int maxCounter;\r
+                cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 2 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
+                maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));\r
 \r
                 if (maxCounter > 0)\r
                 {\r
-                    fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,\r
-                        featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);\r
-\r
-                    featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));\r
+                    icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr<int4>(), maxCounter, \r
+                        featuresBuffer.ptr<KeyPoint_GPU>(), d_counters);\r
                 }\r
             }\r
+            unsigned int featureCounter;\r
+            cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
+            featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));\r
 \r
-            if (featureCounter > 0)\r
-                featuresBuffer.colRange(0, featureCounter).copyTo(keypoints);\r
-            else\r
-                keypoints.release();\r
+            findOrientation(featuresBuffer.colRange(0, featureCounter), keypoints);\r
         }\r
 \r
-        void findOrientation(GpuMat& keypoints)\r
+        void findOrientation(const GpuMat& features, GpuMat& keypoints)\r
         {\r
-            if (keypoints.cols > 0)\r
-                find_orientation_gpu(keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);\r
+            if (features.cols > 0)\r
+            {\r
+                ensureSizeIsEnough(1, maxKeypoints, CV_32FC(6), keypointsBuffer);\r
+\r
+                icvCalcOrientation_gpu(features.ptr<KeyPoint_GPU>(), features.cols, keypointsBuffer.ptr<KeyPoint_GPU>(), \r
+                    d_counters + 1);\r
+\r
+                unsigned int keypointsCounter;\r
+                cudaSafeCall( cudaMemcpy(&keypointsCounter, d_counters + 1, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
+                keypointsCounter = std::min(keypointsCounter, static_cast<unsigned int>(maxKeypoints));\r
+\r
+                if (keypointsCounter > 0)\r
+                    keypointsBuffer.colRange(0, keypointsCounter).copyTo(keypoints);\r
+                else\r
+                    keypoints.release();\r
+            }\r
         }\r
 \r
         void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)\r
         {\r
-            typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors, \r
-                const KeyPoint_GPU* features, int nFeatures);\r
-\r
-            const compute_descriptors_t compute_descriptors = compute_descriptors_gpu_old;\r
-                //DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;\r
-\r
             if (keypoints.cols > 0)\r
             {\r
                 descriptors.create(keypoints.cols, descriptorSize, CV_32F);\r
-                compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);\r
+                compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);\r
             }\r
         }\r
 \r
     private:\r
         GpuMat& sum;\r
-        GpuMat& sumf;\r
-\r
         GpuMat& mask1;\r
         GpuMat& maskSum;\r
+        GpuMat& intBuffer;\r
+\r
+        GpuMat& det;\r
+        GpuMat& trace;\r
 \r
-        GpuMat& hessianBuffer;\r
         GpuMat& maxPosBuffer;\r
         GpuMat& featuresBuffer;\r
+        GpuMat& keypointsBuffer;\r
 \r
         int img_cols, img_rows;\r
 \r
         bool use_mask;\r
-        \r
-        float mask_width, mask_height;\r
 \r
-        unsigned int featureCounter;\r
-        unsigned int maxCounter;\r
+        int maxCandidates;\r
+        int maxFeatures;\r
+        int maxKeypoints;\r
 \r
-        int max_candidates;\r
-        int max_features;\r
+        unsigned int* d_counters;\r
     };\r
 }\r
 \r
+cv::gpu::SURF_GPU::SURF_GPU()\r
+{\r
+    hessianThreshold = 100;\r
+    extended = 1;\r
+    nOctaves = 4;\r
+    nOctaveLayers = 2;\r
+    keypointsRatio = 0.01f;\r
+}\r
+\r
+cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio)\r
+{\r
+    hessianThreshold = _threshold;\r
+    extended = _extended;\r
+    nOctaves = _nOctaves;\r
+    nOctaveLayers = _nOctaveLayers;\r
+    keypointsRatio = _keypointsRatio;\r
+}\r
+\r
 int cv::gpu::SURF_GPU::descriptorSize() const\r
 {\r
     return extended ? 128 : 64;\r
@@ -281,27 +264,64 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa
     {\r
         Mat keypointsCPU(1, keypoints.size(), CV_32FC(6));\r
 \r
-        const KeyPoint* keypoints_ptr = &keypoints[0];\r
-        KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();\r
-        for (size_t i = 0; i < keypoints.size(); ++i, ++keypoints_ptr, ++keypointsCPU_ptr)\r
+        for (size_t i = 0; i < keypoints.size(); ++i)\r
         {\r
-            const KeyPoint& kp = *keypoints_ptr;\r
-            KeyPoint_GPU& gkp = *keypointsCPU_ptr;\r
+            const KeyPoint& kp = keypoints[i];\r
+            KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];\r
 \r
             gkp.x = kp.pt.x;\r
             gkp.y = kp.pt.y;\r
 \r
+            gkp.laplacian = 1.0f;\r
+\r
             gkp.size = kp.size;\r
 \r
-            gkp.octave = static_cast<float>(kp.octave);\r
-            gkp.angle = kp.angle;\r
-            gkp.response = kp.response;\r
+            gkp.dir = kp.angle;\r
+            gkp.hessian = kp.response;\r
         }\r
 \r
         keypointsGPU.upload(keypointsCPU);\r
     }\r
 }\r
 \r
+namespace\r
+{\r
+    int calcSize(int octave, int layer)\r
+    {\r
+        /* Wavelet size at first layer of first octave. */\r
+        const int HAAR_SIZE0 = 9;\r
+\r
+        /* Wavelet size increment between layers. This should be an even number,\r
+         such that the wavelet sizes in an octave are either all even or all odd.\r
+         This ensures that when looking for the neighbours of a sample, the layers\r
+         above and below are aligned correctly. */\r
+        const int HAAR_SIZE_INC = 6;\r
+\r
+        return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;\r
+    }\r
+\r
+    int getPointOctave(const KeyPoint_GPU& kpt, const CvSURFParams& params)\r
+    {\r
+        int best_octave = 0;\r
+        float min_diff = numeric_limits<float>::max();\r
+        for (int octave = 1; octave < params.nOctaves; ++octave)\r
+        {\r
+            for (int layer = 0; layer < params.nOctaveLayers; ++layer)\r
+            {\r
+                float diff = std::abs(kpt.size - (float)calcSize(octave, layer));\r
+                if (min_diff > diff)\r
+                {\r
+                    min_diff = diff;\r
+                    best_octave = octave;\r
+                    if (min_diff == 0)\r
+                        return best_octave;\r
+                }\r
+            }\r
+        }\r
+        return best_octave;\r
+    }\r
+}\r
+\r
 void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints)\r
 {\r
     if (keypointsGPU.empty())\r
@@ -313,21 +333,23 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
         Mat keypointsCPU = keypointsGPU;\r
         keypoints.resize(keypointsGPU.cols);\r
 \r
-        KeyPoint* keypoints_ptr = &keypoints[0];\r
-        const KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();\r
-        for (int i = 0; i < keypointsGPU.cols; ++i, ++keypoints_ptr, ++keypointsCPU_ptr)\r
+        for (int i = 0; i < keypointsGPU.cols; ++i)\r
         {\r
-            KeyPoint& kp = *keypoints_ptr;\r
-            const KeyPoint_GPU& gkp = *keypointsCPU_ptr;\r
+            KeyPoint& kp = keypoints[i];\r
+            const KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];\r
 \r
             kp.pt.x = gkp.x;\r
             kp.pt.y = gkp.y;\r
 \r
             kp.size = gkp.size;\r
 \r
-            kp.octave = static_cast<int>(gkp.octave);\r
-            kp.angle = gkp.angle;\r
-            kp.response = gkp.response;\r
+            kp.angle = gkp.dir;\r
+\r
+            kp.response = gkp.hessian;\r
+\r
+            kp.octave = getPointOctave(gkp, *this);\r
+\r
+            kp.class_id = static_cast<int>(gkp.laplacian);\r
         }\r
     }\r
 }\r
@@ -353,23 +375,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat
         SURF_GPU_Invoker surf(*this, img, mask);\r
 \r
         surf.detectKeypoints(keypoints);\r
-\r
-        surf.findOrientation(keypoints);\r
     }\r
 }\r
 \r
 void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, \r
-                                   bool useProvidedKeypoints, bool calcOrientation)\r
+                                   bool useProvidedKeypoints)\r
 {\r
     if (!img.empty())\r
     {\r
         SURF_GPU_Invoker surf(*this, img, mask);\r
-        \r
+    \r
         if (!useProvidedKeypoints)\r
             surf.detectKeypoints(keypoints);\r
-        \r
-        if (calcOrientation)\r
-            surf.findOrientation(keypoints);\r
+        else\r
+        {\r
+            GpuMat keypointsBuf;\r
+            surf.findOrientation(keypoints, keypointsBuf);\r
+            keypointsBuf.copyTo(keypoints);\r
+        }\r
 \r
         surf.computeDescriptors(keypoints, descriptors, descriptorSize());\r
     }\r
@@ -385,24 +408,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector
 }\r
 \r
 void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, \r
-    GpuMat& descriptors, bool useProvidedKeypoints, bool calcOrientation)\r
+    GpuMat& descriptors, bool useProvidedKeypoints)\r
 {\r
     GpuMat keypointsGPU;\r
 \r
     if (useProvidedKeypoints)\r
         uploadKeypoints(keypoints, keypointsGPU);    \r
 \r
-    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);\r
+    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);\r
 \r
     downloadKeypoints(keypointsGPU, keypoints);\r
 }\r
 \r
 void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, \r
-    vector<float>& descriptors, bool useProvidedKeypoints, bool calcOrientation)\r
+    vector<float>& descriptors, bool useProvidedKeypoints)\r
 {\r
     GpuMat descriptorsGPU;\r
 \r
-    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);\r
+    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);\r
 \r
     downloadDescriptors(descriptorsGPU, descriptors);\r
 }\r
index c0f254e..12f7196 100644 (file)
@@ -48,7 +48,6 @@ using namespace std;
 \r
 const string FEATURES2D_DIR = "features2d";\r
 const string IMAGE_FILENAME = "aloe.png";\r
-const string VALID_FILE_NAME = "surf.xml.gz";\r
 \r
 class CV_GPU_SURFTest : public cvtest::BaseTest\r
 {\r
@@ -59,17 +58,20 @@ public:
 \r
 protected:\r
     bool isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2);\r
+    int getValidCount(const vector<KeyPoint>& keypoints1, const vector<KeyPoint>& keypoints2, const vector<DMatch>& matches);\r
     void compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints,\r
                              const Mat& validDescriptors, const Mat& calcDescriptors);\r
 \r
-    void emptyDataTest(SURF_GPU& fdetector);\r
-    void regressionTest(SURF_GPU& fdetector);\r
+    void emptyDataTest();\r
+    void accuracyTest();\r
 \r
     virtual void run(int);\r
 };\r
 \r
-void CV_GPU_SURFTest::emptyDataTest(SURF_GPU& fdetector)\r
+void CV_GPU_SURFTest::emptyDataTest()\r
 {\r
+    SURF_GPU fdetector;\r
+\r
     GpuMat image;\r
     vector<KeyPoint> keypoints;\r
     vector<float> descriptors;\r
@@ -114,116 +116,80 @@ bool CV_GPU_SURFTest::isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2)
             p1.class_id == p2.class_id );\r
 }\r
 \r
-void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints, \r
-                                          const Mat& validDescriptors, const Mat& calcDescriptors)\r
+int CV_GPU_SURFTest::getValidCount(const vector<KeyPoint>& keypoints1, const vector<KeyPoint>& keypoints2,\r
+                     const vector<DMatch>& matches)\r
 {\r
-    if (validKeypoints.size() != calcKeypoints.size())\r
+    int count = 0;\r
+\r
+    for (size_t i = 0; i < matches.size(); ++i)\r
     {\r
-        ts->printf(cvtest::TS::LOG, "Keypoints sizes doesn't equal (validCount = %d, calcCount = %d).\n",\r
-                   validKeypoints.size(), calcKeypoints.size());\r
-        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);\r
-        return;\r
+        const DMatch& m = matches[i];\r
+\r
+        const KeyPoint& kp1 = keypoints1[m.queryIdx];\r
+        const KeyPoint& kp2 = keypoints2[m.trainIdx];\r
+\r
+        if (isSimilarKeypoints(kp1, kp2))\r
+            ++count;\r
     }\r
-    if (validDescriptors.size() != calcDescriptors.size())\r
+\r
+    return count;\r
+}\r
+\r
+void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints, \r
+                                          const Mat& validDescriptors, const Mat& calcDescriptors)\r
+{\r
+    BruteForceMatcher< L2<float> > matcher;\r
+    vector<DMatch> matches;\r
+\r
+    matcher.match(validDescriptors, calcDescriptors, matches);\r
+\r
+    int validCount = getValidCount(validKeypoints, calcKeypoints, matches);\r
+    float validRatio = (float)validCount / matches.size();\r
+\r
+    if (validRatio < 0.5f)\r
     {\r
-        ts->printf(cvtest::TS::LOG, "Descriptors sizes doesn't equal.\n");\r
-        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);\r
+        ts->printf(cvtest::TS::LOG, "Bad accuracy - %f.\n", validRatio);\r
+        ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );\r
         return;\r
     }\r
-    for (size_t v = 0; v < validKeypoints.size(); v++)\r
-    {\r
-        int nearestIdx = -1;\r
-        float minDist = std::numeric_limits<float>::max();\r
-\r
-        for (size_t c = 0; c < calcKeypoints.size(); c++)\r
-        {\r
-            float curDist = (float)norm(calcKeypoints[c].pt - validKeypoints[v].pt);\r
-            if (curDist < minDist)\r
-            {\r
-                minDist = curDist;\r
-                nearestIdx = c;\r
-            }\r
-        }\r
-\r
-        assert(minDist >= 0);\r
-        if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))\r
-        {\r
-            ts->printf(cvtest::TS::LOG, "Bad keypoints accuracy.\n");\r
-            ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );\r
-            return;\r
-        }\r
-\r
-        if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)\r
-        {\r
-            ts->printf(cvtest::TS::LOG, "Bad descriptors accuracy.\n");\r
-            ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );\r
-            return;\r
-        }\r
-    }\r
 }\r
 \r
-void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector)\r
+void CV_GPU_SURFTest::accuracyTest()\r
 {\r
     string imgFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + IMAGE_FILENAME;\r
-    string resFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + VALID_FILE_NAME;\r
 \r
     // Read the test image.\r
-    GpuMat image(imread(imgFilename, 0));\r
+    Mat image = imread(imgFilename, 0);\r
     if (image.empty())\r
     {\r
         ts->printf( cvtest::TS::LOG, "Image %s can not be read.\n", imgFilename.c_str() );\r
         ts->set_failed_test_info( cvtest::TS::FAIL_INVALID_TEST_DATA );\r
         return;\r
     }\r
-\r
-    FileStorage fs(resFilename, FileStorage::READ);\r
+    \r
+    Mat mask(image.size(), CV_8UC1, Scalar::all(1));\r
+    mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0));\r
 \r
     // Compute keypoints.\r
-    GpuMat mask(image.size(), CV_8UC1, Scalar::all(1));\r
-    mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0));\r
     vector<KeyPoint> calcKeypoints;\r
-    GpuMat calcDespcriptors;\r
-    fdetector(image, mask, calcKeypoints, calcDespcriptors);\r
-\r
-    if (fs.isOpened()) // Compare computed and valid keypoints.\r
-    {\r
-        // Read validation keypoints set.\r
-        vector<KeyPoint> validKeypoints;\r
-        Mat validDespcriptors;\r
-        read(fs["keypoints"], validKeypoints);\r
-        read(fs["descriptors"], validDespcriptors);\r
-        if (validKeypoints.empty() || validDespcriptors.empty())\r
-        {\r
-            ts->printf(cvtest::TS::LOG, "Validation file can not be read.\n");\r
-            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);\r
-            return;\r
-        }\r
-\r
-        compareKeypointSets(validKeypoints, calcKeypoints, validDespcriptors, calcDespcriptors);\r
-    }\r
-    else // Write detector parameters and computed keypoints as validation data.\r
-    {\r
-        fs.open(resFilename, FileStorage::WRITE);\r
-        if (!fs.isOpened())\r
-        {\r
-            ts->printf(cvtest::TS::LOG, "File %s can not be opened to write.\n", resFilename.c_str());\r
-            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);\r
-            return;\r
-        }\r
-        else\r
-        {\r
-            write(fs, "keypoints", calcKeypoints);\r
-            write(fs, "descriptors", (Mat)calcDespcriptors);\r
-        }\r
-    }\r
+    GpuMat calcDescriptors;\r
+    SURF_GPU fdetector; fdetector.extended = false;\r
+    fdetector(GpuMat(image), GpuMat(mask), calcKeypoints, calcDescriptors);\r
+\r
+    // Calc validation keypoints set.\r
+    vector<KeyPoint> validKeypoints;\r
+    vector<float> validDescriptors;\r
+    SURF fdetector_gold; fdetector_gold.extended = false;\r
+    fdetector_gold(image, mask, validKeypoints, validDescriptors);\r
+\r
+    compareKeypointSets(validKeypoints, calcKeypoints, \r
+        Mat(validKeypoints.size(), fdetector_gold.descriptorSize(), CV_32F, &validDescriptors[0]), calcDescriptors);\r
 }\r
 \r
 void CV_GPU_SURFTest::run( int /*start_from*/ )\r
 {\r
-    SURF_GPU fdetector;\r
-\r
-    emptyDataTest(fdetector);\r
-    regressionTest(fdetector);\r
+    emptyDataTest();\r
+    accuracyTest();\r
 }\r
 \r
-TEST(SURF, empty_data_and_regression) { CV_GPU_SURFTest test; test.safe_run(); }\r
+TEST(SURF, empty_data_and_accuracy) { CV_GPU_SURFTest test; test.safe_run(); }\r
index 34af5ae..18b45f2 100644 (file)
@@ -264,10 +264,11 @@ TEST(SURF)
 \r
     SURF surf;\r
     vector<KeyPoint> keypoints1, keypoints2;\r
+    vector<float> descriptors1, descriptors2;\r
 \r
     CPU_ON;\r
-    surf(src1, Mat(), keypoints1);\r
-    surf(src2, Mat(), keypoints2);\r
+    surf(src1, Mat(), keypoints1, descriptors1);\r
+    surf(src2, Mat(), keypoints2, descriptors2);\r
     CPU_OFF;\r
 \r
     gpu::SURF_GPU d_surf;\r
@@ -275,8 +276,8 @@ TEST(SURF)
     gpu::GpuMat d_descriptors1, d_descriptors2;\r
 \r
     GPU_ON;\r
-    d_surf(d_src1, gpu::GpuMat(), d_keypoints1);\r
-    d_surf(d_src2, gpu::GpuMat(), d_keypoints2);\r
+    d_surf(d_src1, gpu::GpuMat(), d_keypoints1, d_descriptors1);\r
+    d_surf(d_src2, gpu::GpuMat(), d_keypoints2, d_descriptors2);\r
     GPU_OFF;\r
 }\r
 \r
index b2c9385..eb8da5f 100644 (file)
@@ -51,10 +51,10 @@ int main(int argc, char* argv[])
     vector<KeyPoint> keypoints1, keypoints2;
     vector<float> descriptors1, descriptors2;
     vector<DMatch> matches;
-    SURF_GPU::downloadKeypoints(keypoints1GPU, keypoints1);
-    SURF_GPU::downloadKeypoints(keypoints2GPU, keypoints2);
-    SURF_GPU::downloadDescriptors(descriptors1GPU, descriptors1);
-    SURF_GPU::downloadDescriptors(descriptors2GPU, descriptors2);
+    surf.downloadKeypoints(keypoints1GPU, keypoints1);
+    surf.downloadKeypoints(keypoints2GPU, keypoints2);
+    surf.downloadDescriptors(descriptors1GPU, descriptors1);
+    surf.downloadDescriptors(descriptors2GPU, descriptors2);
     BruteForceMatcher_GPU< L2<float> >::matchDownload(trainIdx, distance, matches);
 
     // drawing the results