From: Vladislav Vinogradov <no@email>
Date: Thu, 10 Mar 2011 13:53:58 +0000 (+0000)
Subject: made GPU version of SURF more consistent with CPU one
X-Git-Tag: accepted/2.0/20130307.220821~3427
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=58f691979537d4beeee6a6980393c2b822d24f76;p=profile%2Fivi%2Fopencv.git

made GPU version of SURF more consistent with CPU one
---

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 33fa3b4..854b89d 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1537,83 +1537,55 @@ namespace cv
         };
         
         ////////////////////////////////// SURF //////////////////////////////////////////
-        
-        struct CV_EXPORTS SURFParams_GPU 
-        {
-            SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f), 
-                l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f),
-                edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {}
-
-            //! The interest operator threshold
-            float threshold;
-            //! The number of octaves to process
-            int nOctaves;
-            //! The number of intervals in each octave
-            int nIntervals;
-            //! The scale associated with the first interval of the first octave
-            float initialScale;
-
-            //! mask parameter l_1
-            float l1;
-            //! mask parameter l_2 
-            float l2;
-            //! mask parameter l_3
-            float l3;
-            //! mask parameter l_4
-            float l4;
-            //! The amount to scale the edge rejection mask
-            float edgeScale;
-            //! The initial sampling step in pixels.
-            int initialStep;
-
-            //! True, if generate 128-len descriptors, false - 64-len descriptors
-            bool extended;
-
-            //! max features = featuresRatio * img.size().srea()
-            float featuresRatio;
-        };
 
-        class CV_EXPORTS SURF_GPU : public SURFParams_GPU
+        class CV_EXPORTS SURF_GPU : public CvSURFParams
         {
         public:
+            //! the default constructor
+            SURF_GPU();
+            //! the full constructor taking all the necessary parameters
+            explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,
+                 int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f);
+
             //! returns the descriptor size in float's (64 or 128)
             int descriptorSize() const;
 
             //! upload host keypoints to device memory
-            static void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
+            void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
             //! download keypoints from device to host memory
-            static void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
+            void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
 
             //! download descriptors from device to host memory
-            static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
+            void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
             
             //! finds the keypoints using fast hessian detector used in SURF
             //! supports CV_8UC1 images
             //! keypoints will have 1 row and type CV_32FC(6)
             //! keypoints.at<float[6]>(1, i) contains i'th keypoint
-            //! format: (x, y, size, response, angle, octave)
+            //! format: (x, y, laplacian, size, dir, hessian)
             void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
             //! finds the keypoints and computes their descriptors. 
             //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
             void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, 
-                bool useProvidedKeypoints = false, bool calcOrientation = true);
+                bool useProvidedKeypoints = false);
         
             void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
             void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, 
-                bool useProvidedKeypoints = false, bool calcOrientation = true);
+                bool useProvidedKeypoints = false);
             
             void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, 
-                bool useProvidedKeypoints = false, bool calcOrientation = true);
+                bool useProvidedKeypoints = false);
+
+            //! max keypoints = keypointsRatio * img.size().area()
+            float keypointsRatio;
 
-            GpuMat sum;
-            GpuMat sumf;
+            GpuMat sum, mask1, maskSum, intBuffer;
 
-            GpuMat mask1;
-            GpuMat maskSum;
+            GpuMat det, trace;
 
-            GpuMat hessianBuffer;
             GpuMat maxPosBuffer;
             GpuMat featuresBuffer;
+            GpuMat keypointsBuffer;
         };
 
     }
diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp
index e82700e..d2428c6 100644
--- a/modules/gpu/src/cuda/internal_shared.hpp
+++ b/modules/gpu/src/cuda/internal_shared.hpp
@@ -111,20 +111,20 @@ namespace cv
         {
             float x;
             float y;
+            float laplacian;
             float size;
-            float response;
-            float angle;
-            float octave;
+            float dir;
+            float hessian;
         };
 
         enum KeypointLayout 
         {
             SF_X,
             SF_Y,
+            SF_LAPLACIAN,
             SF_SIZE,
-            SF_RESPONSE,
-            SF_ANGLE,
-            SF_OCTAVE,
+            SF_DIR,
+            SF_HESSIAN,
             SF_FEATURE_STRIDE
         };
     }
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index d8abf1b..bd2a1d9 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -56,267 +56,132 @@ using namespace cv::gpu::device;
 namespace cv { namespace gpu { namespace surf
 {
     ////////////////////////////////////////////////////////////////////////
-    // Help funcs
-
-    // Wrapper for host reference to pass into kernel
-    template <typename T> 
-    class DeviceReference
-    {
-    public:
-        explicit DeviceReference(T& host_val) : d_ptr(0), h_ptr(&host_val)
-        {
-            cudaSafeCall( cudaMalloc((void**)&d_ptr, sizeof(T)) );
-            cudaSafeCall( cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice) );
-        }
-
-        ~DeviceReference()
-        {            
-            cudaSafeCall( cudaMemcpy(h_ptr, d_ptr, sizeof(T), cudaMemcpyDeviceToHost) );
-            cudaSafeCall( cudaFree(d_ptr) );
-        }
-
-        // Casting to device pointer
-        operator T*() {return d_ptr;}
-        operator const T*() const {return d_ptr;}
-    private:
-        T* d_ptr;
-        T* h_ptr;
-    };
-
-    __device__ void clearLastBit(int* f)
-    {
-        *f &= ~0x1;
-    }
-    __device__ void clearLastBit(float& f)
-    {
-        clearLastBit((int*)&f);
-    }
-
-    __device__ void setLastBit(int* f)
-    {
-        *f |= 0x1;
-    }
-    __device__ void setLastBit(float& f)
-    {
-        setLastBit((int*)&f);
-    }
-
-    ////////////////////////////////////////////////////////////////////////
     // Global parameters
 
     // The maximum number of features (before subpixel interpolation) that memory is reserved for.
     __constant__ int c_max_candidates;
     // The maximum number of features that memory is reserved for.
     __constant__ int c_max_features;
-    // The number of intervals in the octave.
-    __constant__ int c_nIntervals;
-    // Mask sizes derived from the mask parameters
-    __constant__ float c_mask_width;
-    // Mask sizes derived from the mask parameters
-    __constant__ float c_mask_height;
-    // Mask sizes derived from the mask parameters
-    __constant__ float c_dxy_center_offset;
-    // Mask sizes derived from the mask parameters
-    __constant__ float c_dxy_half_width;
-    // Mask sizes derived from the mask parameters
-    __constant__ float c_dxy_scale;
-    // The scale associated with the first interval of the first octave
-    __constant__ float c_initialScale;
-    // The interest operator threshold
-    __constant__ float c_threshold;
-
-    // Ther octave
+    // The maximum number of keypoints that memory is reserved for.
+    __constant__ int c_max_keypoints;
+    // The image size.
+    __constant__ int c_img_rows;
+    __constant__ int c_img_cols;
+    // The number of layers.
+    __constant__ int c_nOctaveLayers;
+    // The hessian threshold.
+    __constant__ float c_hessianThreshold;
+
+    // The current octave.
     __constant__ int c_octave;
-    // The width of the octave buffer.
-    __constant__ int c_x_size;
-    // The height of the octave buffer.
-    __constant__ int c_y_size;
-    // The size of the octave border in pixels.
-    __constant__ int c_border;
-    // The step size used in this octave in pixels.
-    __constant__ int c_step;
+    // The current layer size.
+    __constant__ int c_layer_rows;
+    __constant__ int c_layer_cols;
 
     ////////////////////////////////////////////////////////////////////////
     // Integral image texture
 
-    texture<float, 2, cudaReadModeElementType> sumTex(0, cudaFilterModeLinear, cudaAddressModeClamp);
+    typedef texture<unsigned int, 2, cudaReadModeElementType> IntTex;
 
-    __device__ float iiAreaLookupCDHalfWH(float cx, float cy, float halfWidth, float halfHeight)
-    {
-        float result = 0.f;
-
-        result += tex2D(sumTex, cx - halfWidth, cy - halfHeight);
-        result -= tex2D(sumTex, cx + halfWidth, cy - halfHeight);
-        result -= tex2D(sumTex, cx - halfWidth, cy + halfHeight);
-        result += tex2D(sumTex, cx + halfWidth, cy + halfHeight);
-
-        return result;
-    }
-
-    ////////////////////////////////////////////////////////////////////////
-    // Hessian
+    IntTex sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
 
-    __device__ float evalDyy(float x, float y, float t, float mask_width, float mask_height, float fscale)
+    template <int N>
+    __device__ float icvCalcHaarPattern(const IntTex& tex, const float src[][5], int oldSize, int newSize, int y, int x)
     {
-        float Dyy = 0.f;
-
-        Dyy +=     iiAreaLookupCDHalfWH(x, y, mask_width, mask_height);
-        Dyy -= t * iiAreaLookupCDHalfWH(x, y, mask_width, fscale);
-
-        Dyy *=  1.0f / (fscale * fscale);
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
+        typedef double real_t;        
+        #else
+        typedef float  real_t;
+        #endif
 
-        return Dyy;
-    }
+        float ratio = (float)newSize / oldSize;
+        
+        real_t d = 0;
 
-    __device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale)
-    {
-        float Dxx = 0.f;
+        #pragma unroll
+        for (int k = 0; k < N; ++k)
+        {
+            int dx1 = __float2int_rn(ratio * src[k][0]);
+            int dy1 = __float2int_rn(ratio * src[k][1]);
+            int dx2 = __float2int_rn(ratio * src[k][2]);
+            int dy2 = __float2int_rn(ratio * src[k][3]);
 
-        Dxx +=     iiAreaLookupCDHalfWH(x, y, mask_height, mask_width);
-        Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale     , mask_width);
+            real_t t = 0;
+            t += tex2D(tex, x + dx1, y + dy1);
+            t -= tex2D(tex, x + dx1, y + dy2);
+            t -= tex2D(tex, x + dx2, y + dy1);
+            t += tex2D(tex, x + dx2, y + dy2);
 
-        Dxx *=  1.0f / (fscale * fscale);
+            d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
+        }
 
-        return Dxx;
+        return (float)d;
     }
 
-    __device__ float evalDxy(float x, float y, float fscale)
-    {
-        float center_offset =  c_dxy_center_offset  * fscale;
-        float half_width    =  c_dxy_half_width     * fscale;
-
-        float Dxy = 0.f;
-
-        Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width);
-        Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width);
-        Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width);
-        Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width);
-
-        Dxy *= 1.0f / (fscale * fscale);
+    ////////////////////////////////////////////////////////////////////////
+    // Hessian
 
-        return Dxy;
-    }
+    __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
+    __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
+    __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
 
-    __device__ float calcScale(int hidx_z)
+    __host__ __device__ int calcSize(int octave, int layer)
     {
-        float d = (c_initialScale * (1 << c_octave)) / (c_nIntervals - 2);
-        return c_initialScale * (1 << c_octave) + d * (hidx_z - 1.0f) + 0.5f;
-    }
-    
-    __global__ void fasthessian(PtrStepf hessianBuffer)
-    {
-  	    // Determine the indices in the Hessian buffer
-        int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
-        int hidx_y = threadIdx.y + blockIdx.y * blockDim.y;
-        int hidx_z = threadIdx.z;
-
-        float fscale = calcScale(hidx_z);
+        /* Wavelet size at first layer of first octave. */
+        const int HAAR_SIZE0 = 9;
 
-        // Compute the lookup location of the mask center
-        float x = hidx_x * c_step + c_border;
-        float y = hidx_y * c_step + c_border;
+        /* Wavelet size increment between layers. This should be an even number,
+         such that the wavelet sizes in an octave are either all even or all odd.
+         This ensures that when looking for the neighbours of a sample, the layers
+         above and below are aligned correctly. */
+        const int HAAR_SIZE_INC = 6;
 
-        // Scale the mask dimensions according to the scale
-        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
-        {
-            float mask_width =  c_mask_width  * fscale;
-            float mask_height = c_mask_height * fscale;
-
-            // Compute the filter responses
-            float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
-            float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
-            float Dxy = evalDxy(x, y, fscale);
-
-            // Combine the responses and store the Laplacian sign
-            float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
-
-            if (Dxx + Dyy > 0.f)
-                setLastBit(result);
-            else
-                clearLastBit(result);
-
-            hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
-        }
+        return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
     }
 
-    __global__ void fasthessian_old(PtrStepf hessianBuffer)
+    __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
     {
-  	    // Determine the indices in the Hessian buffer
-        int gridDim_y = gridDim.y / c_nIntervals;
-        int blockIdx_y = blockIdx.y % gridDim_y;
-        int blockIdx_z = blockIdx.y / gridDim_y;
-
-        int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
-        int hidx_y = threadIdx.y + blockIdx_y * blockDim.y;
-        int hidx_z = blockIdx_z;
+        // Determine the indices
+        const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
+        const int blockIdx_y = blockIdx.y % gridDim_y;
+        const int blockIdx_z = blockIdx.y / gridDim_y;
 
-        float fscale = calcScale(hidx_z);
+        const int j = threadIdx.x + blockIdx.x * blockDim.x;
+        const int i = threadIdx.y + blockIdx_y * blockDim.y;
+        const int layer = blockIdx_z;
 
-        // Compute the lookup location of the mask center
-        float x = hidx_x * c_step + c_border;
-        float y = hidx_y * c_step + c_border;
+        const int size = calcSize(c_octave, layer);
 
-        // Scale the mask dimensions according to the scale
-        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
-        {
-            float mask_width =  c_mask_width  * fscale;
-            float mask_height = c_mask_height * fscale;
-
-            // Compute the filter responses
-            float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
-            float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
-            float Dxy = evalDxy(x, y, fscale);
+        const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
+        const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
 
-            // Combine the responses and store the Laplacian sign
-            float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
+        /* Ignore pixels where some of the kernel is outside the image */
+        const int margin = (size >> 1) >> c_octave;
 
-            if (Dxx + Dyy > 0.f)
-                setLastBit(result);
-            else
-                clearLastBit(result);
+        if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
+        {
+            const float dx  = icvCalcHaarPattern<3>(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
+            const float dy  = icvCalcHaarPattern<3>(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
+            const float dxy = icvCalcHaarPattern<4>(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
 
-            hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
+            det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
+            trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
         }
     }
 
-    dim3 calcBlockSize(int nIntervals)
-    {
-        int threadsPerBlock = 512;
-        
-        dim3 threads;        
-        threads.z = nIntervals;
-        threadsPerBlock /= nIntervals;
-        if (threadsPerBlock >= 48)
-            threads.x = 16;
-        else
-            threads.x = 8;
-        threadsPerBlock /= threads.x;
-        threads.y = threadsPerBlock;
-        
-        return threads;
-    }
-
-    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)
     {
-        dim3 grid;
-        grid.x = divUp(x_size, threads.x);
-        grid.y = divUp(y_size, threads.y);
-        
-        fasthessian<<<grid, threads>>>(hessianBuffer);
-        cudaSafeCall( cudaGetLastError() );
+        const int min_size = calcSize(octave, 0);
+        const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
+        const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
 
-        cudaSafeCall( cudaThreadSynchronize() );
-    }
-
-    void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)
-    {
         dim3 threads(16, 16);
 
         dim3 grid;
-        grid.x = divUp(x_size, threads.x);
-        grid.y = divUp(y_size, threads.y) * threadsOld.z;
-        
-        fasthessian_old<<<grid, threads>>>(hessianBuffer);
+        grid.x = divUp(max_samples_j, threads.x);
+        grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
+
+        icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
         cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaThreadSynchronize() );
@@ -325,109 +190,113 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////
     // NONMAX
     
-    texture<int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+    IntTex maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
 
     struct WithOutMask
     {
-        static __device__ bool check(float, float, float)
+        static __device__ bool check(int, int, int)
         {
             return true;
         }
     };
+
+    __constant__ float c_DM[1][5] = {{0, 0, 9, 9, 1}};
+
     struct WithMask
     {
-        static __device__ bool check(float x, float y, float fscale)
+        static __device__ bool check(int sum_i, int sum_j, int size)
         {
-            float half_width = fscale / 2;
-
-            float result = 0.f;
-
-            result += tex2D(maskSumTex, x - half_width, y - half_width);
-            result -= tex2D(maskSumTex, x + half_width, y - half_width);
-            result -= tex2D(maskSumTex, x - half_width, y + half_width);
-            result += tex2D(maskSumTex, x + half_width, y + half_width);
-
-            result /= (fscale * fscale);
-
-            return (result >= 0.5f);
+            float mval = icvCalcHaarPattern<1>(maskSumTex, c_DM , 9, size, sum_i, sum_j);
+            return (mval >= 0.5);
         }
     };
 
     template <typename Mask>
-    __global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)
+    __global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
     {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
 
-        extern __shared__ float fh_vals[];
+        extern __shared__ float N9[];
 
         // The hidx variables are the indices to the hessian buffer.
-        int hidx_x = threadIdx.x + blockIdx.x * (blockDim.x - 2);
-        int hidx_y = threadIdx.y + blockIdx.y * (blockDim.y - 2);
-        int hidx_z = threadIdx.z;
-        int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+        const int gridDim_y = gridDim.y / c_nOctaveLayers;
+        const int blockIdx_y = blockIdx.y % gridDim_y;
+        const int blockIdx_z = blockIdx.y / gridDim_y;
 
-        // Is this thread within the hessian buffer?
-        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
-        {
-            fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];
-        }
-        __syncthreads();
+        const int layer = blockIdx_z + 1;
+
+        const int size = calcSize(c_octave, layer);
 
-        // Is this location one of the ones being processed for nonmax suppression.
-        // Blocks overlap by one so we don't process the border threads.
-        bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1 
-            &&           threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < c_y_size - 1
-            &&           threadIdx.z > 0 && threadIdx.z < blockDim.z-1;
+        /* Ignore pixels without a 3x3x3 neighbourhood in the layer above */
+        const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
 
-        float val = fh_vals[localLin];
+        const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
+        const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
 
-        // Compute the lookup location of the mask center
-        float x = hidx_x * c_step + c_border;
-        float y = hidx_y * c_step + c_border;
-        float fscale = calcScale(hidx_z);
+        // Is this thread within the hessian buffer?
+        const int zoff = blockDim.x * blockDim.y;
+        const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
+        N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + i)[j];
+        N9[localLin       ] = det.ptr(c_layer_rows * (layer    ) + i)[j];
+        N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + i)[j];
+        __syncthreads();
 
-        if (inBounds2 && val >= c_threshold && Mask::check(x, y, fscale))
+        if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
         {
-            // Check to see if we have a max (in its 26 neighbours)
-            int zoff = blockDim.x * blockDim.y;
-            bool condmax  =  val > fh_vals[localLin                     + 1]
-            &&               val > fh_vals[localLin                     - 1]
-            &&               val > fh_vals[localLin        - blockDim.x + 1]
-            &&               val > fh_vals[localLin        - blockDim.x    ]
-            &&               val > fh_vals[localLin        - blockDim.x - 1]
-            &&               val > fh_vals[localLin        + blockDim.x + 1]
-            &&               val > fh_vals[localLin        + blockDim.x    ]
-            &&               val > fh_vals[localLin        + blockDim.x - 1]
-
-            &&               val > fh_vals[localLin - zoff              + 1]
-            &&               val > fh_vals[localLin - zoff                 ]
-            &&               val > fh_vals[localLin - zoff              - 1]
-            &&               val > fh_vals[localLin - zoff - blockDim.x + 1]
-            &&               val > fh_vals[localLin - zoff - blockDim.x    ]
-            &&               val > fh_vals[localLin - zoff - blockDim.x - 1]
-            &&               val > fh_vals[localLin - zoff + blockDim.x + 1]
-            &&               val > fh_vals[localLin - zoff + blockDim.x    ]
-            &&               val > fh_vals[localLin - zoff + blockDim.x - 1]
-
-            &&               val > fh_vals[localLin + zoff              + 1]
-            &&               val > fh_vals[localLin + zoff                 ]
-            &&               val > fh_vals[localLin + zoff              - 1]
-            &&               val > fh_vals[localLin + zoff - blockDim.x + 1]
-            &&               val > fh_vals[localLin + zoff - blockDim.x    ]
-            &&               val > fh_vals[localLin + zoff - blockDim.x - 1]
-            &&               val > fh_vals[localLin + zoff + blockDim.x + 1]
-            &&               val > fh_vals[localLin + zoff + blockDim.x    ]
-            &&               val > fh_vals[localLin + zoff + blockDim.x - 1]
-            ;
-
-            if(condmax) 
+            float val0 = N9[localLin];
+
+            if (val0 > c_hessianThreshold)
             {
-                unsigned int i = atomicInc(maxCounter,(unsigned int) -1);
+                // Coordinates for the start of the wavelet in the sum image. There
+                // is some integer division involved, so don't try to simplify this
+                // (cancel out sampleStep) without checking the result is the same
+                const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
+                const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
 
-                if (i < c_max_candidates) 
+                if (Mask::check(sum_i, sum_j, size))
                 {
-                    int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave};
-                    maxPosBuffer[i] = f;
+                    // Check to see if we have a max (in its 26 neighbours)
+                    const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
+                    &&                   val0 > N9[localLin     - blockDim.x - zoff]
+                    &&                   val0 > N9[localLin + 1 - blockDim.x - zoff]
+                    &&                   val0 > N9[localLin - 1              - zoff]
+                    &&                   val0 > N9[localLin                  - zoff]
+                    &&                   val0 > N9[localLin + 1              - zoff]
+                    &&                   val0 > N9[localLin - 1 + blockDim.x - zoff]
+                    &&                   val0 > N9[localLin     + blockDim.x - zoff]
+                    &&                   val0 > N9[localLin + 1 + blockDim.x - zoff]
+
+                    &&                   val0 > N9[localLin - 1 - blockDim.x]
+                    &&                   val0 > N9[localLin     - blockDim.x]
+                    &&                   val0 > N9[localLin + 1 - blockDim.x]
+                    &&                   val0 > N9[localLin - 1             ]
+                    &&                   val0 > N9[localLin + 1             ]
+                    &&                   val0 > N9[localLin - 1 + blockDim.x]
+                    &&                   val0 > N9[localLin     + blockDim.x]
+                    &&                   val0 > N9[localLin + 1 + blockDim.x]
+
+                    &&                   val0 > N9[localLin - 1 - blockDim.x + zoff]
+                    &&                   val0 > N9[localLin     - blockDim.x + zoff]
+                    &&                   val0 > N9[localLin + 1 - blockDim.x + zoff]
+                    &&                   val0 > N9[localLin - 1              + zoff]
+                    &&                   val0 > N9[localLin                  + zoff]
+                    &&                   val0 > N9[localLin + 1              + zoff]
+                    &&                   val0 > N9[localLin - 1 + blockDim.x + zoff]
+                    &&                   val0 > N9[localLin     + blockDim.x + zoff]
+                    &&                   val0 > N9[localLin + 1 + blockDim.x + zoff]
+                    ;
+
+                    if(condmax)
+                    {
+                        unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
+
+                        if (ind < c_max_candidates)
+                        {
+                            const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
+
+                            maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
+                        }
+                    }
                 }
             }
         }
@@ -435,21 +304,26 @@ namespace cv { namespace gpu { namespace surf
         #endif
     }
 
-    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, 
-        int x_size, int y_size, bool use_mask, const dim3& threads)
+    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+        int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
     {
-        dim3 grid;
-        grid.x = divUp(x_size, threads.x - 2);
-        grid.y = divUp(y_size, threads.y - 2);
+        const int layer_rows = img_rows >> octave;
+        const int layer_cols = img_cols >> octave;
+
+        int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
+
+        dim3 threads(16, 16);
 
-        const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
+        dim3 grid;
+        grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
+        grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
 
-        DeviceReference<unsigned int> maxCounterWrapper(maxCounter);
+        const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
 
         if (use_mask)
-            nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
+            icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
         else
-            nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
+            icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
 
         cudaSafeCall( cudaGetLastError() );
 
@@ -459,166 +333,117 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////
     // INTERPOLATION
     
-    #define MID_IDX 1
-    __global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer, 
-        KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
+    __global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
     {
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
 
-        int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x;
-        int hidx_y = maxPosBuffer[blockIdx.x].y - 1 + threadIdx.y;
-        int hidx_z = maxPosBuffer[blockIdx.x].z - 1 + threadIdx.z;
+        const int4 maxPos = maxPosBuffer[blockIdx.x];
+
+        const int j = maxPos.x - 1 + threadIdx.x;
+        const int i = maxPos.y - 1 + threadIdx.y;
+        const int layer = maxPos.z - 1 + threadIdx.z;
 
-        __shared__ float fh_vals[3][3][3];
+        __shared__ float N9[3][3][3];
         __shared__ KeyPoint_GPU p;
 
-        fh_vals[threadIdx.z][threadIdx.y][threadIdx.x] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];
+         N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
         __syncthreads();
 
         if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
         {
-            __shared__ float H[3][3];
-
-            //dxx
-            H[0][0] =    fh_vals[MID_IDX    ][MID_IDX + 1][MID_IDX    ] 
-            -       2.0f*fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX    ]
-            +            fh_vals[MID_IDX    ][MID_IDX - 1][MID_IDX    ];
+            __shared__ float dD[3];
 
-            //dyy
-            H[1][1] =    fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX + 1]
-            -       2.0f*fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX    ]
-            +            fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX - 1];
+            //dx
+            dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
+            //dy
+            dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
+            //ds
+            dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
 
-            //dss
-            H[2][2] =    fh_vals[MID_IDX + 1][MID_IDX    ][MID_IDX    ] 
-            -       2.0f*fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX    ]
-            +            fh_vals[MID_IDX - 1][MID_IDX    ][MID_IDX    ];
+            __shared__ float H[3][3];
 
+            //dxx
+            H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
             //dxy
-            H[0][1]= 0.25f*
-                (fh_vals[MID_IDX    ][MID_IDX + 1][MID_IDX + 1] -
-                 fh_vals[MID_IDX    ][MID_IDX - 1][MID_IDX + 1] -
-                 fh_vals[MID_IDX    ][MID_IDX + 1][MID_IDX - 1] +
-                 fh_vals[MID_IDX    ][MID_IDX - 1][MID_IDX - 1]);
-
+            H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
             //dxs
-            H[0][2]= 0.25f*
-                (fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX    ] -
-                 fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX    ] -
-                 fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX    ] +
-                 fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX    ]);
-
-            //dys
-            H[1][2]= 0.25f*
-                (fh_vals[MID_IDX + 1][MID_IDX    ][MID_IDX + 1] -
-                 fh_vals[MID_IDX + 1][MID_IDX    ][MID_IDX - 1] -
-                 fh_vals[MID_IDX - 1][MID_IDX    ][MID_IDX + 1] +
-                 fh_vals[MID_IDX - 1][MID_IDX    ][MID_IDX - 1]);
-
+            H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
             //dyx = dxy
             H[1][0] = H[0][1];
-
+            //dyy
+            H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
+            //dys
+            H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
             //dsx = dxs
             H[2][0] = H[0][2];
-
             //dsy = dys
             H[2][1] = H[1][2];
+            //dss
+            H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
 
-            __shared__ float dD[3];
+            float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1])
+              - H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0])
+              + H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]);
 
-            //dx
-            dD[0] = 0.5f*(fh_vals[MID_IDX    ][MID_IDX + 1][MID_IDX    ] -
-                fh_vals[MID_IDX    ][MID_IDX - 1][MID_IDX    ]);
-            //dy
-            dD[1] = 0.5f*(fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX + 1] -
-                fh_vals[MID_IDX    ][MID_IDX    ][MID_IDX - 1]);
-            //ds
-            dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX    ][MID_IDX    ] -
-                fh_vals[MID_IDX - 1][MID_IDX    ][MID_IDX    ]);
-
-            __shared__ float invdet;
-            invdet = 1.f /
-                (
-                H[0][0]*H[1][1]*H[2][2] 
-                +   H[0][1]*H[1][2]*H[2][0]
-                +   H[0][2]*H[1][0]*H[2][1]
-                -   H[0][0]*H[1][2]*H[2][1]
-                -   H[0][1]*H[1][0]*H[2][2]
-                -   H[0][2]*H[1][1]*H[2][0]
-                );
-
-            //   // 1-based entries of a 3x3 inverse
-            //   /*             [ |a22 a23|   |a12 a13|  |a12 a13|]     */
-            //   /*             [ |a32 a33|  -|a32 a33|  |a22 a23|]     */
-            //   /*             [                                 ]     */
-            //   /*             [ |a21 a23|   |a11 a13|  |a11 a13|]     */
-            //   /*    A^(-1) = [-|a31 a33|   |a31 a33| -|a21 a23|] / d */
-            //   /*             [                                 ]     */
-            //   /*             [ |a21 a22|   |a11 a12|  |a11 a12|]     */
-            //   /*             [ |a31 a32|  -|a31 a32|  |a21 a22|]     */
-
-            __shared__ float Hinv[3][3];
-            Hinv[0][0] =  invdet*(H[1][1]*H[2][2]-H[1][2]*H[2][1]);
-            Hinv[0][1] = -invdet*(H[0][1]*H[2][2]-H[0][2]*H[2][1]);
-            Hinv[0][2] =  invdet*(H[0][1]*H[1][2]-H[0][2]*H[1][1]);
-
-            Hinv[1][0] = -invdet*(H[1][0]*H[2][2]-H[1][2]*H[2][0]);
-            Hinv[1][1] =  invdet*(H[0][0]*H[2][2]-H[0][2]*H[2][0]);
-            Hinv[1][2] = -invdet*(H[0][0]*H[1][2]-H[0][2]*H[1][0]);
-
-            Hinv[2][0] =  invdet*(H[1][0]*H[2][1]-H[1][1]*H[2][0]);
-            Hinv[2][1] = -invdet*(H[0][0]*H[2][1]-H[0][1]*H[2][0]);
-            Hinv[2][2] =  invdet*(H[0][0]*H[1][1]-H[0][1]*H[1][0]);
-
-            __shared__ float x[3];
-
-            x[0] = -(Hinv[0][0]*(dD[0]) + Hinv[0][1]*(dD[1]) + Hinv[0][2]*(dD[2]));
-            x[1] = -(Hinv[1][0]*(dD[0]) + Hinv[1][1]*(dD[1]) + Hinv[1][2]*(dD[2]));
-            x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2]));
-
-            if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f) 
+            if (det != 0.0f)
             {
-                // if the step is within the interpolation region, perform it
+                float invdet = 1.0f / det;
 
-                // Get a new feature index.
-                unsigned int i = atomicInc(featureCounter, (unsigned int)-1);
+                __shared__ float x[3];
 
-                if (i < c_max_features)
+                x[0] = invdet * 
+                    (dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) -
+                     H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) +
+                     H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2]));
+
+                x[1] = invdet * 
+                    (H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) -
+                     dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) +
+                     H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0]));
+
+                x[2] = invdet * 
+                    (H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) -
+                     H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) +
+                     dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]));
+
+                if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
                 {
-                    p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border;
-                    p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;
+                    // if the step is within the interpolation region, perform it
 
-                    if (x[2] > 0)
-                    {
-                        float a = calcScale(maxPosBuffer[blockIdx.x].z);
-                        float b = calcScale(maxPosBuffer[blockIdx.x].z + 1);
+                    // Get a new feature index.
+                    unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
 
-                        p.size = (1.f - x[2]) * a + x[2] * b;
-                    }
-                    else
+                    if (ind < c_max_features)
                     {
-                        float a = calcScale(maxPosBuffer[blockIdx.x].z);
-                        float b = calcScale(maxPosBuffer[blockIdx.x].z - 1);
+                        const int size = calcSize(c_octave, maxPos.z);
 
-                        p.size = (1.f + x[2]) * a - x[2] * b;
-                    }
+                        const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
+                        const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
+                        
+                        const float center_i = sum_i + (float)(size - 1) / 2;
+                        const float center_j = sum_j + (float)(size - 1) / 2;
 
-                    p.octave = c_octave;
+                        p.x = center_j + x[0] * (1 << c_octave);
+                        p.y = center_i + x[1] * (1 << c_octave);
 
-                    p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX];
+                        int ds = size - calcSize(c_octave, maxPos.z - 1);
+                        p.size = roundf(size + x[2] * ds);
 
-                    // Should we split up this transfer over many threads?
-                    featuresBuffer[i] = p;
-                }
-            } // If the subpixel interpolation worked
+                        p.laplacian = maxPos.w;
+                        p.dir = 0.0f;
+                        p.hessian = N9[1][1][1];
+
+                        // Should we split up this transfer over many threads?
+                        featuresBuffer[ind] = p;
+                    }
+                } // If the subpixel interpolation worked
+            }
         } // If this is thread 0.
 
         #endif
     }
-    #undef MID_IDX
 
-    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, 
-        KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter)
+    void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)
     {
         dim3 threads;
         threads.x = 3;
@@ -628,9 +453,7 @@ namespace cv { namespace gpu { namespace surf
         dim3 grid;
         grid.x = maxCounter;
 
-        DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
-
-        fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);
+        icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featuresBuffer, featureCounter);
         cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaThreadSynchronize() );
@@ -639,139 +462,217 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////
     // Orientation
 
-    // precomputed values for a Gaussian with a standard deviation of 2
-    __constant__ float c_gauss1D[13] = 
-    {
-        0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f, 
-        0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f, 
-        0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f, 
-        0.002215924206f
-    };
+    #define ORI_SEARCH_INC 5
+    #define ORI_WIN        60
+    #define ORI_SAMPLES    113
 
-    __global__ void find_orientation(KeyPoint_GPU* features)
-    {
-        int tid = threadIdx.y * 17 + threadIdx.x;
-        int tid2 = numeric_limits_gpu<int>::max();
+    __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
+    __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
+    __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
+    
+    __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
+    __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
 
-        if (threadIdx.x < 13 && threadIdx.y < 13) 
-        {
-            tid2 = threadIdx.y * 13 + threadIdx.x;
-        }
+    __global__ void icvCalcOrientation(const KeyPoint_GPU* featureBuffer, KeyPoint_GPU* keypoints, unsigned int* keypointCounter)
+    {        
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
 
-        __shared__ float texLookups[17][17];
+        __shared__ float s_X[128];
+        __shared__ float s_Y[128];
+        __shared__ float s_angle[128];
 
-        __shared__ float Edx[13*13];
-        __shared__ float Edy[13*13];
-        __shared__ float xys[3];
+        __shared__ float s_sumx[64 * 4];
+        __shared__ float s_sumy[64 * 4];
 
-        // Read my x, y, size.
-        if (tid < 3)
-        {
-            xys[tid] = ((float*)(&features[blockIdx.x]))[tid];
-        }
+        __shared__ float s_feature[6];
+
+        if (threadIdx.x < 6 && threadIdx.y == 0)
+            s_feature[threadIdx.x] = ((float*)(&featureBuffer[blockIdx.x]))[threadIdx.x];
         __syncthreads();
 
-        // Read all texture locations into memory
-        // Maybe I should use __mul24 here?
-        texLookups[threadIdx.x][threadIdx.y] = tex2D(sumTex, xys[SF_X] + ((int)threadIdx.x - 8) * xys[SF_SIZE], 
-                  xys[SF_Y] + ((int)threadIdx.y - 8) * xys[SF_SIZE]);
 
-        __syncthreads();
+        /* The sampling intervals and wavelet sized for selecting an orientation
+         and building the keypoint descriptor are defined relative to 's' */
+        const float s = s_feature[SF_SIZE] * 1.2f / 9.0f;
 
-        float dx = 0.f;
-        float dy = 0.f;
+        /* To find the dominant orientation, the gradients in x and y are
+         sampled in a circle of radius 6s using wavelets of size 4s.
+         We ensure the gradient wavelet size is even to ensure the
+         wavelet pattern is balanced and symmetric around its center */
+        const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
 
-        // Computes lookups for all points in a 13x13 lattice.
-        // - SURF says to only use a circle, but the branching logic would slow it down
-        // - Gaussian weighting should reduce the effects of the outer points anyway
-        if (tid2 < 169)
+        // check when grad_wav_size is too big
+        if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
         {
-            dx -=     texLookups[threadIdx.x    ][threadIdx.y    ];
-            dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y    ];
-            dx -=     texLookups[threadIdx.x + 4][threadIdx.y    ];
-            dx +=     texLookups[threadIdx.x    ][threadIdx.y + 4];
-            dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];
-            dx +=     texLookups[threadIdx.x + 4][threadIdx.y + 4];
-
-            dy -=     texLookups[threadIdx.x    ][threadIdx.y    ];
-            dy += 2.f*texLookups[threadIdx.x    ][threadIdx.y + 2];
-            dy -=     texLookups[threadIdx.x    ][threadIdx.y + 4];
-            dy +=     texLookups[threadIdx.x + 4][threadIdx.y    ];
-            dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];
-            dy +=     texLookups[threadIdx.x + 4][threadIdx.y + 4];
-
-            float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y];
-
-            Edx[tid2] = dx * g;
-            Edy[tid2] = dy * g;
-        }
+            // Calc X, Y, angle and store it to shared memory
+            {
+                const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
-        __syncthreads();
+                float X = 0.0f, Y = 0.0f, angle = 0.0f;
 
-        // This is a scan to get the summed dx, dy values.
-        // Gets 128-168
-        if (tid < 41)
-        {
-            Edx[tid] += Edx[tid + 128];
-        } 
-        __syncthreads();
-        if (tid < 64)
-        {
-            Edx[tid] += Edx[tid + 64];
-        }
-        __syncthreads();
-        if (tid < 32)
-        {
-            volatile float* smem = Edx;
-
-            smem[tid] += smem[tid + 32];
-            smem[tid] += smem[tid + 16];
-            smem[tid] += smem[tid + 8];
-            smem[tid] += smem[tid + 4];
-            smem[tid] += smem[tid + 2];
-            smem[tid] += smem[tid + 1];
-        }
+                if (tid < ORI_SAMPLES)
+                {
+                    const float margin = (float)(grad_wav_size - 1) / 2.0f;
+                    const int x = __float2int_rn(s_feature[SF_X] + c_aptX[tid] * s - margin);
+                    const int y = __float2int_rn(s_feature[SF_Y] + c_aptY[tid] * s - margin);
 
-        // Gets 128-168
-        if (tid < 41) 
-        {
-            Edy[tid] += Edy[tid + 128];
-        }
-        __syncthreads();
-        if (tid < 64)
-        {
-            Edy[tid] += Edy[tid + 64];
-        }
-        __syncthreads();
-        if (tid < 32)
-        {
-            volatile float* smem = Edy;
-
-            smem[tid] += smem[tid + 32];
-            smem[tid] += smem[tid + 16];
-            smem[tid] += smem[tid + 8];
-            smem[tid] += smem[tid + 4];
-            smem[tid] += smem[tid + 2];
-            smem[tid] += smem[tid + 1];
-        }
+                    if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
+                    {
+                        X = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NX, 4, grad_wav_size, y, x);
+                        Y = c_aptW[tid] * icvCalcHaarPattern<2>(sumTex, c_NY, 4, grad_wav_size, y, x);
+                    
+                        angle = atan2f(Y, X);
+                        if (angle < 0)
+                            angle += 2.0f * CV_PI;
+                        angle *= 180.0f / CV_PI;
+                    }
+                }
+                if (tid < 128)
+                {
+                    s_X[tid] = X;
+                    s_Y[tid] = Y;
+                    s_angle[tid] = angle;
+                }
+            }
+            __syncthreads();
 
-        // Thread 0 saves back the result.
-        if (tid == 0)
-        {
-            features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI);
+            float bestx = 0, besty = 0, best_mod = 0;
+
+            #pragma unroll
+            for (int i = 0; i < 18; ++i)
+            {
+                const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
+
+                float sumx = 0.0f, sumy = 0.0f;
+                int d = abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx = s_X[threadIdx.x];
+                    sumy = s_Y[threadIdx.x];
+                }
+                d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 64];
+                    sumy += s_Y[threadIdx.x + 64];
+                }
+
+                float* s_sumx_row = s_sumx + threadIdx.y * 64;
+                float* s_sumy_row = s_sumy + threadIdx.y * 64;
+
+                s_sumx_row[threadIdx.x] = sumx;
+                s_sumy_row[threadIdx.x] = sumy;
+                __syncthreads();
+
+                if (threadIdx.x < 32)
+                {
+                    volatile float* v_sumx_row = s_sumx_row;
+                    volatile float* v_sumy_row = s_sumy_row;
+
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];
+
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];
+
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];
+
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];
+
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];
+
+                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];
+                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];
+                }
+
+                const float temp_mod = sumx * sumx + sumy * sumy;
+                if (temp_mod > best_mod)
+                {
+                    best_mod = temp_mod;
+                    bestx = sumx;
+                    besty = sumy;
+                }
+                __syncthreads();
+            }
+
+            if (threadIdx.x == 0)
+            {
+                s_X[threadIdx.y] = bestx;
+                s_Y[threadIdx.y] = besty;
+                s_angle[threadIdx.y] = best_mod;
+            }
+            __syncthreads();
+
+            if (threadIdx.x < 2 && threadIdx.y == 0)
+            {
+                volatile float* v_x = s_X;
+                volatile float* v_y = s_Y;
+                volatile float* v_mod = s_angle;
+
+                bestx = v_x[threadIdx.x];
+                besty = v_y[threadIdx.x];
+                best_mod = v_mod[threadIdx.x];
+
+                float temp_mod = v_mod[threadIdx.x + 2];
+                if (temp_mod > best_mod)
+                {
+                    v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];
+                    v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];
+                    v_mod[threadIdx.x] = best_mod = temp_mod;
+                }
+                temp_mod = v_mod[threadIdx.x + 1];
+                if (temp_mod > best_mod)
+                {
+                    v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];
+                    v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];
+                }
+            }
+
+            if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)
+            {
+                // Get a new feature index.
+                unsigned int ind = atomicInc(keypointCounter, (unsigned int)-1);
+
+                if (ind < c_max_keypoints)
+                {
+                    float kp_dir = atan2f(besty, bestx);
+                    if (kp_dir < 0)
+                        kp_dir += 2.0f * CV_PI;
+                    kp_dir *= 180.0f / CV_PI;
+                    __shared__ KeyPoint_GPU kp;
+
+                    kp.x = s_feature[SF_X];
+                    kp.y = s_feature[SF_Y];
+                    kp.laplacian = s_feature[SF_LAPLACIAN];
+                    kp.size = s_feature[SF_SIZE];
+                    kp.dir = kp_dir;
+                    kp.hessian = s_feature[SF_HESSIAN];
+
+                    keypoints[ind] = kp;
+                }
+            }
         }
+
+        #endif
     }
 
-    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures) 
+    #undef ORI_SEARCH_INC
+    #undef ORI_WIN
+    #undef ORI_SAMPLES
+
+    void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter) 
     {
         dim3 threads;
-        threads.x = 17;
-        threads.y = 17;
+        threads.x = 64;
+        threads.y = 4;
 
         dim3 grid;
         grid.x = nFeatures;
 
-        find_orientation<<<grid, threads>>>(features);
+        icvCalcOrientation<<<grid, threads>>>(featureBuffer, keypoints, keypointCounter);
         cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaThreadSynchronize() );
@@ -780,117 +681,119 @@ namespace cv { namespace gpu { namespace surf
     ////////////////////////////////////////////////////////////////////////
     // Descriptors
 
-    // precomputed values for a Gaussian with a standard deviation of 3.3
-    // - it appears SURF uses a different value, but not sure what it is
-    __constant__ float c_3p3gauss1D[20] =
-    {
-        0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
-        0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
-        0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
-        0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f
+    #define PATCH_SZ 20
+
+    texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
+
+    __constant__ float c_DW[PATCH_SZ * PATCH_SZ] = 
+    {
+        3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, 
+        8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, 
+        1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, 
+        3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, 
+        5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, 
+        9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, 
+        0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, 
+        0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, 
+        0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, 
+        0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, 
+        0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, 
+        0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, 
+        0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, 
+        0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, 
+        9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, 
+        5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, 
+        3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, 
+        1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, 
+        8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, 
+        3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
     };
 
-    template <int BLOCK_DIM_X>
-    __global__ void normalize_descriptors(PtrStepf descriptors)
+    __device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2)
     {
-        // no need for thread ID
-        float* descriptor_base = descriptors.ptr(blockIdx.x);
+        const float centerX = s_pt[SF_X];
+        const float centerY = s_pt[SF_Y];
+        const float size = s_pt[SF_SIZE];
+        const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);
 
-        // read in the unnormalized descriptor values (squared)
-        __shared__ float sqDesc[BLOCK_DIM_X];
-        const float lookup = descriptor_base[threadIdx.x];
-        sqDesc[threadIdx.x] = lookup * lookup;
-        __syncthreads();
+        /* The sampling intervals and wavelet sized for selecting an orientation
+         and building the keypoint descriptor are defined relative to 's' */
+        const float s = size * 1.2f / 9.0f;
 
-        if (BLOCK_DIM_X >= 128)
-        {
-            if (threadIdx.x < 64)
-                sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
-            __syncthreads();
-        }
+        /* Extract a window of pixels around the keypoint of size 20s */
+        const int win_size = (int)((PATCH_SZ + 1) * s);
 
-        // reduction to get total
-        if (threadIdx.x < 32)
-        {
-            volatile float* smem = sqDesc;
+        float sin_dir;
+        float cos_dir;
+        sincosf(descriptor_dir, &sin_dir, &cos_dir);
 
-            smem[threadIdx.x] += smem[threadIdx.x + 32];
-            smem[threadIdx.x] += smem[threadIdx.x + 16];
-            smem[threadIdx.x] += smem[threadIdx.x + 8];
-            smem[threadIdx.x] += smem[threadIdx.x + 4];
-            smem[threadIdx.x] += smem[threadIdx.x + 2];
-            smem[threadIdx.x] += smem[threadIdx.x + 1];
-        }
+        /* Nearest neighbour version (faster) */
+        const float win_offset = -(float)(win_size - 1) / 2; 
 
-        // compute length (square root)
-        __shared__ float len;
-        if (threadIdx.x == 0)
-        {
-            len = sqrtf(sqDesc[0]);
-        }
-        __syncthreads();
+        /* Scale the window to size PATCH_SZ so each pixel's size is s. This
+           makes calculating the gradients with wavelets of size 2s easy */
+        const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;
+        const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size;
 
-        // normalize and store in output
-        descriptor_base[threadIdx.x] = lookup / len;
-    }
+        const int i = __float2int_rd(icoo);
+        const int j = __float2int_rd(jcoo);
 
-    __device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt,
-                               int xIndex, int yIndex, int tid)
-    {
-        float sin_theta, cos_theta;
-        sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
-
-        // Compute rotated sampling points
-        // (clockwise rotation since we are rotating the lattice)
-        // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
-        const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
-            + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
-        const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
-            + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
-
-        // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
-        // a b c
-        // d   f
-        // g h i
-        const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
-        const float b = tex2D(sumTex, sample_x,                sample_y - ipt[SF_SIZE]);
-        const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
-        const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);
-        const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);
-        const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
-        const float h = tex2D(sumTex, sample_x,                sample_y + ipt[SF_SIZE]);
-        const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
-
-        // compute axis-aligned HaarX, HaarY
-        // (could group the additions together into multiplications)
-        const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)
-        const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i));            // unrotated dx
-        const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i));            // unrotated dy
-
-        // rotate responses (store all dxs then all dys)
-        // - counterclockwise rotation to rotate back to zero orientation
-        sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
-        sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
+        float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
+        float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
+
+        float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo);
+
+        pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir;
+        pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir;
+
+        res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo);
+
+        pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir;
+        pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir;
+
+        res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j);
+        
+        pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir;
+        pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir;
+
+        res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j);
+
+        s_PATCH[i2][j2] = (unsigned char)res;
     }
 
-    __device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)//(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)
+    __device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)
     {
         // get the interest point parameters (x, y, size, response, angle)
-        __shared__ float ipt[5];
-        if (threadIdx.x < 5 && threadIdx.y == 0)
+        __shared__ float s_pt[5];
+        if (tid < 5)
         {
-            ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
+            s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid];
         }
         __syncthreads();
 
         // Compute sampling points
         // since grids are 2D, need to compute xBlock and yBlock indices
-        const int xBlock = (threadIdx.y & 3);  // threadIdx.y % 4
-        const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4)
-        const int xIndex = (xBlock * 5) + (threadIdx.x % 5);
-        const int yIndex = (yBlock * 5) + (threadIdx.x / 5);
+        const int xBlock = (blockIdx.y & 3);  // blockIdx.y % 4
+        const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
+        const int xIndex = xBlock * blockDim.x + threadIdx.x;
+        const int yIndex = yBlock * blockDim.y + threadIdx.y;
+
+        calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x);
+        if (threadIdx.x == 0)
+            calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5);
+        if (threadIdx.y == 0)
+            calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x);
+        if (threadIdx.x == 0 && threadIdx.y == 0)
+            calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5);
+        __syncthreads();
+
+        const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
+
+        const float vx = (s_PATCH[threadIdx.y    ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x    ]) * dw;
+        const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x    ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y    ][threadIdx.x + 1]) * dw;
 
-        calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x);
+        s_dx_bin[tid] = vx;
+        s_dy_bin[tid] = vy;
     }
 
     __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
@@ -934,192 +837,8 @@ namespace cv { namespace gpu { namespace surf
     // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
     __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
     {
-        // 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region)
-        __shared__ float sdx   [16 * 25];
-        __shared__ float sdy   [16 * 25];
-        __shared__ float sdxabs[16 * 25];
-        __shared__ float sdyabs[16 * 25];
-
-        __shared__ float sdesc[64];
-
-        float* sdx_bin    = sdx    + (threadIdx.y * 25);
-        float* sdy_bin    = sdy    + (threadIdx.y * 25);
-        float* sdxabs_bin = sdxabs + (threadIdx.y * 25);
-        float* sdyabs_bin = sdyabs + (threadIdx.y * 25);
-
-        calc_dx_dy(sdx_bin, sdy_bin, features);
-        __syncthreads();
-
-        sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array
-        sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array
-        __syncthreads();
-
-        reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x);
-        __syncthreads();
-
-        float* sdesc_bin = sdesc + (threadIdx.y << 2);
-
-        // write dx, dy, |dx|, |dy|
-        if (threadIdx.x == 0)
-        {
-            sdesc_bin[0] = sdx_bin[0];
-            sdesc_bin[1] = sdy_bin[0];
-            sdesc_bin[2] = sdxabs_bin[0];
-            sdesc_bin[3] = sdyabs_bin[0];
-        }
-        __syncthreads();
-
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-        if (tid < 64)
-            descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
-    }
-
-    // Spawn 16 blocks per interest point
-    // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
-    __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
-    {
-        // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
-        __shared__ float sdx[16 * 25];
-        __shared__ float sdy[16 * 25];
-
-        // sum (reduce) 5x5 area response
-        __shared__ float sd1[16 * 25];
-        __shared__ float sd2[16 * 25];
-        __shared__ float sdabs1[16 * 25];
-        __shared__ float sdabs2[16 * 25];
-
-        __shared__ float sdesc[128];
-
-        float* sdx_bin    = sdx    + (threadIdx.y * 25);
-        float* sdy_bin    = sdy    + (threadIdx.y * 25);
-        float* sd1_bin    = sd1    + (threadIdx.y * 25);
-        float* sd2_bin    = sd2    + (threadIdx.y * 25);
-        float* sdabs1_bin = sdabs1 + (threadIdx.y * 25);
-        float* sdabs2_bin = sdabs2 + (threadIdx.y * 25);
-
-        calc_dx_dy(sdx_bin, sdy_bin, features);
-        __syncthreads();
-
-        if (sdy_bin[threadIdx.x] >= 0)
-        {
-            sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x];
-            sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);
-            sd2_bin[threadIdx.x] = 0;
-            sdabs2_bin[threadIdx.x] = 0;
-        }
-        else
-        {
-            sd1_bin[threadIdx.x] = 0;
-            sdabs1_bin[threadIdx.x] = 0;
-            sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x];
-            sdabs2_bin[threadIdx.x] = fabs(sdx[threadIdx.x]);
-        }
-        __syncthreads();
-
-        reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
-        __syncthreads();
-
-        float* sdesc_bin = sdesc + (threadIdx.y << 3);
-
-        // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
-        if (threadIdx.x == 0)
-        {
-                sdesc_bin[0] = sd1_bin[0];
-                sdesc_bin[1] = sdabs1_bin[0];
-                sdesc_bin[2] = sd2_bin[0];
-                sdesc_bin[3] = sdabs2_bin[0];
-        }
-        __syncthreads();
-
-        if (sdx_bin[threadIdx.x] >= 0)
-        {
-            sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x];
-            sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
-            sd2_bin[threadIdx.x] = 0;
-            sdabs2_bin[threadIdx.x] = 0;
-        }
-        else
-        {
-            sd1_bin[threadIdx.x] = 0;
-            sdabs1_bin[threadIdx.x] = 0;
-            sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x];
-            sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
-        }
-        __syncthreads();
-
-        reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
-        __syncthreads();
-
-        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
-        if (threadIdx.x == 0)
-        {
-                sdesc_bin[4] = sd1_bin[0];
-                sdesc_bin[5] = sdabs1_bin[0];
-                sdesc_bin[6] = sd2_bin[0];
-                sdesc_bin[7] = sdabs2_bin[0];
-        }
-        __syncthreads();
-
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-        if (tid < 128)
-            descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
-    }
-
-    void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
-    {
-        // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
-        
-        if (descriptors.cols == 64)
-        {
-            compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaThreadSynchronize() );
-
-            normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaThreadSynchronize() );
-
-            normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaThreadSynchronize() );
-        }
-    }
-
-    __device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid)
-    {
-        // get the interest point parameters (x, y, scale, strength, theta)
-        __shared__ float ipt[5];
-        if (tid < 5)
-        {
-            ipt[tid] = ((float*)&features[blockIdx.x])[tid];
-        }
-        __syncthreads();
-
-        // Compute sampling points
-        // since grids are 2D, need to compute xBlock and yBlock indices
-        const int xBlock = (blockIdx.y & 3);  // blockIdx.y % 4
-        const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
-        const int xIndex = xBlock * blockDim.x + threadIdx.x;
-        const int yIndex = yBlock * blockDim.y + threadIdx.y;
-
-        calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid);
-    }
-
-    // Spawn 16 blocks per interest point
-    // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
-    __global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
-    {
         // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
+        __shared__ float s_PATCH[6][6];
         __shared__ float sdx[25];
         __shared__ float sdy[25];
         __shared__ float sdxabs[25];
@@ -1127,7 +846,7 @@ namespace cv { namespace gpu { namespace surf
 
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
-        calc_dx_dy_old(sdx, sdy, features, tid);
+        calc_dx_dy(s_PATCH, sdx, sdy, features, tid);
         __syncthreads();
 
         sdxabs[tid] = fabs(sdx[tid]); // |dx| array
@@ -1151,9 +870,10 @@ namespace cv { namespace gpu { namespace surf
 
     // Spawn 16 blocks per interest point
     // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
-    __global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)
+    __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
     {
         // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
+        __shared__ float s_PATCH[6][6];
         __shared__ float sdx[25];
         __shared__ float sdy[25];
 
@@ -1165,7 +885,7 @@ namespace cv { namespace gpu { namespace surf
 
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
-        calc_dx_dy_old(sdx, sdy, features, tid);
+        calc_dx_dy(s_PATCH, sdx, sdy, features, tid);
         __syncthreads();
 
         if (sdy[tid] >= 0)
@@ -1184,7 +904,7 @@ namespace cv { namespace gpu { namespace surf
         }
         __syncthreads();
 
-        reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid);
+        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
         __syncthreads();
 
         float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
@@ -1215,7 +935,7 @@ namespace cv { namespace gpu { namespace surf
         }
         __syncthreads();
 
-        reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid);
+        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
         __syncthreads();
 
         // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
@@ -1228,13 +948,56 @@ namespace cv { namespace gpu { namespace surf
         }
     }
 
-    void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
+    template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
+    {
+        // no need for thread ID
+        float* descriptor_base = descriptors.ptr(blockIdx.x);
+
+        // read in the unnormalized descriptor values (squared)
+        __shared__ float sqDesc[BLOCK_DIM_X];
+        const float lookup = descriptor_base[threadIdx.x];
+        sqDesc[threadIdx.x] = lookup * lookup;
+        __syncthreads();
+
+        if (BLOCK_DIM_X >= 128)
+        {
+            if (threadIdx.x < 64)
+                sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
+            __syncthreads();
+        }
+
+        // reduction to get total
+        if (threadIdx.x < 32)
+        {
+            volatile float* smem = sqDesc;
+
+            smem[threadIdx.x] += smem[threadIdx.x + 32];
+            smem[threadIdx.x] += smem[threadIdx.x + 16];
+            smem[threadIdx.x] += smem[threadIdx.x + 8];
+            smem[threadIdx.x] += smem[threadIdx.x + 4];
+            smem[threadIdx.x] += smem[threadIdx.x + 2];
+            smem[threadIdx.x] += smem[threadIdx.x + 1];
+        }
+
+        // compute length (square root)
+        __shared__ float len;
+        if (threadIdx.x == 0)
+        {
+            len = sqrtf(sqDesc[0]);
+        }
+        __syncthreads();
+
+        // normalize and store in output
+        descriptor_base[threadIdx.x] = lookup / len;
+    }
+
+    void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
     {
         // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
         
         if (descriptors.cols == 64)
         {
-            compute_descriptors64_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
+            compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaThreadSynchronize() );
@@ -1246,7 +1009,7 @@ namespace cv { namespace gpu { namespace surf
         }
         else
         {
-            compute_descriptors128_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);            
+            compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);            
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaThreadSynchronize() );
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp
index c16de76..ffdaabb 100644
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -48,123 +48,93 @@ using namespace std;
 
 #if !defined (HAVE_CUDA)
 
+cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); }
+cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float) { throw_nogpu(); }
 int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
 void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
 void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }
 void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
 void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }
-void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
 namespace cv { namespace gpu { namespace surf
 {
-    dim3 calcBlockSize(int nIntervals);
-    
-    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
-    void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld);
-    
-    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, 
-        int x_size, int y_size, bool use_mask, const dim3& threads);
-    
-    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, 
-        KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
-    
-    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);
-    
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);
+
+    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+        int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
+
+    void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter);
+
+    void icvCalcOrientation_gpu(const KeyPoint_GPU* featureBuffer, int nFeatures, KeyPoint_GPU* keypoints, unsigned int* keypointCounter);
+
     void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
-    void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
 }}}
 
 using namespace cv::gpu::surf;
 
 namespace
 {
-    class SURF_GPU_Invoker : private SURFParams_GPU
+    class SURF_GPU_Invoker : private CvSURFParams
     {
     public:
-        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) : 
-            SURFParams_GPU(surf),
+        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
+            CvSURFParams(surf),
 
-            sum(surf.sum), sumf(surf.sumf),
+            sum(surf.sum), mask1(surf.mask1), maskSum(surf.maskSum), intBuffer(surf.intBuffer), det(surf.det), trace(surf.trace),
 
-            mask1(surf.mask1), maskSum(surf.maskSum),
-
-            hessianBuffer(surf.hessianBuffer), 
-            maxPosBuffer(surf.maxPosBuffer), 
-            featuresBuffer(surf.featuresBuffer),
+            maxPosBuffer(surf.maxPosBuffer), featuresBuffer(surf.featuresBuffer), keypointsBuffer(surf.keypointsBuffer),
 
             img_cols(img.cols), img_rows(img.rows),
 
-            use_mask(!mask.empty()),
-
-            mask_width(0), mask_height(0),
-
-            featureCounter(0), maxCounter(0)
+            use_mask(!mask.empty())
         {
             CV_Assert(!img.empty() && img.type() == CV_8UC1);
             CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
-            CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);
-            CV_Assert(DeviceInfo().supports(GLOBAL_ATOMICS));
-
-            max_features = static_cast<int>(img.size().area() * featuresRatio);
-            max_candidates = static_cast<int>(1.5 * max_features);
-
-            CV_Assert(max_features > 0);
+            CV_Assert(nOctaves > 0 && nOctaveLayers > 0);
+            CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
 
-            featuresBuffer.create(1, max_features, CV_32FC(6));
-            maxPosBuffer.create(1, max_candidates, CV_32SC4);
+            maxKeypoints = static_cast<int>(img.size().area() * surf.keypointsRatio);
+            maxFeatures = static_cast<int>(1.5 * maxKeypoints);
+            maxCandidates = static_cast<int>(1.5 * maxFeatures);
 
-            mask_width = l2 * 0.5f;
-            mask_height = 1.0f + l1;
+            CV_Assert(maxKeypoints > 0);
+            
+            cudaSafeCall( cudaMalloc((void**)&d_counters, (nOctaves + 2) * sizeof(unsigned int)) );
+            cudaSafeCall( cudaMemset(d_counters, 0, (nOctaves + 2) * sizeof(unsigned int)) );
 
-            // Dxy gap half-width
-            float dxy_center_offset = 0.5f * (l4 + l3);
-            // Dxy squares half-width
-            float dxy_half_width = 0.5f * l3;
+            uploadConstant("cv::gpu::surf::c_max_candidates",    maxCandidates);
+            uploadConstant("cv::gpu::surf::c_max_features",      maxFeatures);
+            uploadConstant("cv::gpu::surf::c_max_keypoints",     maxKeypoints);
+            uploadConstant("cv::gpu::surf::c_img_rows",          img_rows);
+            uploadConstant("cv::gpu::surf::c_img_cols",          img_cols);
+            uploadConstant("cv::gpu::surf::c_nOctaveLayers",     nOctaveLayers);
+            uploadConstant("cv::gpu::surf::c_hessianThreshold",  static_cast<float>(hessianThreshold));
 
-            // rescale edge_scale to fit with the filter dimensions
-            float dxy_scale = edgeScale * std::pow((2.f + 2.f * l1) * l2 / (4.f * l3 * l3), 2.f);
-            
-            // Compute border required such that the filters don't overstep the image boundaries	        
-	        float smax0 = 2.0f * initialScale + 0.5f;
-            int border0 = static_cast<int>(std::ceil(smax0 * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
-
-            int width0 = (img_cols - 2 * border0) / initialStep;
-            int height0 = (img_rows - 2 * border0) / initialStep;
-
-            uploadConstant("cv::gpu::surf::c_max_candidates",    max_candidates);
-            uploadConstant("cv::gpu::surf::c_max_features",      max_features);
-            uploadConstant("cv::gpu::surf::c_nIntervals",        nIntervals);
-            uploadConstant("cv::gpu::surf::c_mask_width",        mask_width);
-            uploadConstant("cv::gpu::surf::c_mask_height",       mask_height);
-            uploadConstant("cv::gpu::surf::c_dxy_center_offset", dxy_center_offset);
-            uploadConstant("cv::gpu::surf::c_dxy_half_width",    dxy_half_width);
-            uploadConstant("cv::gpu::surf::c_dxy_scale",         dxy_scale);
-            uploadConstant("cv::gpu::surf::c_initialScale",      initialScale);
-            uploadConstant("cv::gpu::surf::c_threshold",         threshold);
-            
-            hessianBuffer.create(height0 * nIntervals, width0, CV_32F);
+            bindTexture("cv::gpu::surf::imgTex", (DevMem2D)img);
 
-            integral(img, sum);
-            sum.convertTo(sumf, CV_32F, 1.0 / 255.0);
-            
-            bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sumf);
+            integralBuffered(img, sum, intBuffer);
+            bindTexture("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);
 
-            if (!mask.empty())
-		    {
+            if (use_mask)
+            {
                 min(mask, 1.0, mask1);
-                integral(mask1, maskSum);
-            
-                bindTexture("cv::gpu::surf::maskSumTex", (DevMem2Di)maskSum);
-		    }
+                integralBuffered(mask1, maskSum, intBuffer);
+
+                bindTexture("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
+            }
         }
 
         ~SURF_GPU_Invoker()
         {
+            cudaSafeCall( cudaFree(d_counters) );
+
+            unbindTexture("cv::gpu::surf::imgTex");
             unbindTexture("cv::gpu::surf::sumTex");
             if (use_mask)
                 unbindTexture("cv::gpu::surf::maskSumTex");
@@ -172,102 +142,115 @@ namespace
 
         void detectKeypoints(GpuMat& keypoints)
         {
-            typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
-            const fasthessian_t fasthessian = 
-                DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;
+            ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, det);
+            ensureSizeIsEnough(img_rows * (nOctaveLayers + 2), img_cols, CV_32FC1, trace);
+            
+            ensureSizeIsEnough(1, maxCandidates, CV_32SC4, maxPosBuffer);
+            ensureSizeIsEnough(1, maxFeatures, CV_32FC(6), featuresBuffer);
 
-            dim3 threads = calcBlockSize(nIntervals);
-            for(int octave = 0; octave < nOctaves; ++octave)
+            for (int octave = 0; octave < nOctaves; ++octave)
             {
-                int step = initialStep * (1 << octave);
-
-                // Compute border required such that the filters don't overstep the image boundaries
-                float d = (initialScale * (1 << octave)) / (nIntervals - 2);
-	            float smax = initialScale * (1 << octave) + d * (nIntervals - 2.0f) + 0.5f;
-                int border = static_cast<int>(std::ceil(smax * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
-                
-                int x_size = (img_cols - 2 * border) / step;
-                int y_size = (img_rows - 2 * border) / step;
-                
-                if (x_size <= 0 || y_size <= 0)
-                    break;
-
-                uploadConstant("cv::gpu::surf::c_octave", octave);
-                uploadConstant("cv::gpu::surf::c_x_size", x_size);
-                uploadConstant("cv::gpu::surf::c_y_size", y_size);
-                uploadConstant("cv::gpu::surf::c_border", border);
-                uploadConstant("cv::gpu::surf::c_step",   step);
-
-                fasthessian(hessianBuffer, x_size, y_size, threads);
-
-                // Reset the candidate count.
-                maxCounter = 0;
-
-                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads); 
-                
-                maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
+                const int layer_rows = img_rows >> octave;
+                const int layer_cols = img_cols >> octave;
+
+                uploadConstant("cv::gpu::surf::c_octave",     octave);
+                uploadConstant("cv::gpu::surf::c_layer_rows", layer_rows);
+                uploadConstant("cv::gpu::surf::c_layer_cols", layer_cols);
+
+                icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);
+
+                icvFindMaximaInLayer_gpu(det, trace, maxPosBuffer.ptr<int4>(), d_counters + 2 + octave,
+                    img_rows, img_cols, octave, use_mask, nOctaveLayers);
+
+                unsigned int maxCounter;
+                cudaSafeCall( cudaMemcpy(&maxCounter, d_counters + 2 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+                maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
 
                 if (maxCounter > 0)
                 {
-                    fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
-                        featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
-
-                    featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
+                    icvInterpolateKeypoint_gpu(det, maxPosBuffer.ptr<int4>(), maxCounter, 
+                        featuresBuffer.ptr<KeyPoint_GPU>(), d_counters);
                 }
             }
+            unsigned int featureCounter;
+            cudaSafeCall( cudaMemcpy(&featureCounter, d_counters, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+            featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
 
-            if (featureCounter > 0)
-                featuresBuffer.colRange(0, featureCounter).copyTo(keypoints);
-            else
-                keypoints.release();
+            findOrientation(featuresBuffer.colRange(0, featureCounter), keypoints);
         }
 
-        void findOrientation(GpuMat& keypoints)
+        void findOrientation(const GpuMat& features, GpuMat& keypoints)
         {
-            if (keypoints.cols > 0)
-                find_orientation_gpu(keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
+            if (features.cols > 0)
+            {
+                ensureSizeIsEnough(1, maxKeypoints, CV_32FC(6), keypointsBuffer);
+
+                icvCalcOrientation_gpu(features.ptr<KeyPoint_GPU>(), features.cols, keypointsBuffer.ptr<KeyPoint_GPU>(), 
+                    d_counters + 1);
+
+                unsigned int keypointsCounter;
+                cudaSafeCall( cudaMemcpy(&keypointsCounter, d_counters + 1, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+                keypointsCounter = std::min(keypointsCounter, static_cast<unsigned int>(maxKeypoints));
+
+                if (keypointsCounter > 0)
+                    keypointsBuffer.colRange(0, keypointsCounter).copyTo(keypoints);
+                else
+                    keypoints.release();
+            }
         }
 
         void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
         {
-            typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors, 
-                const KeyPoint_GPU* features, int nFeatures);
-
-            const compute_descriptors_t compute_descriptors = compute_descriptors_gpu_old;
-                //DeviceInfo().supports(FEATURE_SET_COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;
-
             if (keypoints.cols > 0)
             {
                 descriptors.create(keypoints.cols, descriptorSize, CV_32F);
-                compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
+                compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
             }
         }
 
     private:
         GpuMat& sum;
-        GpuMat& sumf;
-
         GpuMat& mask1;
         GpuMat& maskSum;
+        GpuMat& intBuffer;
+
+        GpuMat& det;
+        GpuMat& trace;
 
-        GpuMat& hessianBuffer;
         GpuMat& maxPosBuffer;
         GpuMat& featuresBuffer;
+        GpuMat& keypointsBuffer;
 
         int img_cols, img_rows;
 
         bool use_mask;
-        
-        float mask_width, mask_height;
 
-        unsigned int featureCounter;
-        unsigned int maxCounter;
+        int maxCandidates;
+        int maxFeatures;
+        int maxKeypoints;
 
-        int max_candidates;
-        int max_features;
+        unsigned int* d_counters;
     };
 }
 
+cv::gpu::SURF_GPU::SURF_GPU()
+{
+    hessianThreshold = 100;
+    extended = 1;
+    nOctaves = 4;
+    nOctaveLayers = 2;
+    keypointsRatio = 0.01f;
+}
+
+cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio)
+{
+    hessianThreshold = _threshold;
+    extended = _extended;
+    nOctaves = _nOctaves;
+    nOctaveLayers = _nOctaveLayers;
+    keypointsRatio = _keypointsRatio;
+}
+
 int cv::gpu::SURF_GPU::descriptorSize() const
 {
     return extended ? 128 : 64;
@@ -281,27 +264,64 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa
     {
         Mat keypointsCPU(1, keypoints.size(), CV_32FC(6));
 
-        const KeyPoint* keypoints_ptr = &keypoints[0];
-        KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
-        for (size_t i = 0; i < keypoints.size(); ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
+        for (size_t i = 0; i < keypoints.size(); ++i)
         {
-            const KeyPoint& kp = *keypoints_ptr;
-            KeyPoint_GPU& gkp = *keypointsCPU_ptr;
+            const KeyPoint& kp = keypoints[i];
+            KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];
 
             gkp.x = kp.pt.x;
             gkp.y = kp.pt.y;
 
+            gkp.laplacian = 1.0f;
+
             gkp.size = kp.size;
 
-            gkp.octave = static_cast<float>(kp.octave);
-            gkp.angle = kp.angle;
-            gkp.response = kp.response;
+            gkp.dir = kp.angle;
+            gkp.hessian = kp.response;
         }
 
         keypointsGPU.upload(keypointsCPU);
     }
 }
 
+namespace
+{
+    int calcSize(int octave, int layer)
+    {
+        /* Wavelet size at first layer of first octave. */
+        const int HAAR_SIZE0 = 9;
+
+        /* Wavelet size increment between layers. This should be an even number,
+         such that the wavelet sizes in an octave are either all even or all odd.
+         This ensures that when looking for the neighbours of a sample, the layers
+         above and below are aligned correctly. */
+        const int HAAR_SIZE_INC = 6;
+
+        return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
+    }
+
+    int getPointOctave(const KeyPoint_GPU& kpt, const CvSURFParams& params)
+    {
+        int best_octave = 0;
+        float min_diff = numeric_limits<float>::max();
+        for (int octave = 1; octave < params.nOctaves; ++octave)
+        {
+            for (int layer = 0; layer < params.nOctaveLayers; ++layer)
+            {
+                float diff = std::abs(kpt.size - (float)calcSize(octave, layer));
+                if (min_diff > diff)
+                {
+                    min_diff = diff;
+                    best_octave = octave;
+                    if (min_diff == 0)
+                        return best_octave;
+                }
+            }
+        }
+        return best_octave;
+    }
+}
+
 void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints)
 {
     if (keypointsGPU.empty())
@@ -313,21 +333,23 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
         Mat keypointsCPU = keypointsGPU;
         keypoints.resize(keypointsGPU.cols);
 
-        KeyPoint* keypoints_ptr = &keypoints[0];
-        const KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
-        for (int i = 0; i < keypointsGPU.cols; ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
+        for (int i = 0; i < keypointsGPU.cols; ++i)
         {
-            KeyPoint& kp = *keypoints_ptr;
-            const KeyPoint_GPU& gkp = *keypointsCPU_ptr;
+            KeyPoint& kp = keypoints[i];
+            const KeyPoint_GPU& gkp = keypointsCPU.ptr<KeyPoint_GPU>()[i];
 
             kp.pt.x = gkp.x;
             kp.pt.y = gkp.y;
 
             kp.size = gkp.size;
 
-            kp.octave = static_cast<int>(gkp.octave);
-            kp.angle = gkp.angle;
-            kp.response = gkp.response;
+            kp.angle = gkp.dir;
+
+            kp.response = gkp.hessian;
+
+            kp.octave = getPointOctave(gkp, *this);
+
+            kp.class_id = static_cast<int>(gkp.laplacian);
         }
     }
 }
@@ -353,23 +375,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat
         SURF_GPU_Invoker surf(*this, img, mask);
 
         surf.detectKeypoints(keypoints);
-
-        surf.findOrientation(keypoints);
     }
 }
 
 void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, 
-                                   bool useProvidedKeypoints, bool calcOrientation)
+                                   bool useProvidedKeypoints)
 {
     if (!img.empty())
     {
         SURF_GPU_Invoker surf(*this, img, mask);
-        
+    
         if (!useProvidedKeypoints)
             surf.detectKeypoints(keypoints);
-        
-        if (calcOrientation)
-            surf.findOrientation(keypoints);
+        else
+        {
+            GpuMat keypointsBuf;
+            surf.findOrientation(keypoints, keypointsBuf);
+            keypointsBuf.copyTo(keypoints);
+        }
 
         surf.computeDescriptors(keypoints, descriptors, descriptorSize());
     }
@@ -385,24 +408,24 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector
 }
 
 void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, 
-    GpuMat& descriptors, bool useProvidedKeypoints, bool calcOrientation)
+    GpuMat& descriptors, bool useProvidedKeypoints)
 {
     GpuMat keypointsGPU;
 
     if (useProvidedKeypoints)
         uploadKeypoints(keypoints, keypointsGPU);    
 
-    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);
+    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
 
     downloadKeypoints(keypointsGPU, keypoints);
 }
 
 void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, 
-    vector<float>& descriptors, bool useProvidedKeypoints, bool calcOrientation)
+    vector<float>& descriptors, bool useProvidedKeypoints)
 {
     GpuMat descriptorsGPU;
 
-    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);
+    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
 
     downloadDescriptors(descriptorsGPU, descriptors);
 }
diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp
index c0f254e..12f7196 100644
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@@ -48,7 +48,6 @@ using namespace std;
 
 const string FEATURES2D_DIR = "features2d";
 const string IMAGE_FILENAME = "aloe.png";
-const string VALID_FILE_NAME = "surf.xml.gz";
 
 class CV_GPU_SURFTest : public cvtest::BaseTest
 {
@@ -59,17 +58,20 @@ public:
 
 protected:
     bool isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2);
+    int getValidCount(const vector<KeyPoint>& keypoints1, const vector<KeyPoint>& keypoints2, const vector<DMatch>& matches);
     void compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints,
                              const Mat& validDescriptors, const Mat& calcDescriptors);
 
-    void emptyDataTest(SURF_GPU& fdetector);
-    void regressionTest(SURF_GPU& fdetector);
+    void emptyDataTest();
+    void accuracyTest();
 
     virtual void run(int);
 };
 
-void CV_GPU_SURFTest::emptyDataTest(SURF_GPU& fdetector)
+void CV_GPU_SURFTest::emptyDataTest()
 {
+    SURF_GPU fdetector;
+
     GpuMat image;
     vector<KeyPoint> keypoints;
     vector<float> descriptors;
@@ -114,116 +116,80 @@ bool CV_GPU_SURFTest::isSimilarKeypoints(const KeyPoint& p1, const KeyPoint& p2)
             p1.class_id == p2.class_id );
 }
 
-void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints, 
-                                          const Mat& validDescriptors, const Mat& calcDescriptors)
+int CV_GPU_SURFTest::getValidCount(const vector<KeyPoint>& keypoints1, const vector<KeyPoint>& keypoints2,
+                     const vector<DMatch>& matches)
 {
-    if (validKeypoints.size() != calcKeypoints.size())
+    int count = 0;
+
+    for (size_t i = 0; i < matches.size(); ++i)
     {
-        ts->printf(cvtest::TS::LOG, "Keypoints sizes doesn't equal (validCount = %d, calcCount = %d).\n",
-                   validKeypoints.size(), calcKeypoints.size());
-        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
-        return;
+        const DMatch& m = matches[i];
+
+        const KeyPoint& kp1 = keypoints1[m.queryIdx];
+        const KeyPoint& kp2 = keypoints2[m.trainIdx];
+
+        if (isSimilarKeypoints(kp1, kp2))
+            ++count;
     }
-    if (validDescriptors.size() != calcDescriptors.size())
+
+    return count;
+}
+
+void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints, const vector<KeyPoint>& calcKeypoints, 
+                                          const Mat& validDescriptors, const Mat& calcDescriptors)
+{
+    BruteForceMatcher< L2<float> > matcher;
+    vector<DMatch> matches;
+
+    matcher.match(validDescriptors, calcDescriptors, matches);
+
+    int validCount = getValidCount(validKeypoints, calcKeypoints, matches);
+    float validRatio = (float)validCount / matches.size();
+
+    if (validRatio < 0.5f)
     {
-        ts->printf(cvtest::TS::LOG, "Descriptors sizes doesn't equal.\n");
-        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+        ts->printf(cvtest::TS::LOG, "Bad accuracy - %f.\n", validRatio);
+        ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
         return;
     }
-    for (size_t v = 0; v < validKeypoints.size(); v++)
-    {
-        int nearestIdx = -1;
-        float minDist = std::numeric_limits<float>::max();
-
-        for (size_t c = 0; c < calcKeypoints.size(); c++)
-        {
-            float curDist = (float)norm(calcKeypoints[c].pt - validKeypoints[v].pt);
-            if (curDist < minDist)
-            {
-                minDist = curDist;
-                nearestIdx = c;
-            }
-        }
-
-        assert(minDist >= 0);
-        if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))
-        {
-            ts->printf(cvtest::TS::LOG, "Bad keypoints accuracy.\n");
-            ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
-            return;
-        }
-
-        if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)
-        {
-            ts->printf(cvtest::TS::LOG, "Bad descriptors accuracy.\n");
-            ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY );
-            return;
-        }
-    }
 }
 
-void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector)
+void CV_GPU_SURFTest::accuracyTest()
 {
     string imgFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + IMAGE_FILENAME;
-    string resFilename = string(ts->get_data_path()) + FEATURES2D_DIR + "/" + VALID_FILE_NAME;
 
     // Read the test image.
-    GpuMat image(imread(imgFilename, 0));
+    Mat image = imread(imgFilename, 0);
     if (image.empty())
     {
         ts->printf( cvtest::TS::LOG, "Image %s can not be read.\n", imgFilename.c_str() );
         ts->set_failed_test_info( cvtest::TS::FAIL_INVALID_TEST_DATA );
         return;
     }
-
-    FileStorage fs(resFilename, FileStorage::READ);
+    
+    Mat mask(image.size(), CV_8UC1, Scalar::all(1));
+    mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0));
 
     // Compute keypoints.
-    GpuMat mask(image.size(), CV_8UC1, Scalar::all(1));
-    mask(Range(0, image.rows / 2), Range(0, image.cols / 2)).setTo(Scalar::all(0));
     vector<KeyPoint> calcKeypoints;
-    GpuMat calcDespcriptors;
-    fdetector(image, mask, calcKeypoints, calcDespcriptors);
-
-    if (fs.isOpened()) // Compare computed and valid keypoints.
-    {
-        // Read validation keypoints set.
-        vector<KeyPoint> validKeypoints;
-        Mat validDespcriptors;
-        read(fs["keypoints"], validKeypoints);
-        read(fs["descriptors"], validDespcriptors);
-        if (validKeypoints.empty() || validDespcriptors.empty())
-        {
-            ts->printf(cvtest::TS::LOG, "Validation file can not be read.\n");
-            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
-            return;
-        }
-
-        compareKeypointSets(validKeypoints, calcKeypoints, validDespcriptors, calcDespcriptors);
-    }
-    else // Write detector parameters and computed keypoints as validation data.
-    {
-        fs.open(resFilename, FileStorage::WRITE);
-        if (!fs.isOpened())
-        {
-            ts->printf(cvtest::TS::LOG, "File %s can not be opened to write.\n", resFilename.c_str());
-            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
-            return;
-        }
-        else
-        {
-            write(fs, "keypoints", calcKeypoints);
-            write(fs, "descriptors", (Mat)calcDespcriptors);
-        }
-    }
+    GpuMat calcDescriptors;
+    SURF_GPU fdetector; fdetector.extended = false;
+    fdetector(GpuMat(image), GpuMat(mask), calcKeypoints, calcDescriptors);
+
+    // Calc validation keypoints set.
+    vector<KeyPoint> validKeypoints;
+    vector<float> validDescriptors;
+    SURF fdetector_gold; fdetector_gold.extended = false;
+    fdetector_gold(image, mask, validKeypoints, validDescriptors);
+
+    compareKeypointSets(validKeypoints, calcKeypoints, 
+        Mat(validKeypoints.size(), fdetector_gold.descriptorSize(), CV_32F, &validDescriptors[0]), calcDescriptors);
 }
 
 void CV_GPU_SURFTest::run( int /*start_from*/ )
 {
-    SURF_GPU fdetector;
-
-    emptyDataTest(fdetector);
-    regressionTest(fdetector);
+    emptyDataTest();
+    accuracyTest();
 }
 
-TEST(SURF, empty_data_and_regression) { CV_GPU_SURFTest test; test.safe_run(); }
+TEST(SURF, empty_data_and_accuracy) { CV_GPU_SURFTest test; test.safe_run(); }
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 34af5ae..18b45f2 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -264,10 +264,11 @@ TEST(SURF)
 
     SURF surf;
     vector<KeyPoint> keypoints1, keypoints2;
+    vector<float> descriptors1, descriptors2;
 
     CPU_ON;
-    surf(src1, Mat(), keypoints1);
-    surf(src2, Mat(), keypoints2);
+    surf(src1, Mat(), keypoints1, descriptors1);
+    surf(src2, Mat(), keypoints2, descriptors2);
     CPU_OFF;
 
     gpu::SURF_GPU d_surf;
@@ -275,8 +276,8 @@ TEST(SURF)
     gpu::GpuMat d_descriptors1, d_descriptors2;
 
     GPU_ON;
-    d_surf(d_src1, gpu::GpuMat(), d_keypoints1);
-    d_surf(d_src2, gpu::GpuMat(), d_keypoints2);
+    d_surf(d_src1, gpu::GpuMat(), d_keypoints1, d_descriptors1);
+    d_surf(d_src2, gpu::GpuMat(), d_keypoints2, d_descriptors2);
     GPU_OFF;
 }
 
diff --git a/samples/gpu/surf_keypoint_matcher.cpp b/samples/gpu/surf_keypoint_matcher.cpp
index b2c9385..eb8da5f 100644
--- a/samples/gpu/surf_keypoint_matcher.cpp
+++ b/samples/gpu/surf_keypoint_matcher.cpp
@@ -51,10 +51,10 @@ int main(int argc, char* argv[])
     vector<KeyPoint> keypoints1, keypoints2;
     vector<float> descriptors1, descriptors2;
     vector<DMatch> matches;
-    SURF_GPU::downloadKeypoints(keypoints1GPU, keypoints1);
-    SURF_GPU::downloadKeypoints(keypoints2GPU, keypoints2);
-    SURF_GPU::downloadDescriptors(descriptors1GPU, descriptors1);
-    SURF_GPU::downloadDescriptors(descriptors2GPU, descriptors2);
+    surf.downloadKeypoints(keypoints1GPU, keypoints1);
+    surf.downloadKeypoints(keypoints2GPU, keypoints2);
+    surf.downloadDescriptors(descriptors1GPU, descriptors1);
+    surf.downloadDescriptors(descriptors2GPU, descriptors2);
     BruteForceMatcher_GPU< L2<float> >::matchDownload(trainIdx, distance, matches);
 
     // drawing the results