From 8891acb67a12c50d42750b65ff32b1aa0cb26d4e Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 6 Dec 2010 12:06:51 +0000 Subject: [PATCH] added BruteForceMatcher_GPU --- modules/gpu/CMakeLists.txt | 2 +- modules/gpu/include/opencv2/gpu/gpu.hpp | 148 +++- modules/gpu/src/brute_force_matcher.cpp | 605 ++++++++++++++ modules/gpu/src/cuda/brute_force_matcher.cu | 1205 +++++++++++++++++++++++++++ tests/gpu/src/brute_force_matcher.cpp | 175 ++++ tests/gpu/src/gputest.hpp | 3 +- 6 files changed, 2135 insertions(+), 3 deletions(-) create mode 100644 modules/gpu/src/brute_force_matcher.cpp create mode 100644 modules/gpu/src/cuda/brute_force_matcher.cu create mode 100644 tests/gpu/src/brute_force_matcher.cpp diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 8181c3d..f41817b 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -1,6 +1,6 @@ set(name "gpu") -set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect") +set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann") set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index a8d3594..53a1a74 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -48,6 +48,7 @@ #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/objdetect/objdetect.hpp" #include "opencv2/gpu/devmem2d.hpp" +#include "opencv2/features2d/features2d.hpp" namespace cv { @@ -1118,7 +1119,152 @@ namespace cv // Gradients conputation results GpuMat grad, qangle; - }; + }; + + + ////////////////////////////////// BruteForceMatcher ////////////////////////////////// + + class CV_EXPORTS BruteForceMatcher_GPU_base + { + public: + enum DistType {L1Dist = 0, L2Dist}; + + explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist); + + // Add descriptors to train descriptor collection. + void add(const std::vector& descCollection); + + // Get train descriptors collection. + const std::vector& getTrainDescriptors() const; + + // Clear train descriptors collection. + void clear(); + + // Return true if there are not train descriptors in collection. + bool empty() const; + + // Return true if the matcher supports mask in match methods. + bool isMaskSupported() const; + + // Find one best match for each query descriptor. + // trainIdx.at(0, queryIdx) will contain best train index for queryIdx + // distance.at(0, queryIdx) will contain distance + void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, + const GpuMat& mask = GpuMat()); + + // Download trainIdx and distance to CPU vector with DMatch + static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector& matches); + + // Find one best match for each query descriptor. + void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector& matches, + const GpuMat& mask = GpuMat()); + + // Make gpu collection of trains and masks in suitable format for matchCollection function + void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, + const vector& masks = std::vector()); + + // Find one best match from train collection for each query descriptor. 
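        // A minimal usage sketch of the single-train path declared above (illustrative only, not
        // part of this patch). It assumes the query/train descriptors are CV_32F rows that have
        // already been uploaded to the GPU, and uses the cv::L2<float> functor from features2d:
        //
        //     cv::gpu::GpuMat queryDescsGPU(queryDescsCPU);   // nQuery x descSize, CV_32F
        //     cv::gpu::GpuMat trainDescsGPU(trainDescsCPU);   // nTrain x descSize, CV_32F
        //
        //     cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
        //     std::vector<cv::DMatch> matches;
        //     matcher.match(queryDescsGPU, trainDescsGPU, matches);  // one best match per query row
        //
        // The two-step form (matchSingle followed by matchDownload) gives the same result while
        // keeping trainIdx and distance on the GPU between the calls.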
+ // trainIdx.at(0, queryIdx) will contain best train index for queryIdx + // imgIdx.at(0, queryIdx) will contain best image index for queryIdx + // distance.at(0, queryIdx) will contain distance + void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, + const GpuMat& maskCollection); + + // Download trainIdx, imgIdx and distance to CPU vector with DMatch + static void matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, const GpuMat& distance, + std::vector& matches); + + // Find one best match from train collection for each query descriptor. + void match(const GpuMat& queryDescs, std::vector& matches, + const std::vector& masks = std::vector()); + + // Find k best matches for each query descriptor (in increasing order of distances). + // trainIdx.at(queryIdx, i) will contain index of i'th best trains (i < k). + // distance.at(queryIdx, i) will contain distance. + // allDist is a buffer to store all distance between query descriptors and train descriptors + // it have size (nQuery,nTrain) and CV_32F type + // allDist.at(queryIdx, trainIdx) will contain FLT_MAX, if trainIdx is one from k best, + // otherwise it will contain distance between queryIdx and trainIdx descriptors + void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat()); + + // Download trainIdx and distance to CPU vector with DMatch + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + + // Find k best matches for each query descriptor (in increasing order of distances). + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + std::vector< std::vector >& matches, int k, const GpuMat& mask = GpuMat(), + bool compactResult = false); + + // Find k best matches for each query descriptor (in increasing order of distances). + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + void knnMatch(const GpuMat& queryDescs, std::vector< std::vector >& matches, int knn, + const std::vector& masks = std::vector(), bool compactResult = false ); + + // Find best matches for each query descriptor which have distance less than maxDistance. + // nMatches.at(0, queruIdx) will contain matches count for queryIdx. + // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches, + // because it didn't have enough memory. 
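        // Illustrative k-NN usage (a sketch, not part of this patch): request the two nearest
        // neighbours per query descriptor and keep only matches that pass a distance-ratio test.
        // queryDescsGPU and trainDescsGPU are the same hypothetical inputs as in the sketch above.
        //
        //     cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
        //     std::vector< std::vector<cv::DMatch> > knn;
        //     matcher.knnMatch(queryDescsGPU, trainDescsGPU, knn, 2);
        //
        //     std::vector<cv::DMatch> good;
        //     for (size_t i = 0; i < knn.size(); ++i)
        //         if (knn[i].size() == 2 && knn[i][0].distance < 0.8f * knn[i][1].distance)
        //             good.push_back(knn[i][0]);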
+ // trainIdx.at(queruIdx, i) will contain ith train index (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) + // distance.at(queruIdx, i) will contain ith distance (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) + // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x nTrain, + // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches + // Matches doesn't sorted. + void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, + const GpuMat& mask = GpuMat()); + + // Download trainIdx, nMatches and distance to CPU vector with DMatch. + // matches will be sorted in increasing order of distances. + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, const GpuMat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + + // Find best matches for each query descriptor which have distance less than maxDistance + // in increasing order of distances). + void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + std::vector< std::vector >& matches, float maxDistance, + const GpuMat& mask = GpuMat(), bool compactResult = false); + + // Find best matches from train collection for each query descriptor which have distance less than + // maxDistance (in increasing order of distances). + void radiusMatch(const GpuMat& queryDescs, std::vector< std::vector >& matches, float maxDistance, + const std::vector& masks = std::vector(), bool compactResult = false); + + private: + DistType distType; + + std::vector trainDescCollection; + }; + + template + class CV_EXPORTS BruteForceMatcher_GPU; + + template + class CV_EXPORTS BruteForceMatcher_GPU< L1 > : public BruteForceMatcher_GPU_base + { + public: + explicit BruteForceMatcher_GPU(L1 d = L1()) : BruteForceMatcher_GPU_base(L1Dist) {} + }; + template + class CV_EXPORTS BruteForceMatcher_GPU< L2 > : public BruteForceMatcher_GPU_base + { + public: + explicit BruteForceMatcher_GPU(L2 d = L2()) : BruteForceMatcher_GPU_base(L2Dist) {} + }; } diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp new file mode 100644 index 0000000..1dd3c0a --- /dev/null +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -0,0 +1,605 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::gpu; +using namespace std; + +#if !defined (HAVE_CUDA) + +cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::add(const vector&) { throw_nogpu(); } +const vector& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const { throw_nogpu(); return trainDescCollection; } +void cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); } +bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return true; } +bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; } +void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, GpuMat&, const GpuMat&, std::vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector&, const std::vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, int, const GpuMat&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector >&, int, const std::vector&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&) { throw_nogpu(); } +void 
cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, float, const GpuMat&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector >&, float, const std::vector&, bool) { throw_nogpu(); } + +#else /* !defined (HAVE_CUDA) */ + +namespace cv { namespace gpu { namespace bfmatcher +{ + template + void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + template + void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + template + void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance); + template + void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance); + + template + void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template + void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + + template + void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template + void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); +}}} + +cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_) +{ +} + +//////////////////////////////////////////////////////////////////// +// Train collection + +void cv::gpu::BruteForceMatcher_GPU_base::add(const vector& descCollection) +{ + trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end()); +} + +const vector& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const +{ + return trainDescCollection; +} + +void cv::gpu::BruteForceMatcher_GPU_base::clear() +{ + trainDescCollection.clear(); +} + +bool cv::gpu::BruteForceMatcher_GPU_base::empty() const +{ + return trainDescCollection.empty(); +} + +bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const +{ + return true; +} + +//////////////////////////////////////////////////////////////////// +// Match + +void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask) +{ + using namespace cv::gpu::bfmatcher; + + typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + + static const match_caller_t 
match_callers[2][8] = + { + { + matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, + matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, 0, 0 + }, + { + matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, + matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, 0, 0 + } + }; + + CV_Assert(queryDescs.channels() == 1); + CV_Assert(trainDescs.cols == queryDescs.cols && trainDescs.type() == queryDescs.type()); + + const int nQuery = queryDescs.rows; + + trainIdx.create(1, nQuery, CV_32S); + distance.create(1, nQuery, CV_32F); + + match_caller_t func = match_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + // For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx. + // trainIdx store after imgIdx, so we doesn't lose it value. + func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance); +} + +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, + vector& matches) +{ + const int nQuery = trainIdx.cols; + + Mat trainIdxCPU = trainIdx; + Mat distanceCPU = distance; + + matches.clear(); + matches.reserve(nQuery); + + const int* trainIdx_ptr = trainIdxCPU.ptr(); + const float* distance_ptr = distanceCPU.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + if (trainIdx == -1) + continue; + + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, 0, distance); + + matches.push_back(m); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, const GpuMat& trainDescs, + vector& matches, const GpuMat& mask) +{ + GpuMat trainIdx, distance; + matchSingle(queryDescs, trainDescs, trainIdx, distance, mask); + matchDownload(trainIdx, distance, matches); +} + +void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, + const vector& masks) +{ + if (masks.empty()) + { + Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D))); + + for (size_t i = 0; i < trainDescCollection.size(); ++i) + { + const GpuMat& trainDescs = trainDescCollection[i]; + + trainCollectionCPU.ptr(0)[i] = trainDescs; + } + + trainCollection.upload(trainCollectionCPU); + } + else + { + CV_Assert(masks.size() == trainDescCollection.size()); + + Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D))); + Mat maskCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(PtrStep))); + + for (size_t i = 0; i < trainDescCollection.size(); ++i) + { + const GpuMat& trainDescs = trainDescCollection[i]; + const GpuMat& mask = masks[i]; + + CV_Assert(mask.empty() || (mask.type() == CV_8UC1)); + + trainCollectionCPU.ptr(0)[i] = trainDescs; + + maskCollectionCPU.ptr(0)[i] = static_cast(mask); + } + + trainCollection.upload(trainCollectionCPU); + maskCollection.upload(maskCollectionCPU); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection) +{ + using namespace cv::gpu::bfmatcher; + + typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance); + + static const match_caller_t match_callers[2][8] = + { + { + matchCollectionL1_gpu, matchCollectionL1_gpu, + matchCollectionL1_gpu, matchCollectionL1_gpu, + matchCollectionL1_gpu, 
matchCollectionL1_gpu, 0, 0 + }, + { + matchCollectionL2_gpu, matchCollectionL2_gpu, + matchCollectionL2_gpu, matchCollectionL2_gpu, + matchCollectionL2_gpu, matchCollectionL2_gpu, 0, 0 + } + }; + + CV_Assert(queryDescs.channels() == 1); + + const int nQuery = queryDescs.rows; + + trainIdx.create(1, nQuery, CV_32S); + imgIdx.create(1, nQuery, CV_32S); + distance.create(1, nQuery, CV_32F); + + match_caller_t func = match_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance); +} + +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, + const GpuMat& distance, vector& matches) +{ + const int nQuery = trainIdx.cols; + + Mat trainIdxCPU = trainIdx; + Mat imgIdxCPU = imgIdx; + Mat distanceCPU = distance; + + matches.clear(); + matches.reserve(nQuery); + + const int* trainIdx_ptr = trainIdxCPU.ptr(); + const int* imgIdx_ptr = imgIdxCPU.ptr(); + const float* distance_ptr = distanceCPU.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + if (trainIdx == -1) + continue; + + int imgIdx = *imgIdx_ptr; + + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, imgIdx, distance); + + matches.push_back(m); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector& matches, + const vector& masks) +{ + GpuMat trainCollection; + GpuMat maskCollection; + + makeGpuCollection(trainCollection, maskCollection, masks); + + GpuMat trainIdx, imgIdx, distance; + + matchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, maskCollection); + matchDownload(trainIdx, imgIdx, distance, matches); +} + +//////////////////////////////////////////////////////////////////// +// KnnMatch + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask) +{ + using namespace cv::gpu::bfmatcher; + + typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + + static const match_caller_t match_callers[2][8] = + { + { + knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, + knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, 0, 0 + }, + { + knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, + knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, 0, 0 + } + }; + + CV_Assert(queryDescs.channels() == 1); + + const int nQuery = queryDescs.rows; + const int nTrain = trainDescs.rows; + + trainIdx.create(nQuery, k, CV_32S); + trainIdx.setTo(Scalar::all(-1)); + distance.create(nQuery, k, CV_32F); + + allDist.create(nQuery, nTrain, CV_32F); + + match_caller_t func = match_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist); +} + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, + vector< vector >& matches, bool compactResult) +{ + const int nQuery = distance.rows; + const int k = trainIdx.cols; + + Mat trainIdxCPU = trainIdx; + Mat distanceCPU = distance; + + matches.clear(); + matches.reserve(nQuery); + + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + matches.push_back(vector()); + vector& curMatches = matches.back(); + curMatches.reserve(k); + + int* trainIdx_ptr = 
trainIdxCPU.ptr(queryIdx); + float* distance_ptr = distanceCPU.ptr(queryIdx); + for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + + if (trainIdx != -1) + { + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, 0, distance); + + curMatches.push_back(m); + } + } + + if (compactResult && curMatches.empty()) + matches.pop_back(); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + vector< vector >& matches, int k, const GpuMat& mask, bool compactResult) +{ + GpuMat trainIdx, distance, allDist; + knnMatch(queryDescs, trainDescs, trainIdx, distance, allDist, k, mask); + knnMatchDownload(trainIdx, distance, matches, compactResult); +} + +namespace +{ + class ImgIdxSetter + { + public: + ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {} + void operator()(DMatch& m) const {m.imgIdx = imgIdx;} + private: + int imgIdx; + }; +} + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, + vector< vector >& matches, int knn, const vector& masks, bool compactResult) +{ + vector< vector > curMatches; + vector temp; + temp.reserve(2 * knn); + + matches.resize(queryDescs.rows); + for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector::reserve), knn)); + + for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx) + { + knnMatch(queryDescs, trainDescCollection[imgIdx], curMatches, knn, + masks.empty() ? GpuMat() : masks[imgIdx]); + + for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx) + { + vector& localMatch = curMatches[queryIdx]; + vector& globalMatch = matches[queryIdx]; + + for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx)); + + temp.clear(); + merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp)); + + globalMatch.clear(); + const size_t count = std::min((size_t)knn, temp.size()); + copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch)); + } + } + + if (compactResult) + { + vector< vector >::iterator new_end = remove_if(matches.begin(), matches.end(), + mem_fun_ref(&vector::empty)); + matches.erase(new_end, matches.end()); + } +} + +//////////////////////////////////////////////////////////////////// +// RadiusMatch + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask) +{ + using namespace cv::gpu::bfmatcher; + + typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + + static const radiusMatch_caller_t radiusMatch_callers[2][8] = + { + { + radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, + radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, 0, 0 + }, + { + radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, + radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, 0, 0 + } + }; + + const int nQuery = queryDescs.rows; + const int nTrain = trainDescs.rows; + + CV_Assert(queryDescs.channels() == 1); + CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols); + CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery); + + nMatches.create(1, nQuery, CV_32SC1); + nMatches.setTo(Scalar::all(0)); + if (trainIdx.empty()) + { + trainIdx.create(nQuery, nTrain, CV_32SC1); + distance.create(nQuery, nTrain, 
CV_32FC1); + } + + radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr(), distance); +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, + const GpuMat& distance, std::vector< std::vector >& matches, bool compactResult) +{ + const int nQuery = trainIdx.rows; + + Mat trainIdxCPU = trainIdx; + Mat nMatchesCPU = nMatches; + Mat distanceCPU = distance; + + matches.clear(); + matches.reserve(nQuery); + + const unsigned int* nMatches_ptr = nMatchesCPU.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + const int* trainIdx_ptr = trainIdxCPU.ptr(queryIdx); + const float* distance_ptr = distanceCPU.ptr(queryIdx); + + const int nMatches = std::min(static_cast(nMatches_ptr[queryIdx]), trainIdx.cols); + + if (nMatches == 0) + { + if (!compactResult) + matches.push_back(vector()); + continue; + } + + matches.push_back(vector()); + vector& curMatches = matches.back(); + curMatches.reserve(nMatches); + + for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, 0, distance); + + curMatches.push_back(m); + } + sort(curMatches.begin(), curMatches.end()); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + vector< vector >& matches, float maxDistance, const GpuMat& mask, bool compactResult) +{ + GpuMat trainIdx, nMatches, distance; + radiusMatch(queryDescs, trainDescs, trainIdx, nMatches, distance, maxDistance, mask); + radiusMatchDownload(trainIdx, nMatches, distance, matches, compactResult); +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector< vector >& matches, + float maxDistance, const vector& masks, bool compactResult) + +{ + matches.resize(queryDescs.rows); + + vector< vector > curMatches; + + for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx) + { + radiusMatch(queryDescs, trainDescCollection[imgIdx], curMatches, maxDistance, + masks.empty() ? GpuMat() : masks[imgIdx]); + + for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx) + { + vector& localMatch = curMatches[queryIdx]; + vector& globalMatch = matches[queryIdx]; + + for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx)); + + const size_t oldSize = globalMatch.size(); + + copy(localMatch.begin(), localMatch.end(), back_inserter(globalMatch)); + inplace_merge(globalMatch.begin(), globalMatch.begin() + oldSize, globalMatch.end()); + } + } + + if (compactResult) + { + vector< vector >::iterator new_end = remove_if(matches.begin(), matches.end(), + mem_fun_ref(&vector::empty)); + matches.erase(new_end, matches.end()); + } +} + +#endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu new file mode 100644 index 0000000..c2e2c1c --- /dev/null +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -0,0 +1,1205 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "cuda_shared.hpp" +#include "limits_gpu.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace bfmatcher +{ +/////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////// General funcs ////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// + + template struct StaticAssert; + template <> struct StaticAssert {static __host__ __device__ void check(){}}; + + /////////////////////////////////////////////////////////////////////////////// + // Mask strategy + + class SingleMask + { + public: + explicit SingleMask(const PtrStep& mask_) : mask(mask_) {} + + __device__ bool operator()(int queryIdx, int trainIdx) const + { + return mask.ptr(queryIdx)[trainIdx] != 0; + } + private: + PtrStep mask; + }; + + class MaskCollection + { + public: + explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {} + + __device__ void nextMask() + { + curMask = *maskCollection++; + } + + __device__ bool operator()(int queryIdx, int trainIdx) const + { + return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0; + } + private: + PtrStep* maskCollection; + PtrStep curMask; + }; + + class WithOutMask + { + public: + __device__ void nextMask() + { + } + __device__ bool operator()(int queryIdx, int trainIdx) const + { + return true; + } + }; + + /////////////////////////////////////////////////////////////////////////////// + // Reduce Sum + + template + __device__ void reduceSum(float* sdiff, float mySum, int tid) + { + sdiff[tid] = mySum; + __syncthreads(); + + if (BLOCK_DIM_X == 512) + { + if (tid < 256) + { + sdiff[tid] = mySum += sdiff[tid + 256]; 
__syncthreads(); + sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads(); + sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads(); + } + volatile float* smem = sdiff; + smem[tid] = mySum += smem[tid + 32]; + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + if (BLOCK_DIM_X == 256) + { + if (tid < 128) + { + sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads(); + sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads(); + } + volatile float* smem = sdiff; + smem[tid] = mySum += smem[tid + 32]; + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + if (BLOCK_DIM_X == 128) + { + if (tid < 64) + { + sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads(); + } + volatile float* smem = sdiff; + smem[tid] = mySum += smem[tid + 32]; + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + + volatile float* smem = sdiff; + if (BLOCK_DIM_X == 64) + { + if (tid < 32) + { + smem[tid] = mySum += smem[tid + 32]; + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 32) + { + if (tid < 16) + { + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 16) + { + if (tid < 8) + { + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 8) + { + if (tid < 4) + { + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 4) + { + if (tid < 2) + { + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 2) + { + if (tid < 1) + { + smem[tid] = mySum += smem[tid + 1]; + } + } + } + + /////////////////////////////////////////////////////////////////////////////// + // loadDescsVals + + template + __device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals) + { + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < desc_len) + { + smem[tid] = (float)descs[tid]; + } + __syncthreads(); + + #pragma unroll + for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X) + { + *queryVals = smem[i]; + ++queryVals; + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Distance + + template + class L1Dist + { + public: + __device__ L1Dist() : mySum(0) {} + + __device__ void reduceIter(float val1, float val2) + { + mySum += fabs(val1 - val2); + } + + __device__ void reduceAll(float* sdiff, int tid) + { + reduceSum(sdiff, mySum, tid); + } + + static __device__ float finalResult(float res) + { + return res; + } + private: + float mySum; + }; + + template + class L2Dist + { + public: + __device__ L2Dist() : mySum(0) {} + + __device__ void reduceIter(float val1, float val2) + { + float reg = val1 - val2; + mySum += reg * reg; + } + + 
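        // Fold the per-thread partial sums (one per lane of the BLOCK_DIM_X threads that share
        // this descriptor) into sdiff[0] using the shared-memory tree reduction in reduceSum().
        // After the call, thread 0 reads the accumulated squared-L2 sum from sdiff[0];
        // finalResult() applies the square root later, in findBestMatch(), only to the winning
        // minimum.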
__device__ void reduceAll(float* sdiff, int tid) + { + reduceSum(sdiff, mySum, tid); + } + + static __device__ float finalResult(float res) + { + return sqrtf(res); + } + private: + float mySum; + }; + + /////////////////////////////////////////////////////////////////////////////// + // reduceDescDiff + + template + __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff) + { + const int tid = threadIdx.x; + + Dist dist; + + for (int i = tid; i < desc_len; i += BLOCK_DIM_X) + dist.reduceIter(queryDescs[i], trainDescs[i]); + + dist.reduceAll(sdiff, tid); + } + + /////////////////////////////////////////////////////////////////////////////// + // reduceDescDiff_smem + + template struct UnrollDescDiff + { + template + static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs, + int ind, int desc_len) + { + if (ind < desc_len) + dist.reduceIter(*queryVals, trainDescs[ind]); + + ++queryVals; + + UnrollDescDiff::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len); + } + + template + static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs) + { + dist.reduceIter(*queryVals, *trainDescs); + + ++queryVals; + trainDescs += blockDim.x; + + UnrollDescDiff::calcWithoutCheck(dist, queryVals, trainDescs); + } + }; + template <> struct UnrollDescDiff<0> + { + template + static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs, + int ind, int desc_len) + { + } + + template + static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs) + { + } + }; + + template struct DescDiffCalculator; + template + struct DescDiffCalculator + { + template + static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len) + { + UnrollDescDiff::calcCheck(dist, queryVals, trainDescs, + threadIdx.x, desc_len); + } + }; + template + struct DescDiffCalculator + { + template + static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len) + { + UnrollDescDiff::calcWithoutCheck(dist, queryVals, + trainDescs + threadIdx.x); + } + }; + + template + __device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff) + { + const int tid = threadIdx.x; + + Dist dist; + + DescDiffCalculator::calc(dist, queryVals, + trainDescs, desc_len); + + dist.reduceAll(sdiff, tid); + } + +/////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////// Match ////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////////////////////////////////////////////////////////// + // warpReduceMin + + template + __device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx) + { + float minSum = sdata[tid]; + + if (BLOCK_DIM_Y >= 64) + { + float reg = sdata[tid + 32]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 32]; + simgIdx[tid] = simgIdx[tid + 32]; + } + } + if (BLOCK_DIM_Y >= 32) + { + float reg = sdata[tid + 16]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 16]; + simgIdx[tid] = simgIdx[tid + 16]; + } + } + if (BLOCK_DIM_Y >= 16) + { + float reg = sdata[tid + 8]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 8]; + simgIdx[tid] 
= simgIdx[tid + 8]; + } + } + if (BLOCK_DIM_Y >= 8) + { + float reg = sdata[tid + 4]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 4]; + simgIdx[tid] = simgIdx[tid + 4]; + } + } + if (BLOCK_DIM_Y >= 4) + { + float reg = sdata[tid + 2]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 2]; + simgIdx[tid] = simgIdx[tid + 2]; + } + } + if (BLOCK_DIM_Y >= 2) + { + float reg = sdata[tid + 1]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 1]; + simgIdx[tid] = simgIdx[tid + 1]; + } + } + } + + /////////////////////////////////////////////////////////////////////////////// + // findBestMatch + + template + __device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx, + float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance) + { + if (threadIdx.x == 0) + { + smin[threadIdx.y] = myMin; + strainIdx[threadIdx.y] = myBestTrainIdx; + simgIdx[threadIdx.y] = myBestImgIdx; + } + __syncthreads(); + + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < 32) + warpReduceMin(tid, smin, strainIdx, simgIdx); + + if (threadIdx.x == 0 && threadIdx.y == 0) + { + float minSum = smin[0]; + int bestTrainIdx = strainIdx[0]; + int bestImgIdx = simgIdx[0]; + + imgIdx[queryIdx] = bestImgIdx; + trainIdx[queryIdx] = bestTrainIdx; + distance[queryIdx] = Dist::finalResult(minSum); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // ReduceDescCalculator + + template + class ReduceDescCalculatorSimple + { + public: + __device__ void prepare(const T* queryDescs_, int, float*) + { + queryDescs = queryDescs_; + } + + __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const + { + reduceDescDiff(queryDescs, trainDescs, desc_len, sdiff_row); + } + + private: + const T* queryDescs; + }; + + template + class ReduceDescCalculatorSmem + { + public: + __device__ void prepare(const T* queryDescs, int desc_len, float* smem) + { + loadDescsVals(queryDescs, desc_len, smem, queryVals); + } + + __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const + { + reduceDescDiff_smem(queryVals, trainDescs, + desc_len, sdiff_row); + } + + private: + float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X]; + }; + + /////////////////////////////////////////////////////////////////////////////// + // matchDescs loop + + template + __device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_& trainDescs_, + const Mask& m, const ReduceDescCalculator& reduceDescCalc, + float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) + { + const T* trainDescs = trainDescs_.ptr(threadIdx.y); + const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T); + for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; + trainIdx += blockDim.y, trainDescs += trainDescsStep) + { + if (m(queryIdx, trainIdx)) + { + reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row); + + if (threadIdx.x == 0) + { + float reg = sdiff_row[0]; + if (reg < myMin) + { + myMin = reg; + myBestTrainIdx = trainIdx; + myBestImgIdx = imgIdx; + } + } + } + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Train collection loop strategy + + template + class SingleTrain + { + public: + explicit SingleTrain(const DevMem2D_& trainDescs_) : trainDescs(trainDescs_) + { + } + + template + __device__ void loop(int queryIdx, Mask& m, 
const ReduceDescCalculator& reduceDescCalc, + float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const + { + matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, + sdiff_row, myMin, myBestTrainIdx, myBestImgIdx); + } + + __device__ int desc_len() const + { + return trainDescs.cols; + } + private: + DevMem2D_ trainDescs; + }; + + template + class TrainCollection + { + public: + TrainCollection(const DevMem2D_* trainCollection_, int nImg_, int desclen_) : + trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_) + { + } + + template + __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, + float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const + { + for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) + { + DevMem2D_ trainDescs = trainCollection[imgIdx]; + m.nextMask(); + matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, + sdiff_row, myMin, myBestTrainIdx, myBestImgIdx); + } + } + + __device__ int desc_len() const + { + return desclen; + } + private: + const DevMem2D_* trainCollection; + int nImg; + int desclen; + }; + + /////////////////////////////////////////////////////////////////////////////// + // Match kernel + + template + __global__ void match(PtrStep_ queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance) + { + __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ float smin[64]; + __shared__ int strainIdx[64]; + __shared__ int simgIdx[64]; + + const int queryIdx = blockIdx.x; + + int myBestTrainIdx = -1; + int myBestImgIdx = -1; + float myMin = numeric_limits_gpu::max(); + + { + float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; + Mask m = mask; + ReduceDescCalculator reduceDescCalc; + reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff); + + train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx); + } + + findBestMatch(queryIdx, myMin, myBestTrainIdx, myBestImgIdx, + smin, strainIdx, simgIdx, trainIdx, imgIdx, distance); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match kernel callers + + template class Dist, typename T, + typename Train, typename Mask> + void match_caller(const DevMem2D_& queryDescs, const Train& train, + const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + { + StaticAssert::check(); // blockDimY vals must reduce by warp + + dim3 grid(queryDescs.rows, 1, 1); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + match, T>, + Dist, T><<>>(queryDescs, train, mask, trainIdx.data, + imgIdx.data, distance.data); + + cudaSafeCall( cudaThreadSynchronize() ); + } + template class Dist, typename T, typename Train, typename Mask> + void match_smem_caller(const DevMem2D_& queryDescs, const Train& train, + const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + { + StaticAssert::check(); // blockDimY vals must reduce by warp + StaticAssert= MAX_DESCRIPTORS_LEN>::check(); // block size must be greter than descriptors length + StaticAssert::check(); // max descriptors length must divide to blockDimX + + dim3 grid(queryDescs.rows, 1, 1); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + match, T>, + Dist, T><<>>(queryDescs, train, mask, trainIdx.data, + imgIdx.data, distance.data); + + cudaSafeCall( cudaThreadSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match kernel chooser + + template