added BruteForceMatcher_GPU
author     Vladislav Vinogradov <no@email>
Mon, 6 Dec 2010 12:06:51 +0000 (12:06 +0000)
committer  Vladislav Vinogradov <no@email>
Mon, 6 Dec 2010 12:06:51 +0000 (12:06 +0000)
modules/gpu/CMakeLists.txt
modules/gpu/include/opencv2/gpu/gpu.hpp
modules/gpu/src/brute_force_matcher.cpp [new file with mode: 0644]
modules/gpu/src/cuda/brute_force_matcher.cu [new file with mode: 0644]
tests/gpu/src/brute_force_matcher.cpp [new file with mode: 0644]
tests/gpu/src/gputest.hpp

index 8181c3d..f41817b 100644 (file)
@@ -1,6 +1,6 @@
 \r
 set(name "gpu")\r
-set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect")\r
+set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann")\r
 \r
 set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)\r
 \r
index a8d3594..53a1a74 100644 (file)
@@ -48,6 +48,7 @@
 #include "opencv2/imgproc/imgproc.hpp"\r
 #include "opencv2/objdetect/objdetect.hpp"\r
 #include "opencv2/gpu/devmem2d.hpp"\r
+#include "opencv2/features2d/features2d.hpp"\r
 \r
 namespace cv\r
 {\r
@@ -1118,7 +1119,152 @@ namespace cv
 \r
             // Gradients computation results\r
             GpuMat grad, qangle;            \r
-        };    \r
+        };\r
+\r
+\r
+        ////////////////////////////////// BruteForceMatcher //////////////////////////////////\r
+\r
+        class CV_EXPORTS BruteForceMatcher_GPU_base\r
+        {\r
+        public:\r
+            enum DistType {L1Dist = 0, L2Dist};\r
+\r
+            explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist);\r
+\r
+            // Add descriptors to train descriptor collection.\r
+            void add(const std::vector<GpuMat>& descCollection);\r
+\r
+            // Get train descriptors collection.\r
+            const std::vector<GpuMat>& getTrainDescriptors() const;\r
+\r
+            // Clear train descriptors collection.\r
+            void clear();\r
+\r
+            // Return true if there are no train descriptors in the collection.\r
+            bool empty() const;\r
+\r
+            // Return true if the matcher supports masks in the match methods.\r
+            bool isMaskSupported() const;\r
+\r
+            // Find one best match for each query descriptor.\r
+            // trainIdx.at<int>(0, queryIdx) will contain best train index for queryIdx\r
+            // distance.at<float>(0, queryIdx) will contain distance\r
+            void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+                GpuMat& trainIdx, GpuMat& distance,\r
+                const GpuMat& mask = GpuMat());\r
+\r
+            // Download trainIdx and distance to CPU vector with DMatch\r
+            static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);\r
+\r
+            // Find one best match for each query descriptor.\r
+            void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector<DMatch>& matches, \r
+                const GpuMat& mask = GpuMat());\r
+\r
+            // Make a GPU collection of train descriptors and masks in a format suitable for the matchCollection function.\r
+            void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,\r
+                const std::vector<GpuMat>& masks = std::vector<GpuMat>());\r
+\r
+            // Find one best match from train collection for each query descriptor.\r
+            // trainIdx.at<int>(0, queryIdx) will contain best train index for queryIdx\r
+            // imgIdx.at<int>(0, queryIdx) will contain best image index for queryIdx\r
+            // distance.at<float>(0, queryIdx) will contain distance\r
+            void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, \r
+                GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, \r
+                const GpuMat& maskCollection);\r
+\r
+            // Download trainIdx, imgIdx and distance to CPU vector with DMatch\r
+            static void matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, const GpuMat& distance, \r
+                std::vector<DMatch>& matches);\r
+\r
+            // Find one best match from train collection for each query descriptor.\r
+            void match(const GpuMat& queryDescs, std::vector<DMatch>& matches, \r
+                const std::vector<GpuMat>& masks = std::vector<GpuMat>());\r
+\r
+            // Find k best matches for each query descriptor (in increasing order of distances).\r
+            // trainIdx.at<int>(queryIdx, i) will contain the index of the i'th best train (i < k).\r
+            // distance.at<float>(queryIdx, i) will contain the distance.\r
+            // allDist is a buffer to store all distances between query descriptors and train descriptors;\r
+            // it has size (nQuery, nTrain) and type CV_32F.\r
+            // allDist.at<float>(queryIdx, trainIdx) will contain FLT_MAX if trainIdx is one of the k best,\r
+            // otherwise it will contain the distance between the queryIdx and trainIdx descriptors.\r
+            void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+                GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat());\r
+\r
+            // Download trainIdx and distance to CPU vector with DMatch\r
+            // compactResult is used when mask is not empty. If compactResult is false matches\r
+            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+            // matches vector will not contain matches for fully masked out query descriptors.\r
+            static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,\r
+                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+\r
+            // Find k best matches for each query descriptor (in increasing order of distances).\r
+            // compactResult is used when mask is not empty. If compactResult is false matches\r
+            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+            // matches vector will not contain matches for fully masked out query descriptors.\r
+            void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+                std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(), \r
+                bool compactResult = false);            \r
+\r
+            // Find k best matches for each query descriptor (in increasing order of distances).\r
+            // compactResult is used when mask is not empty. If compactResult is false matches\r
+            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+            // matches vector will not contain matches for fully masked out query descriptors.\r
+            void knnMatch(const GpuMat& queryDescs, std::vector< std::vector<DMatch> >& matches, int knn, \r
+                const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false );\r
+\r
+            // Find the best matches for each query descriptor with distance less than maxDistance.\r
+            // nMatches.at<unsigned int>(0, queryIdx) will contain the match count for queryIdx.\r
+            // Note: nMatches can be greater than trainIdx.cols - it means that the matcher didn't find all matches,\r
+            // because it didn't have enough memory.\r
+            // trainIdx.at<int>(queryIdx, i) will contain the i'th train index (i < min(nMatches.at<unsigned int>(0, queryIdx), trainIdx.cols)).\r
+            // distance.at<float>(queryIdx, i) will contain the i'th distance (i < min(nMatches.at<unsigned int>(0, queryIdx), trainIdx.cols)).\r
+            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x nTrain;\r
+            // otherwise the user can pass preallocated trainIdx and distance with size nQuery x nMaxMatches.\r
+            // Matches are not sorted.\r
+            void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+                GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, \r
+                const GpuMat& mask = GpuMat());\r
+\r
+            // Download trainIdx, nMatches and distance to CPU vector with DMatch.\r
+            // matches will be sorted in increasing order of distances.\r
+            // compactResult is used when mask is not empty. If compactResult is false matches\r
+            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+            // matches vector will not contain matches for fully masked out query descriptors.\r
+            static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, const GpuMat& distance,\r
+                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+\r
+            // Find the best matches for each query descriptor with distance less than maxDistance\r
+            // (in increasing order of distances).\r
+            void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+                std::vector< std::vector<DMatch> >& matches, float maxDistance, \r
+                const GpuMat& mask = GpuMat(), bool compactResult = false);\r
+\r
+            // Find the best matches from the train collection for each query descriptor with distance less than\r
+            // maxDistance (in increasing order of distances).\r
+            void radiusMatch(const GpuMat& queryDescs, std::vector< std::vector<DMatch> >& matches, float maxDistance, \r
+                const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);        \r
+        \r
+        private:\r
+            DistType distType;\r
+\r
+            std::vector<GpuMat> trainDescCollection;\r
+        };\r
+\r
+        template <class Distance>\r
+        class CV_EXPORTS BruteForceMatcher_GPU;\r
+\r
+        template <typename T>\r
+        class CV_EXPORTS BruteForceMatcher_GPU< L1<T> > : public BruteForceMatcher_GPU_base\r
+        {\r
+        public:\r
+            explicit BruteForceMatcher_GPU(L1<T> d = L1<T>()) : BruteForceMatcher_GPU_base(L1Dist) {}\r
+        };\r
+        template <typename T>\r
+        class CV_EXPORTS BruteForceMatcher_GPU< L2<T> > : public BruteForceMatcher_GPU_base\r
+        {\r
+        public:\r
+            explicit BruteForceMatcher_GPU(L2<T> d = L2<T>()) : BruteForceMatcher_GPU_base(L2Dist) {}\r
+        };\r
     }\r
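
A minimal usage sketch (not part of this commit) for the interface declared above. It assumes CV_32F descriptors already computed on the host; the Mats queryCPU and trainCPU and the function name matchOnGpu are hypothetical placeholders:

    #include <opencv2/gpu/gpu.hpp>
    #include <vector>

    void matchOnGpu(const cv::Mat& queryCPU, const cv::Mat& trainCPU)
    {
        cv::gpu::GpuMat queryDescs(queryCPU);                       // blocking upload of query descriptors
        cv::gpu::GpuMat trainDescs(trainCPU);                       // blocking upload of train descriptors

        cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;    // L2 distance specialization

        std::vector<cv::DMatch> matches;
        matcher.match(queryDescs, trainDescs, matches);             // one best match per query descriptor

        std::vector< std::vector<cv::DMatch> > knnMatches;
        matcher.knnMatch(queryDescs, trainDescs, knnMatches, 2);    // two best matches per query descriptor

        std::vector< std::vector<cv::DMatch> > radiusMatches;
        matcher.radiusMatch(queryDescs, trainDescs, radiusMatches, 0.3f); // all matches with distance < 0.3
    }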
 \r
 \r
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
new file mode 100644 (file)
index 0000000..1dd3c0a
--- /dev/null
@@ -0,0 +1,605 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include "precomp.hpp"\r
+\r
+using namespace cv;\r
+using namespace cv::gpu;\r
+using namespace std;\r
+\r
+#if !defined (HAVE_CUDA)\r
+\r
+cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>&) { throw_nogpu(); }\r
+const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const { throw_nogpu(); return trainDescCollection; }\r
+void cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); }\r
+bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return true; }\r
+bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }\r
+\r
+#else /* !defined (HAVE_CUDA) */\r
+\r
+namespace cv { namespace gpu { namespace bfmatcher\r
+{\r
+    template <typename T>\r
+    void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template <typename T>\r
+    void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template <typename T>\r
+    void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+        const DevMem2Df& distance);\r
+    template <typename T>\r
+    void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+        const DevMem2Df& distance);\r
+\r
+    template <typename T>\r
+    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template <typename T>\r
+    void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
+    template <typename T>\r
+    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template <typename T>\r
+    void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+}}}\r
+\r
+cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)\r
+{\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////\r
+// Train collection\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection)\r
+{\r
+    trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());\r
+}\r
+\r
+const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const \r
+{\r
+    return trainDescCollection;\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::clear() \r
+{\r
+    trainDescCollection.clear();\r
+}\r
+\r
+bool cv::gpu::BruteForceMatcher_GPU_base::empty() const\r
+{\r
+    return trainDescCollection.empty();\r
+}\r
+\r
+bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const\r
+{\r
+    return true;\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////\r
+// Match\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+    GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask)\r
+{\r
+    using namespace cv::gpu::bfmatcher;\r
+\r
+    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+\r
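+    // Dispatch table indexed by [distType][descriptor depth]; zero entries mark unsupported depths.\r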
+    static const match_caller_t match_callers[2][8] = \r
+    {\r
+        {\r
+            matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<char>, matchSingleL1_gpu<unsigned short>, \r
+            matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0\r
+        },\r
+        {\r
+            matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<char>, matchSingleL2_gpu<unsigned short>, \r
+            matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0\r
+        }\r
+    };\r
+    \r
+    CV_Assert(queryDescs.channels() == 1);\r
+    CV_Assert(trainDescs.cols == queryDescs.cols && trainDescs.type() == queryDescs.type());\r
+    \r
+    const int nQuery = queryDescs.rows;\r
+\r
+    trainIdx.create(1, nQuery, CV_32S);\r
+    distance.create(1, nQuery, CV_32F);\r
+\r
+    match_caller_t func = match_callers[distType][queryDescs.depth()];\r
+    CV_Assert(func != 0);\r
+\r
+    // For a single train image there is no need to save imgIdx, so we just pass trainIdx in place of imgIdx.\r
+    // trainIdx is stored after imgIdx, so its value is not lost.\r
+    func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, \r
+    vector<DMatch>& matches)\r
+{\r
+    const int nQuery = trainIdx.cols;\r
+\r
+    Mat trainIdxCPU = trainIdx;\r
+    Mat distanceCPU = distance;\r
+\r
+    matches.clear();\r
+    matches.reserve(nQuery);\r
+\r
+    const int* trainIdx_ptr = trainIdxCPU.ptr<int>();\r
+    const float* distance_ptr =  distanceCPU.ptr<float>();\r
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)\r
+    {\r
+        int trainIdx = *trainIdx_ptr;\r
+        if (trainIdx == -1)\r
+            continue;\r
+\r
+        float distance = *distance_ptr;\r
+\r
+        DMatch m(queryIdx, trainIdx, 0, distance);\r
+\r
+        matches.push_back(m);\r
+    }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+    vector<DMatch>& matches, const GpuMat& mask)\r
+{\r
+    GpuMat trainIdx, distance;\r
+    matchSingle(queryDescs, trainDescs, trainIdx, distance, mask);\r
+    matchDownload(trainIdx, distance, matches);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, \r
+    const vector<GpuMat>& masks)\r
+{\r
+    if (masks.empty())\r
+    {        \r
+        Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D)));\r
+\r
+        for (size_t i = 0; i < trainDescCollection.size(); ++i)\r
+        {\r
+            const GpuMat& trainDescs = trainDescCollection[i];\r
+\r
+            trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;\r
+        }\r
+\r
+        trainCollection.upload(trainCollectionCPU);\r
+    }\r
+    else\r
+    {\r
+        CV_Assert(masks.size() == trainDescCollection.size());\r
+        \r
+        Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D)));\r
+        Mat maskCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(PtrStep)));\r
+\r
+        for (size_t i = 0; i < trainDescCollection.size(); ++i)\r
+        {\r
+            const GpuMat& trainDescs = trainDescCollection[i];\r
+            const GpuMat& mask = masks[i];\r
+\r
+            CV_Assert(mask.empty() || (mask.type() == CV_8UC1));\r
+\r
+            trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;\r
+\r
+            maskCollectionCPU.ptr<PtrStep>(0)[i] = static_cast<PtrStep>(mask);\r
+        }\r
+\r
+        trainCollection.upload(trainCollectionCPU);\r
+        maskCollection.upload(maskCollectionCPU);\r
+    }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, \r
+    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection)\r
+{\r
+    using namespace cv::gpu::bfmatcher;\r
+\r
+    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+        const DevMem2Df& distance);\r
+\r
+    static const match_caller_t match_callers[2][8] = \r
+    {\r
+        {\r
+            matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<char>, \r
+            matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>, \r
+            matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0\r
+        },\r
+        {\r
+            matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<char>, \r
+            matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>, \r
+            matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0\r
+        }\r
+    };\r
+    \r
+    CV_Assert(queryDescs.channels() == 1);\r
+    \r
+    const int nQuery = queryDescs.rows;\r
+\r
+    trainIdx.create(1, nQuery, CV_32S);\r
+    imgIdx.create(1, nQuery, CV_32S);\r
+    distance.create(1, nQuery, CV_32F);\r
+\r
+    match_caller_t func = match_callers[distType][queryDescs.depth()];\r
+    CV_Assert(func != 0);\r
+\r
+    func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, \r
+    const GpuMat& distance, vector<DMatch>& matches)\r
+{\r
+    const int nQuery = trainIdx.cols;\r
+\r
+    Mat trainIdxCPU = trainIdx;\r
+    Mat imgIdxCPU = imgIdx;\r
+    Mat distanceCPU = distance;\r
+\r
+    matches.clear();\r
+    matches.reserve(nQuery);\r
+\r
+    const int* trainIdx_ptr = trainIdxCPU.ptr<int>();\r
+    const int* imgIdx_ptr = imgIdxCPU.ptr<int>();\r
+    const float* distance_ptr =  distanceCPU.ptr<float>();\r
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)\r
+    {\r
+        int trainIdx = *trainIdx_ptr;\r
+        if (trainIdx == -1)\r
+            continue;\r
+\r
+        int imgIdx = *imgIdx_ptr;\r
+\r
+        float distance = *distance_ptr;\r
+\r
+        DMatch m(queryIdx, trainIdx, imgIdx, distance);\r
+\r
+        matches.push_back(m);\r
+    }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches, \r
+    const vector<GpuMat>& masks)\r
+{    \r
+    GpuMat trainCollection;\r
+    GpuMat maskCollection;\r
+\r
+    makeGpuCollection(trainCollection, maskCollection, masks);\r
+\r
+    GpuMat trainIdx, imgIdx, distance;\r
+\r
+    matchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, maskCollection);\r
+    matchDownload(trainIdx, imgIdx, distance, matches);\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////\r
+// KnnMatch\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+    GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask)\r
+{\r
+    using namespace cv::gpu::bfmatcher;\r
+\r
+    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
+    static const match_caller_t match_callers[2][8] = \r
+    {\r
+        {\r
+            knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<char>, knnMatchL1_gpu<unsigned short>, \r
+            knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0\r
+        },\r
+        {\r
+            knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<char>, knnMatchL2_gpu<unsigned short>, \r
+            knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0\r
+        }\r
+    };\r
+    \r
+    CV_Assert(queryDescs.channels() == 1);\r
+    \r
+    const int nQuery = queryDescs.rows;\r
+    const int nTrain = trainDescs.rows;\r
+\r
+    trainIdx.create(nQuery, k, CV_32S);\r
+    trainIdx.setTo(Scalar::all(-1));\r
+    distance.create(nQuery, k, CV_32F);\r
+\r
+    allDist.create(nQuery, nTrain, CV_32F);\r
+\r
+    match_caller_t func = match_callers[distType][queryDescs.depth()];\r
+    CV_Assert(func != 0);\r
+\r
+    func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,\r
+    vector< vector<DMatch> >& matches, bool compactResult)\r
+{\r
+    const int nQuery = distance.rows;\r
+    const int k = trainIdx.cols;\r
+\r
+    Mat trainIdxCPU = trainIdx;\r
+    Mat distanceCPU = distance;\r
+\r
+    matches.clear();\r
+    matches.reserve(nQuery);\r
+\r
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)\r
+    {\r
+        matches.push_back(vector<DMatch>());\r
+        vector<DMatch>& curMatches = matches.back();\r
+        curMatches.reserve(k);\r
+\r
+        int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);\r
+        float* distance_ptr = distanceCPU.ptr<float>(queryIdx);\r
+        for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)\r
+        {\r
+            int trainIdx = *trainIdx_ptr;\r
+\r
+            if (trainIdx != -1)\r
+            {\r
+                float distance = *distance_ptr;\r
+\r
+                DMatch m(queryIdx, trainIdx, 0, distance);\r
+\r
+                curMatches.push_back(m);\r
+            }\r
+        }\r
+\r
+        if (compactResult && curMatches.empty())\r
+            matches.pop_back();\r
+    }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+    vector< vector<DMatch> >& matches, int k, const GpuMat& mask, bool compactResult)\r
+{\r
+    GpuMat trainIdx, distance, allDist;\r
+    knnMatch(queryDescs, trainDescs, trainIdx, distance, allDist, k, mask);\r
+    knnMatchDownload(trainIdx, distance, matches, compactResult);\r
+}\r
+\r
+namespace\r
+{\r
+    class ImgIdxSetter\r
+    {\r
+    public:\r
+        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}\r
+        void operator()(DMatch& m) const {m.imgIdx = imgIdx;}\r
+    private:\r
+        int imgIdx;\r
+    };\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, \r
+    vector< vector<DMatch> >& matches, int knn, const vector<GpuMat>& masks, bool compactResult)\r
+{\r
+    vector< vector<DMatch> > curMatches;\r
+    vector<DMatch> temp;\r
+    temp.reserve(2 * knn);\r
+\r
+    matches.resize(queryDescs.rows);\r
+    for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector<DMatch>::reserve), knn));\r
+\r
+    for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx)\r
+    {\r
+        knnMatch(queryDescs, trainDescCollection[imgIdx], curMatches, knn, \r
+            masks.empty() ? GpuMat() : masks[imgIdx]);\r
+\r
+        for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx)\r
+        {\r
+            vector<DMatch>& localMatch = curMatches[queryIdx];\r
+            vector<DMatch>& globalMatch = matches[queryIdx];\r
+\r
+            for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));\r
+\r
+            temp.clear();\r
+            merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));\r
+\r
+            globalMatch.clear();\r
+            const size_t count = std::min((size_t)knn, temp.size());\r
+            copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch));\r
+        }\r
+    }\r
+\r
+    if (compactResult)\r
+    {\r
+        vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), \r
+            mem_fun_ref(&vector<DMatch>::empty));\r
+        matches.erase(new_end, matches.end());\r
+    }\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////\r
+// RadiusMatch\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+    GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask)\r
+{\r
+    using namespace cv::gpu::bfmatcher;\r
+\r
+    typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+\r
+    static const radiusMatch_caller_t radiusMatch_callers[2][8] = \r
+    {\r
+        {\r
+            radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<char>, radiusMatchL1_gpu<unsigned short>, \r
+            radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0\r
+        },\r
+        {\r
+            radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<char>, radiusMatchL2_gpu<unsigned short>, \r
+            radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0\r
+        }\r
+    };\r
+    \r
+    const int nQuery = queryDescs.rows;\r
+    const int nTrain = trainDescs.rows;\r
+    \r
+    CV_Assert(queryDescs.channels() == 1);\r
+    CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);\r
+    CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery);    \r
+    \r
+    nMatches.create(1, nQuery, CV_32SC1);\r
+    nMatches.setTo(Scalar::all(0));\r
+    if (trainIdx.empty())\r
+    {\r
+        trainIdx.create(nQuery, nTrain, CV_32SC1);\r
+        distance.create(nQuery, nTrain, CV_32FC1);\r
+    }\r
+\r
+    radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];\r
+    CV_Assert(func != 0);\r
+\r
+    func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr<unsigned int>(), distance);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, \r
+    const GpuMat& distance, std::vector< std::vector<DMatch> >& matches, bool compactResult)\r
+{\r
+    const int nQuery = trainIdx.rows;\r
+\r
+    Mat trainIdxCPU = trainIdx;\r
+    Mat nMatchesCPU = nMatches;\r
+    Mat distanceCPU = distance;\r
+\r
+    matches.clear();\r
+    matches.reserve(nQuery);\r
+\r
+    const unsigned int* nMatches_ptr = nMatchesCPU.ptr<unsigned int>();\r
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)\r
+    {\r
+        const int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);\r
+        const float* distance_ptr = distanceCPU.ptr<float>(queryIdx);\r
+\r
+        const int nMatches = std::min(static_cast<int>(nMatches_ptr[queryIdx]), trainIdx.cols);\r
+\r
+        if (nMatches == 0)\r
+        {\r
+            if (!compactResult)\r
+                matches.push_back(vector<DMatch>());\r
+            continue;\r
+        }\r
+\r
+        matches.push_back(vector<DMatch>());\r
+        vector<DMatch>& curMatches = matches.back();\r
+        curMatches.reserve(nMatches);\r
+\r
+        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)\r
+        {\r
+            int trainIdx = *trainIdx_ptr;\r
+\r
+            float distance = *distance_ptr;\r
+            \r
+            DMatch m(queryIdx, trainIdx, 0, distance);\r
+\r
+            curMatches.push_back(m);\r
+        }\r
+        sort(curMatches.begin(), curMatches.end());\r
+    }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+    vector< vector<DMatch> >& matches, float maxDistance, const GpuMat& mask, bool compactResult)\r
+{\r
+    GpuMat trainIdx, nMatches, distance;\r
+    radiusMatch(queryDescs, trainDescs, trainIdx, nMatches, distance, maxDistance, mask);\r
+    radiusMatchDownload(trainIdx, nMatches, distance, matches, compactResult);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector< vector<DMatch> >& matches, \r
+                float maxDistance, const vector<GpuMat>& masks, bool compactResult)\r
+\r
+{\r
+    matches.resize(queryDescs.rows);\r
+\r
+    vector< vector<DMatch> > curMatches;\r
+\r
+    for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx)\r
+    {\r
+        radiusMatch(queryDescs, trainDescCollection[imgIdx], curMatches, maxDistance, \r
+            masks.empty() ? GpuMat() : masks[imgIdx]);\r
+\r
+        for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx)\r
+        {\r
+            vector<DMatch>& localMatch = curMatches[queryIdx];\r
+            vector<DMatch>& globalMatch = matches[queryIdx];\r
+\r
+            for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));\r
+\r
+            const size_t oldSize = globalMatch.size();\r
+\r
+            copy(localMatch.begin(), localMatch.end(), back_inserter(globalMatch));\r
+            inplace_merge(globalMatch.begin(), globalMatch.begin() + oldSize, globalMatch.end());\r
+        }\r
+    }\r
+\r
+    if (compactResult)\r
+    {\r
+        vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), \r
+            mem_fun_ref(&vector<DMatch>::empty));\r
+        matches.erase(new_end, matches.end());\r
+    }\r
+}\r
+\r
+#endif /* !defined (HAVE_CUDA) */\r
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
new file mode 100644 (file)
index 0000000..c2e2c1c
--- /dev/null
@@ -0,0 +1,1205 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include "cuda_shared.hpp"\r
+#include "limits_gpu.hpp"\r
+\r
+using namespace cv::gpu;\r
+using namespace cv::gpu::device;\r
+\r
+namespace cv { namespace gpu { namespace bfmatcher\r
+{\r
+///////////////////////////////////////////////////////////////////////////////////\r
+////////////////////////////////// General funcs //////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
+    \r
+    template <bool expr> struct StaticAssert;\r
+    template <> struct StaticAssert<true> {static __host__ __device__ void check(){}};\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Mask strategy\r
+\r
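+    // Mask functors: operator()(queryIdx, trainIdx) reports whether the given descriptor pair may be matched.\r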
+    class SingleMask\r
+    {\r
+    public:\r
+        explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}\r
+        \r
+        __device__ bool operator()(int queryIdx, int trainIdx) const\r
+        {            \r
+            return mask.ptr(queryIdx)[trainIdx] != 0;\r
+        }\r
+    private:\r
+        PtrStep mask;\r
+    };\r
+\r
+    class MaskCollection\r
+    {\r
+    public:\r
+        explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}\r
+\r
+        __device__ void nextMask()\r
+        {\r
+            curMask = *maskCollection++;\r
+        }\r
+        \r
+        __device__ bool operator()(int queryIdx, int trainIdx) const\r
+        {            \r
+            return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0;\r
+        }\r
+    private:\r
+        PtrStep* maskCollection;\r
+        PtrStep curMask;\r
+    };\r
+\r
+    class WithOutMask\r
+    {\r
+    public:\r
+        __device__ void nextMask()\r
+        {\r
+        }\r
+        __device__ bool operator()(int queryIdx, int trainIdx) const\r
+        {\r
+            return true;\r
+        }\r
+    };\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Reduce Sum\r
+    \r
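+    // Block-wide tree reduction of the per-thread partial sums; the total ends up in sdiff[0].\r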
+    template <int BLOCK_DIM_X>\r
+    __device__ void reduceSum(float* sdiff, float mySum, int tid)\r
+    {\r
+        sdiff[tid] = mySum;\r
+        __syncthreads();\r
+\r
+        if (BLOCK_DIM_X == 512) \r
+        {\r
+            if (tid < 256) \r
+            { \r
+                sdiff[tid] = mySum += sdiff[tid + 256]; __syncthreads(); \r
+                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();\r
+                sdiff[tid] = mySum += sdiff[tid +  64]; __syncthreads();\r
+            }\r
+            volatile float* smem = sdiff;\r
+            smem[tid] = mySum += smem[tid + 32]; \r
+            smem[tid] = mySum += smem[tid + 16]; \r
+            smem[tid] = mySum += smem[tid +  8]; \r
+            smem[tid] = mySum += smem[tid +  4]; \r
+            smem[tid] = mySum += smem[tid +  2];\r
+            smem[tid] = mySum += smem[tid +  1]; \r
+        }\r
+        if (BLOCK_DIM_X == 256)\r
+        {\r
+            if (tid < 128) \r
+            { \r
+                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads(); \r
+                sdiff[tid] = mySum += sdiff[tid +  64]; __syncthreads();\r
+            }\r
+            volatile float* smem = sdiff;\r
+            smem[tid] = mySum += smem[tid + 32]; \r
+            smem[tid] = mySum += smem[tid + 16]; \r
+            smem[tid] = mySum += smem[tid +  8]; \r
+            smem[tid] = mySum += smem[tid +  4]; \r
+            smem[tid] = mySum += smem[tid +  2];\r
+            smem[tid] = mySum += smem[tid +  1];\r
+        }\r
+        if (BLOCK_DIM_X == 128)\r
+        {\r
+            if (tid <  64) \r
+            { \r
+                sdiff[tid] = mySum += sdiff[tid +  64]; __syncthreads(); \r
+            }\r
+            volatile float* smem = sdiff;\r
+            smem[tid] = mySum += smem[tid + 32]; \r
+            smem[tid] = mySum += smem[tid + 16]; \r
+            smem[tid] = mySum += smem[tid +  8]; \r
+            smem[tid] = mySum += smem[tid +  4]; \r
+            smem[tid] = mySum += smem[tid +  2];\r
+            smem[tid] = mySum += smem[tid +  1];\r
+        }\r
+        \r
+        volatile float* smem = sdiff;\r
+        if (BLOCK_DIM_X == 64) \r
+        {\r
+            if (tid < 32) \r
+            {\r
+                smem[tid] = mySum += smem[tid + 32]; \r
+                smem[tid] = mySum += smem[tid + 16]; \r
+                smem[tid] = mySum += smem[tid +  8]; \r
+                smem[tid] = mySum += smem[tid +  4]; \r
+                smem[tid] = mySum += smem[tid +  2];\r
+                smem[tid] = mySum += smem[tid +  1];  \r
+            }\r
+        }\r
+        if (BLOCK_DIM_X == 32) \r
+        {\r
+            if (tid < 16) \r
+            {\r
+                smem[tid] = mySum += smem[tid + 16]; \r
+                smem[tid] = mySum += smem[tid +  8]; \r
+                smem[tid] = mySum += smem[tid +  4]; \r
+                smem[tid] = mySum += smem[tid +  2];\r
+                smem[tid] = mySum += smem[tid +  1];  \r
+            }\r
+        }\r
+        if (BLOCK_DIM_X == 16) \r
+        {\r
+            if (tid < 8) \r
+            {\r
+                smem[tid] = mySum += smem[tid +  8]; \r
+                smem[tid] = mySum += smem[tid +  4]; \r
+                smem[tid] = mySum += smem[tid +  2];\r
+                smem[tid] = mySum += smem[tid +  1];  \r
+            }\r
+        }\r
+        if (BLOCK_DIM_X == 8) \r
+        {\r
+            if (tid < 4) \r
+            {\r
+                smem[tid] = mySum += smem[tid +  4]; \r
+                smem[tid] = mySum += smem[tid +  2];\r
+                smem[tid] = mySum += smem[tid +  1];  \r
+            }\r
+        }\r
+        if (BLOCK_DIM_X == 4) \r
+        {\r
+            if (tid < 2) \r
+            {\r
+                smem[tid] = mySum += smem[tid +  2];\r
+                smem[tid] = mySum += smem[tid +  1];  \r
+            }\r
+        }\r
+        if (BLOCK_DIM_X == 2) \r
+        {\r
+            if (tid < 1) \r
+            {\r
+                smem[tid] = mySum += smem[tid +  1];  \r
+            }\r
+        }\r
+    }\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // loadDescsVals\r
+\r
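+    // Cache the query descriptor in shared memory, then copy each thread's strided slice into its queryVals registers.\r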
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, typename T> \r
+    __device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals)\r
+    {\r
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+        if (tid < desc_len)\r
+        {\r
+            smem[tid] = (float)descs[tid];\r
+        }\r
+        __syncthreads();\r
+\r
+        #pragma unroll\r
+        for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)\r
+        {\r
+            *queryVals = smem[i];\r
+            ++queryVals;\r
+        }\r
+    }\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Distance\r
+\r
+    template <int BLOCK_DIM_X>\r
+    class L1Dist\r
+    {\r
+    public:\r
+        __device__ L1Dist() : mySum(0) {}\r
+\r
+        __device__ void reduceIter(float val1, float val2)\r
+        {\r
+            mySum += fabs(val1 - val2);\r
+        }\r
+\r
+        __device__ void reduceAll(float* sdiff, int tid)\r
+        {\r
+            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);\r
+        }\r
+\r
+        static __device__ float finalResult(float res)\r
+        {\r
+            return res;\r
+        }\r
+    private:\r
+        float mySum;\r
+    };\r
+\r
+    template <int BLOCK_DIM_X>\r
+    class L2Dist\r
+    {\r
+    public:\r
+        __device__ L2Dist() : mySum(0) {}\r
+\r
+        __device__ void reduceIter(float val1, float val2)\r
+        {\r
+            float reg = val1 - val2;\r
+            mySum += reg * reg;\r
+        }\r
+\r
+        __device__ void reduceAll(float* sdiff, int tid)\r
+        {\r
+            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);\r
+        }\r
+\r
+        static __device__ float finalResult(float res)\r
+        {\r
+            return sqrtf(res);\r
+        }\r
+    private:\r
+        float mySum;\r
+    };\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // reduceDescDiff\r
+\r
+    template <int BLOCK_DIM_X, typename Dist, typename T> \r
+    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff)\r
+    {\r
+        const int tid = threadIdx.x;\r
+\r
+        Dist dist;\r
+\r
+        for (int i = tid; i < desc_len; i += BLOCK_DIM_X)\r
+            dist.reduceIter(queryDescs[i], trainDescs[i]);\r
+\r
+        dist.reduceAll(sdiff, tid);\r
+    }\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // reduceDescDiff_smem\r
+\r
+    template <int N> struct UnrollDescDiff\r
+    {\r
+        template <typename Dist, typename T>\r
+        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs, \r
+            int ind, int desc_len)\r
+        {\r
+            if (ind < desc_len)\r
+                dist.reduceIter(*queryVals, trainDescs[ind]);\r
+\r
+            ++queryVals;\r
+\r
+            UnrollDescDiff<N - 1>::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len);\r
+        }\r
+\r
+        template <typename Dist, typename T>\r
+        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)\r
+        {\r
+            dist.reduceIter(*queryVals, *trainDescs);\r
+\r
+            ++queryVals;\r
+            trainDescs += blockDim.x;\r
+\r
+            UnrollDescDiff<N - 1>::calcWithoutCheck(dist, queryVals, trainDescs);\r
+        }\r
+    };\r
+    template <> struct UnrollDescDiff<0>\r
+    {\r
+        template <typename Dist, typename T>\r
+        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs, \r
+            int ind, int desc_len)\r
+        {\r
+        }\r
+\r
+        template <typename Dist, typename T>\r
+        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)\r
+        {\r
+        }\r
+    };\r
+\r
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool WITH_OUT_CHECK> struct DescDiffCalculator;\r
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> \r
+    struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>\r
+    {\r
+        template <typename Dist, typename T>\r
+        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)\r
+        {\r
+            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(dist, queryVals, trainDescs, \r
+                threadIdx.x, desc_len);\r
+        }\r
+    };\r
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> \r
+    struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>\r
+    {\r
+        template <typename Dist, typename T>\r
+        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)\r
+        {\r
+            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(dist, queryVals, \r
+                trainDescs + threadIdx.x);\r
+        }\r
+    };\r
+\r
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>\r
+    __device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff)\r
+    {\r
+        const int tid = threadIdx.x;\r
+        \r
+        Dist dist;\r
+\r
+        DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(dist, queryVals, \r
+            trainDescs, desc_len);\r
+        \r
+        dist.reduceAll(sdiff, tid);\r
+    }\r
+\r
+///////////////////////////////////////////////////////////////////////////////////\r
+////////////////////////////////////// Match //////////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // warpReduceMin\r
+\r
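+    // Warp-level min reduction over the candidate distances, carrying along the matching train and image indices.\r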
+    template <int BLOCK_DIM_Y> \r
+    __device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx)\r
+    {\r
+        float minSum = sdata[tid];\r
+\r
+        if (BLOCK_DIM_Y >= 64) \r
+        {\r
+            float reg = sdata[tid + 32];\r
+            if (reg < minSum)\r
+            {\r
+                sdata[tid] = minSum = reg;\r
+                strainIdx[tid] = strainIdx[tid + 32];\r
+                simgIdx[tid] = simgIdx[tid + 32];\r
+            }\r
+        }\r
+        if (BLOCK_DIM_Y >= 32) \r
+        {\r
+            float reg = sdata[tid + 16];\r
+            if (reg < minSum)\r
+            {\r
+                sdata[tid] = minSum = reg;\r
+                strainIdx[tid] = strainIdx[tid + 16];\r
+                simgIdx[tid] = simgIdx[tid + 16];\r
+            }\r
+        }\r
+        if (BLOCK_DIM_Y >= 16) \r
+        {\r
+            float reg = sdata[tid + 8];\r
+            if (reg < minSum)\r
+            {\r
+                sdata[tid] = minSum = reg;\r
+                strainIdx[tid] = strainIdx[tid + 8];\r
+                simgIdx[tid] = simgIdx[tid + 8];\r
+            }\r
+        }\r
+        if (BLOCK_DIM_Y >= 8) \r
+        { \r
+            float reg = sdata[tid + 4];\r
+            if (reg < minSum)\r
+            {\r
+                sdata[tid] = minSum = reg;\r
+                strainIdx[tid] = strainIdx[tid + 4];\r
+                simgIdx[tid] = simgIdx[tid + 4];\r
+            }\r
+        }\r
+        if (BLOCK_DIM_Y >= 4) \r
+        { \r
+            float reg = sdata[tid + 2];\r
+            if (reg < minSum)\r
+            {\r
+                sdata[tid] = minSum = reg;\r
+                strainIdx[tid] = strainIdx[tid + 2];\r
+                simgIdx[tid] = simgIdx[tid + 2];\r
+            }\r
+        }\r
+        if (BLOCK_DIM_Y >= 2) \r
+        {\r
+            float reg = sdata[tid + 1];\r
+            if (reg < minSum)\r
+            {\r
+                sdata[tid] = minSum = reg;\r
+                strainIdx[tid] = strainIdx[tid + 1];\r
+                simgIdx[tid] = simgIdx[tid + 1];\r
+            }\r
+        }\r
+    }\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // findBestMatch\r
+\r
+    template <int BLOCK_DIM_Y, typename Dist>\r
+    __device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx, \r
+        float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance)\r
+    {\r
+        if (threadIdx.x == 0)\r
+        {\r
+            smin[threadIdx.y] = myMin;\r
+            strainIdx[threadIdx.y] = myBestTrainIdx;\r
+            simgIdx[threadIdx.y] = myBestImgIdx;\r
+        }\r
+        __syncthreads();\r
+\r
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+        if (tid < 32)\r
+            warpReduceMin<BLOCK_DIM_Y>(tid, smin, strainIdx, simgIdx);\r
+\r
+        if (threadIdx.x == 0 && threadIdx.y == 0)\r
+        {\r
+            float minSum = smin[0];\r
+            int bestTrainIdx = strainIdx[0];\r
+            int bestImgIdx = simgIdx[0];\r
+\r
+            imgIdx[queryIdx] = bestImgIdx;\r
+            trainIdx[queryIdx] = bestTrainIdx;\r
+            distance[queryIdx] = Dist::finalResult(minSum);\r
+        }\r
+    }\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // ReduceDescCalculator\r
+\r
+    template <int BLOCK_DIM_X, typename Dist, typename T>\r
+    class ReduceDescCalculatorSimple\r
+    {\r
+    public:\r
+        __device__ void prepare(const T* queryDescs_, int, float*)\r
+        {\r
+            queryDescs = queryDescs_;\r
+        }\r
+\r
+        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const\r
+        {\r
+            reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, desc_len, sdiff_row);\r
+        }\r
+\r
+    private:\r
+        const T* queryDescs;\r
+    };\r
+\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, \r
+        typename Dist, typename T>\r
+    class ReduceDescCalculatorSmem\r
+    {\r
+    public:\r
+        __device__ void prepare(const T* queryDescs, int desc_len, float* smem)\r
+        {\r
+            loadDescsVals<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, smem, queryVals);\r
+        }\r
+\r
+        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const\r
+        {\r
+            reduceDescDiff_smem<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist>(queryVals, trainDescs, \r
+                desc_len, sdiff_row);\r
+        }\r
+\r
+    private:\r
+        float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];\r
+    };\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // matchDescs loop\r
+\r
+    template <typename ReduceDescCalculator, typename T, typename Mask>\r
+    __device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_<T>& trainDescs_,  \r
+        const Mask& m, const ReduceDescCalculator& reduceDescCalc,\r
+        float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx)\r
+    {\r
+        const T* trainDescs = trainDescs_.ptr(threadIdx.y);\r
+        const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T);\r
+        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; \r
+             trainIdx += blockDim.y, trainDescs += trainDescsStep)\r
+        {\r
+            if (m(queryIdx, trainIdx))\r
+            {\r
+                reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row);\r
+\r
+                if (threadIdx.x == 0)\r
+                {\r
+                    float reg = sdiff_row[0];\r
+                    if (reg < myMin)\r
+                    {\r
+                        myMin = reg;\r
+                        myBestTrainIdx = trainIdx;\r
+                        myBestImgIdx = imgIdx;\r
+                    }\r
+                }\r
+            }\r
+        }\r
+    }\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Train collection loop strategy\r
+\r
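+    // SingleTrain matches against one train descriptor matrix (imgIdx is always 0);\r
+    // TrainCollection iterates over several train matrices, advancing the mask for each\r
+    // image, so a single kernel launch can search the whole train descriptor collection.\r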
+    template <typename T>\r
+    class SingleTrain\r
+    {\r
+    public:\r
+        explicit SingleTrain(const DevMem2D_<T>& trainDescs_) : trainDescs(trainDescs_)\r
+        {\r
+        }\r
+\r
+        template <typename ReduceDescCalculator, typename Mask>\r
+        __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, \r
+            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const\r
+        {\r
+            matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, \r
+                sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+        }\r
+\r
+        __device__ int desc_len() const\r
+        {\r
+            return trainDescs.cols;\r
+        }\r
+    private:\r
+        DevMem2D_<T> trainDescs;\r
+    };\r
+\r
+    template <typename T>\r
+    class TrainCollection\r
+    {\r
+    public:\r
+        TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) : \r
+            trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)\r
+        {\r
+        }\r
+\r
+        template <typename ReduceDescCalculator, typename Mask>\r
+        __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, \r
+            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const\r
+        {\r
+            for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)\r
+            {\r
+                DevMem2D_<T> trainDescs = trainCollection[imgIdx];\r
+                m.nextMask();\r
+                matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, \r
+                    sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+            }\r
+        }\r
+\r
+        __device__ int desc_len() const\r
+        {\r
+            return desclen;\r
+        }\r
+    private:\r
+        const DevMem2D_<T>* trainCollection;\r
+        int nImg;\r
+        int desclen;\r
+    };\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Match kernel\r
+\r
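+    // One block per query descriptor (gridDim.x == number of query rows). The shared\r
+    // reduction buffers are sized for up to 64 rows, matching the BLOCK_DIM_Y <= 64\r
+    // StaticAssert in the callers below.\r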
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, \r
+        typename Train, typename Mask>\r
+    __global__ void match(PtrStep_<T> queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance)\r
+    {\r
+        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+        __shared__ float smin[64];\r
+        __shared__ int strainIdx[64];\r
+        __shared__ int simgIdx[64];\r
+        \r
+        const int queryIdx = blockIdx.x;\r
+        \r
+        int myBestTrainIdx = -1;\r
+        int myBestImgIdx = -1;\r
+        float myMin = numeric_limits_gpu<float>::max();\r
+\r
+        {\r
+            float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+            Mask m = mask;\r
+            ReduceDescCalculator reduceDescCalc;\r
+            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff);\r
+        \r
+            train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+        }\r
+\r
+        findBestMatch<BLOCK_DIM_Y, Dist>(queryIdx, myMin, myBestTrainIdx, myBestImgIdx, \r
+            smin, strainIdx, simgIdx, trainIdx, imgIdx, distance);\r
+    }\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Match kernel callers\r
+\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, \r
+        typename Train, typename Mask>\r
+    void match_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
+        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+    {\r
+        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // the per-row minima must be reducible by a single warp\r

+\r
+        dim3 grid(queryDescs.rows, 1, 1);\r
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+\r
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, Dist<BLOCK_DIM_X>, T>, \r
+            Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
+            imgIdx.data, distance.data);\r
+\r
+        cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, \r
+        template <int> class Dist, typename T, typename Train, typename Mask>\r
+    void match_smem_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
+        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+    {\r
+        StaticAssert<BLOCK_DIM_Y <= 64>::check();                                // the per-row minima must be reducible by a single warp\r
+        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // the block must have at least MAX_DESCRIPTORS_LEN threads to load the whole descriptor\r
+        StaticAssert<MAX_DESCRIPTORS_LEN % BLOCK_DIM_X == 0>::check();           // the maximum descriptor length must be divisible by BLOCK_DIM_X\r
+\r
+        dim3 grid(queryDescs.rows, 1, 1);\r
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+\r
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSmem<BLOCK_DIM_X, BLOCK_DIM_Y, \r
+              MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist<BLOCK_DIM_X>, T>, \r
+              Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
+              imgIdx.data, distance.data);\r
+\r
+        cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Match kernel chooser\r
+\r
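+    // Dispatch on the descriptor length: for descriptors of up to 256 elements the query\r
+    // descriptor is cached (ReduceDescCalculatorSmem), with a specialization when the length\r
+    // exactly equals the template bound; longer descriptors fall back to the generic path.\r
+    // For example, a 64-dimensional descriptor (e.g. non-extended SURF) would take the\r
+    // match_smem_caller<16, 16, 64, true, Dist> branch.\r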
+    template <template <int> class Dist, typename T, typename Train, typename Mask>\r
+    void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train, \r
+        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+    {\r
+        if (queryDescs.cols < 64)\r
+            match_smem_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+        else if (queryDescs.cols == 64)\r
+            match_smem_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+        else if (queryDescs.cols < 128)\r
+            match_smem_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+        else if (queryDescs.cols == 128)\r
+            match_smem_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+        else if (queryDescs.cols < 256)\r
+            match_smem_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+        else if (queryDescs.cols == 256)\r
+            match_smem_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+        else\r
+            match_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+\r
+        cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+\r
+    template <typename T>\r
+    void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+    {\r
+        SingleTrain<T> train((DevMem2D_<T>)trainDescs);\r
+        if (mask.data)\r
+        {\r
+            SingleMask m(mask);\r
+            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);\r
+        }\r
+        else\r
+        {\r
+            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+        }\r
+    }\r
+\r
+    template void matchSingleL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+\r
+    template <typename T>\r
+    void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+    {\r
+        SingleTrain<T> train((DevMem2D_<T>)trainDescs);\r
+        if (mask.data)\r
+        {\r
+            SingleMask m(mask);\r
+            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);\r
+        }\r
+        else\r
+        {\r
+            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+        }\r
+    }\r
+\r
+    template void matchSingleL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchSingleL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+\r
+    template <typename T>\r
+    void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+    {\r
+        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);\r
+        if (maskCollection.data)\r
+        {\r
+            MaskCollection mask(maskCollection.data);\r
+            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+        }\r
+        else\r
+        {\r
+            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+        }\r
+    }\r
+\r
+    template void matchCollectionL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+\r
+    template <typename T>\r
+    void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+    {\r
+        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);\r
+        if (maskCollection.data)\r
+        {\r
+            MaskCollection mask(maskCollection.data);\r
+            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+        }\r
+        else\r
+        {\r
+            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+        }\r
+    }\r
+\r
+    template void matchCollectionL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    template void matchCollectionL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+    \r
+///////////////////////////////////////////////////////////////////////////////////\r
+//////////////////////////////////// Knn Match ////////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Calc distance kernel\r
+\r
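+    // First stage of the knn match: fill the dense query x train distance matrix.\r
+    // Each row of the block handles one train descriptor; masked-out pairs keep\r
+    // numeric_limits_gpu<float>::max() so they are never selected by the reduction.\r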
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
+    __global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)\r
+    {\r
+        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+\r
+        float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+        \r
+        const int queryIdx = blockIdx.x;\r
+        const T* queryDescs = queryDescs_.ptr(queryIdx);\r
+\r
+        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
+\r
+        if (trainIdx < trainDescs_.rows)\r
+        {\r
+            const T* trainDescs = trainDescs_.ptr(trainIdx);\r
+\r
+            float dist = numeric_limits_gpu<float>::max();\r
+\r
+            if (mask(queryIdx, trainIdx))\r
+            {\r
+                reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);\r
+\r
+                if (threadIdx.x == 0)\r
+                {\r
+                    dist = Dist::finalResult(sdiff_row[0]);\r
+                }\r
+            }\r
+            \r
+            if (threadIdx.x == 0)\r
+                distance.ptr(queryIdx)[trainIdx] = dist;\r
+        }\r
+    }\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Calc distance kernel caller\r
+\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>\r
+    void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, \r
+        const Mask& mask, const DevMem2Df& distance)\r
+    {\r
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+        dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);\r
+\r
+        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(\r
+            queryDescs, trainDescs, mask, distance);\r
+\r
+        cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+        \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // reduceMin\r
+\r
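+    // Final warp-level step of the min reduction: the shared arrays are volatile and no\r
+    // __syncthreads() is used inside the warp, so this relies on implicit warp-synchronous\r
+    // execution, the usual assumption for this pattern on the hardware this code targets.\r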
+    template <int BLOCK_SIZE> \r
+    __device__ void warpReduceMinIdx(volatile float* sdist, volatile int* strainIdx, float& myMin, int tid)\r
+    {\r
+        if (tid < 32)\r
+        {\r
+            if (BLOCK_SIZE >= 64) \r
+            { \r
+                float reg = sdist[tid + 32];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 32];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 32) \r
+            { \r
+                float reg = sdist[tid + 16];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 16];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 16) \r
+            { \r
+                float reg = sdist[tid + 8];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 8];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 8) \r
+            { \r
+                float reg = sdist[tid + 4];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 4];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 4) \r
+            { \r
+                float reg = sdist[tid + 2];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 2];\r
+                } \r
+            }\r
+            if (BLOCK_SIZE >= 2) \r
+            { \r
+                float reg = sdist[tid + 1];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 1];\r
+                }\r
+            }\r
+        }\r
+    }\r
+    \r
+    template <int BLOCK_SIZE> \r
+    __device__ void reduceMinIdx(const float* dist, int n, float* sdist, int* strainIdx)\r
+    {\r
+        const int tid = threadIdx.x;\r
+        \r
+        float myMin = numeric_limits_gpu<float>::max();\r
+        int myMinIdx = -1;\r
+\r
+        for (int i = tid; i < n; i += BLOCK_SIZE)\r
+        {\r
+            float reg = dist[i];\r
+            if (reg < myMin)\r
+            {\r
+                myMin = reg;\r
+                myMinIdx = i;\r
+            }\r
+        }\r
+\r
+        sdist[tid] = myMin;\r
+        strainIdx[tid] = myMinIdx;\r
+        __syncthreads();\r
+\r
+        if (BLOCK_SIZE >= 512 && tid < 256) \r
+        {\r
+            float reg = sdist[tid + 256];\r
+\r
+            if (reg < myMin)\r
+            {\r
+                sdist[tid] = myMin = reg;\r
+                strainIdx[tid] = strainIdx[tid + 256];\r
+            }\r
+            __syncthreads(); \r
+        }\r
+        if (BLOCK_SIZE >= 256 && tid < 128) \r
+        {\r
+            float reg = sdist[tid + 128];\r
+\r
+            if (reg < myMin)\r
+            {\r
+                sdist[tid] = myMin = reg;\r
+                strainIdx[tid] = strainIdx[tid + 128];\r
+            }\r
+            __syncthreads(); \r
+        }\r
+        if (BLOCK_SIZE >= 128 && tid < 64) \r
+        {\r
+            float reg = sdist[tid + 64];\r
+\r
+            if (reg < myMin)\r
+            {\r
+                sdist[tid] = myMin = reg;\r
+                strainIdx[tid] = strainIdx[tid + 64];\r
+            }\r
+            __syncthreads(); \r
+        }\r
+        \r
+        warpReduceMinIdx<BLOCK_SIZE>(sdist, strainIdx, myMin, tid);\r
+    }\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // find knn match kernel\r
+\r
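+    // Second stage of the knn match: one launch extracts the i-th nearest neighbour for every\r
+    // query by reducing its row of allDist, then overwrites the winning entry with the maximum\r
+    // float value so the next launch finds the next-best candidate. Queries with no remaining\r
+    // valid candidate leave trainIdx[i] and distance[i] untouched.\r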
+    template <int BLOCK_SIZE>\r
+    __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)\r
+    {\r
+        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;\r
+        __shared__ float sdist[SMEM_SIZE];\r
+        __shared__ int strainIdx[SMEM_SIZE];\r
+\r
+        const int queryIdx = blockIdx.x;\r
+\r
+        float* allDist = allDist_.ptr(queryIdx);\r
+        int* trainIdx = trainIdx_.ptr(queryIdx);\r
+        float* distance = distance_.ptr(queryIdx);\r
+\r
+        reduceMinIdx<BLOCK_SIZE>(allDist, allDist_.cols, sdist, strainIdx);\r
+\r
+        if (threadIdx.x == 0)\r
+        {\r
+            float dist = sdist[0];\r
+            if (dist < numeric_limits_gpu<float>::max())\r
+            {\r
+                int bestIdx = strainIdx[0];\r
+                allDist[bestIdx] = numeric_limits_gpu<float>::max();\r
+                trainIdx[i] = bestIdx;\r
+                distance[i] = dist;\r
+            }\r
+        }\r
+    }\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // find knn match kernel caller\r
+\r
+    template <int BLOCK_SIZE>\r
+    void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)\r
+    {\r
+        dim3 threads(BLOCK_SIZE, 1, 1);\r
+        dim3 grid(trainIdx.rows, 1, 1);\r
+\r
+        for (int i = 0; i < knn; ++i)\r
+            findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);\r
+        \r
+        cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // knn match caller\r
+\r
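+    // knn matching is done in two passes: calcDistance_caller fills the dense distance\r
+    // matrix (allDist), then findKnnMatch_caller peels off the knn best candidates per query.\r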
+    template <typename T>\r
+    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)\r
+    {\r
+        if (mask.data)\r
+        {\r
+            calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+                SingleMask(mask), allDist);\r
+        }\r
+        else\r
+        {\r
+            calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+                WithOutMask(), allDist);\r
+        }\r
+\r
+        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);\r
+    }\r
+\r
+    template void knnMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
+    template <typename T>\r
+    void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)\r
+    {\r
+        if (mask.data)\r
+        {\r
+            calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+                SingleMask(mask), allDist);\r
+        }\r
+        else\r
+        {\r
+            calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+                WithOutMask(), allDist);\r
+        }\r
+\r
+        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);\r
+    }\r
+\r
+    template void knnMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
+///////////////////////////////////////////////////////////////////////////////////\r
+/////////////////////////////////// Radius Match //////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Radius Match kernel\r
+\r
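+    // Each row of the block tests one train descriptor against the query. Matches closer than\r
+    // maxDistance reserve an output slot with atomicInc on the per-query counter; candidates\r
+    // beyond the preallocated trainIdx_.cols slots are still counted in nMatches but dropped.\r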
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
+    __global__ void radiusMatch(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, \r
+        float maxDistance, Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)\r
+    {\r
+        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+\r
+        float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+        \r
+        const int queryIdx = blockIdx.x;\r
+        const T* queryDescs = queryDescs_.ptr(queryIdx);\r
+\r
+        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
+        if (trainIdx < trainDescs_.rows)\r
+        {\r
+            const T* trainDescs = trainDescs_.ptr(trainIdx);\r
+\r
+            if (mask(queryIdx, trainIdx))\r
+            {\r
+                reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);\r
+\r
+                if (threadIdx.x == 0)\r
+                {\r
+                    float dist = Dist::finalResult(sdiff_row[0]);\r
+                    if (dist < maxDistance)\r
+                    {\r
+                        unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
+                        if (i < trainIdx_.cols)\r
+                        {\r
+                            distance.ptr(queryIdx)[i] = dist;\r
+                            trainIdx_.ptr(queryIdx)[i] = trainIdx;\r
+                        }\r
+                    }\r
+                }\r
+            }\r
+        }\r
+    }\r
+        \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Radius Match kernel caller\r
+\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>\r
+    void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, \r
+        float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, \r
+        const DevMem2Df& distance)\r
+    {\r
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+        dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);\r
+\r
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(\r
+            queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);\r
+\r
+        cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Radius Match kernel chooser\r
+\r
+    template <typename T>\r
+    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)\r
+    {\r
+        if (mask.data)\r
+        {\r
+            radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);\r
+        }\r
+        else\r
+        {\r
+            radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance);\r
+        }\r
+    }\r
+\r
+    template void radiusMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+\r
+    template <typename T>\r
+    void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)\r
+    {\r
+        if (mask.data)\r
+        {\r
+            radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);\r
+        }\r
+        else\r
+        {\r
+            radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, \r
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance);\r
+        }\r
+    }\r
+\r
+    template void radiusMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+}}}\r
diff --git a/tests/gpu/src/brute_force_matcher.cpp b/tests/gpu/src/brute_force_matcher.cpp
new file mode 100644 (file)
index 0000000..b4867e1
--- /dev/null
@@ -0,0 +1,175 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                        Intel License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000, Intel Corporation, all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of Intel Corporation may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include "gputest.hpp"\r
+\r
+using namespace cv;\r
+using namespace cv::gpu;\r
+using namespace std;\r
+\r
+class CV_GpuBruteForceMatcherTest : public CvTest \r
+{\r
+public:\r
+    CV_GpuBruteForceMatcherTest() : CvTest( "GPU-BruteForceMatcher", "BruteForceMatcher" ) {}\r
+\r
+protected:\r
+    void run(int) \r
+    {\r
+        try \r
+        {\r
+            BruteForceMatcher< L2<float> > matcherCPU;\r
+            BruteForceMatcher_GPU< L2<float> > matcherGPU;\r
+            \r
+            vector<DMatch> matchesCPU, matchesGPU;\r
+            vector< vector<DMatch> > knnMatchesCPU, knnMatchesGPU;\r
+            vector< vector<DMatch> > radiusMatchesCPU, radiusMatchesGPU;\r
+\r
+            RNG rng(*ts->get_rng());\r
+\r
+            const int desc_len = rng.uniform(40, 300);\r
+\r
+            Mat queryCPU(rng.uniform(100, 300), desc_len, CV_32F);            \r
+            rng.fill(queryCPU, cv::RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar::all(1.0));\r
+            GpuMat queryGPU(queryCPU);\r
+\r
+            const int nTrains = rng.uniform(1, 5);\r
+\r
+            vector<Mat> trainsCPU(nTrains);\r
+            vector<GpuMat> trainsGPU(nTrains);\r
+\r
+            vector<Mat> masksCPU(nTrains);\r
+            vector<GpuMat> masksGPU(nTrains);\r
+\r
+            for (int i = 0; i < nTrains; ++i)\r
+            {\r
+                Mat train(rng.uniform(100, 300), desc_len, CV_32F);\r
+                rng.fill(train, cv::RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar::all(1.0));\r
+\r
+                trainsCPU[i] = train;\r
+                trainsGPU[i].upload(train);\r
+\r
+                bool with_mask = rng.uniform(0, 10) < 5;\r
+                if (with_mask)\r
+                {\r
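+                    // Non-zero mask entries allow the corresponding query/train pair to be\r
+                    // matched; filling with values from [0, 200) masks out roughly 0.5% of pairs.\r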
+                    Mat mask(queryCPU.rows, train.rows, CV_8U, Scalar::all(1));\r
+                    rng.fill(mask, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(200));\r
+\r
+                    masksCPU[i] = mask;\r
+                    masksGPU[i].upload(mask);\r
+                }\r
+            }\r
+\r
+            matcherCPU.add(trainsCPU);\r
+            matcherGPU.add(trainsGPU);\r
+\r
+            matcherCPU.match(queryCPU, matchesCPU, masksCPU);\r
+            matcherGPU.match(queryGPU, matchesGPU, masksGPU);\r
+\r
+            if (!compareMatches(matchesCPU, matchesGPU))\r
+            {\r
+                ts->set_failed_test_info(CvTS::FAIL_MISMATCH);\r
+                return;\r
+            }\r
+\r
+            const int knn = rng.uniform(3, 10);\r
+\r
+            matcherCPU.knnMatch(queryCPU, knnMatchesCPU, knn, masksCPU);\r
+            matcherGPU.knnMatch(queryGPU, knnMatchesGPU, knn, masksGPU);\r
+\r
+            if (!compareMatches(knnMatchesCPU, knnMatchesGPU))\r
+            {\r
+                ts->set_failed_test_info(CvTS::FAIL_MISMATCH);\r
+                return;\r
+            }\r
+\r
+            const float maxDistance = rng.uniform(0.01f, 0.3f);\r
+            \r
+            matcherCPU.radiusMatch(queryCPU, radiusMatchesCPU, maxDistance, masksCPU);\r
+            matcherGPU.radiusMatch(queryGPU, radiusMatchesGPU, maxDistance, masksGPU);\r
+\r
+            if (!compareMatches(radiusMatchesCPU, radiusMatchesGPU))\r
+            {\r
+                ts->set_failed_test_info(CvTS::FAIL_MISMATCH);\r
+                return;\r
+            }\r
+        }\r
+        catch (const cv::Exception& e) \r
+        {\r
+            if (!check_and_treat_gpu_exception(e, ts))\r
+                throw;\r
+            return;\r
+        }\r
+\r
+        ts->set_failed_test_info(CvTS::OK);\r
+    }\r
+\r
+private:\r
+    static void convertMatches(const vector< vector<DMatch> >& knnMatches, vector<DMatch>& matches)\r
+    {\r
+        matches.clear();\r
+        for (size_t i = 0; i < knnMatches.size(); ++i)\r
+            copy(knnMatches[i].begin(), knnMatches[i].end(), back_inserter(matches));\r
+    }\r
+\r
+    // Matches are considered equal if they connect the same query, train and image indices.\r
+    // The predicate lives at class scope because local types can't be used as template\r
+    // arguments in C++03.\r
+    struct DMatchEqual : public binary_function<DMatch, DMatch, bool>\r
+    {\r
+        bool operator()(const DMatch& m1, const DMatch& m2) const\r
+        {\r
+            return m1.imgIdx == m2.imgIdx && m1.queryIdx == m2.queryIdx && m1.trainIdx == m2.trainIdx;\r
+        }\r
+    };\r
+\r
+    static bool compareMatches(const vector<DMatch>& matches1, const vector<DMatch>& matches2)\r
+    {\r
+        if (matches1.size() != matches2.size())\r
+            return false;\r
+\r
+        return equal(matches1.begin(), matches1.end(), matches2.begin(), DMatchEqual());\r
+    }\r
+\r
+    static bool compareMatches(const vector< vector<DMatch> >& matches1, const vector< vector<DMatch> >& matches2)\r
+    {\r
+        vector<DMatch> m1, m2;\r
+        convertMatches(matches1, m1);\r
+        convertMatches(matches2, m2);\r
+        return compareMatches(m1, m2);\r
+    }\r
+} brute_force_matcher_test;
\ No newline at end of file
index fea8ef9..cfb2b33 100644 (file)
@@ -50,7 +50,8 @@
 
 #include <opencv2/gpu/gpu.hpp>
 #include <opencv2/highgui/highgui.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/imgproc/imgproc.hpp>\r
+#include <opencv2/features2d/features2d.hpp>
 #include "cxts.h"
 
 /****************************************************************************************/