--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include "precomp.hpp"\r
+\r
+using namespace cv;\r
+using namespace cv::gpu;\r
+using namespace std;\r
+\r
+#if !defined (HAVE_CUDA)\r
+\r
+cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>&) { throw_nogpu(); }\r
+const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const { throw_nogpu(); return trainDescCollection; }\r
+void cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); }\r
+bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return true; }\r
+bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }\r
+\r
+#else /* !defined (HAVE_CUDA) */\r
+\r
+namespace cv { namespace gpu { namespace bfmatcher\r
+{\r
+ template <typename T>\r
+ void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template <typename T>\r
+ void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template <typename T>\r
+ void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+ const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+ const DevMem2Df& distance);\r
+ template <typename T>\r
+ void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+ const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+ const DevMem2Df& distance);\r
+\r
+ template <typename T>\r
+ void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+ template <typename T>\r
+ void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
+ template <typename T>\r
+ void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+ template <typename T>\r
+ void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+}}}\r
+\r
+cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)\r
+{\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////\r
+// Train collection\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection)\r
+{\r
+ trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());\r
+}\r
+\r
+const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const \r
+{\r
+ return trainDescCollection;\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::clear() \r
+{\r
+ trainDescCollection.clear();\r
+}\r
+\r
+bool cv::gpu::BruteForceMatcher_GPU_base::empty() const\r
+{\r
+ return trainDescCollection.empty();\r
+}\r
+\r
+bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const\r
+{\r
+ return true;\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////\r
+// Match\r
+\r
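+// The single-image matchers below dispatch through [distType][depth] tables of\r
+// kernel callers: row 0 holds the L1 kernels, row 1 the L2 kernels, and the\r
+// zero entries mark unsupported depths (rejected by the CV_Assert on the\r
+// looked-up function pointer).\r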
+void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+ GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask)\r
+{\r
+ using namespace cv::gpu::bfmatcher;\r
+\r
+ typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+\r
+ static const match_caller_t match_callers[2][8] = \r
+ {\r
+ {\r
+ matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<char>, matchSingleL1_gpu<unsigned short>, \r
+ matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0\r
+ },\r
+ {\r
+ matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<char>, matchSingleL2_gpu<unsigned short>, \r
+ matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0\r
+ }\r
+ };\r
+ \r
+ CV_Assert(queryDescs.channels() == 1);\r
+ CV_Assert(trainDescs.cols == queryDescs.cols && trainDescs.type() == queryDescs.type());\r
+ \r
+ const int nQuery = queryDescs.rows;\r
+\r
+ trainIdx.create(1, nQuery, CV_32S);\r
+ distance.create(1, nQuery, CV_32F);\r
+\r
+ match_caller_t func = match_callers[distType][queryDescs.depth()];\r
+ CV_Assert(func != 0);\r
+\r
+    // For a single train image there is no need to save imgIdx, so we simply write it into trainIdx.\r
+    // The kernel stores trainIdx after imgIdx, so the value is not lost.\r
+ func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, \r
+ vector<DMatch>& matches)\r
+{\r
+ const int nQuery = trainIdx.cols;\r
+\r
+ Mat trainIdxCPU = trainIdx;\r
+ Mat distanceCPU = distance;\r
+\r
+ matches.clear();\r
+ matches.reserve(nQuery);\r
+\r
+ const int* trainIdx_ptr = trainIdxCPU.ptr<int>();\r
+ const float* distance_ptr = distanceCPU.ptr<float>();\r
+ for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)\r
+ {\r
+ int trainIdx = *trainIdx_ptr;\r
+ if (trainIdx == -1)\r
+ continue;\r
+\r
+ float distance = *distance_ptr;\r
+\r
+ DMatch m(queryIdx, trainIdx, 0, distance);\r
+\r
+ matches.push_back(m);\r
+ }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+ vector<DMatch>& matches, const GpuMat& mask)\r
+{\r
+ GpuMat trainIdx, distance;\r
+ matchSingle(queryDescs, trainDescs, trainIdx, distance, mask);\r
+ matchDownload(trainIdx, distance, matches);\r
+}\r
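+\r
+// Illustrative usage (a sketch, not part of this file; assumes the DistType\r
+// enum exposes L2Dist and that the mask argument defaults to an empty GpuMat,\r
+// with query/train descriptors already uploaded to the device):\r
+//\r
+//     cv::gpu::BruteForceMatcher_GPU_base matcher(cv::gpu::BruteForceMatcher_GPU_base::L2Dist);\r
+//     std::vector<cv::DMatch> matches;\r
+//     matcher.match(queryDescs, trainDescs, matches);\r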
+\r
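+// Packs the device headers (DevMem2D / PtrStep) of every train image and\r
+// optional mask into 1 x N CPU matrices and uploads them, so a single kernel\r
+// launch can walk the whole collection.\r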
+void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, \r
+ const vector<GpuMat>& masks)\r
+{\r
+ if (masks.empty())\r
+ { \r
+ Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D)));\r
+\r
+ for (size_t i = 0; i < trainDescCollection.size(); ++i)\r
+ {\r
+ const GpuMat& trainDescs = trainDescCollection[i];\r
+\r
+ trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;\r
+ }\r
+\r
+ trainCollection.upload(trainCollectionCPU);\r
+ }\r
+ else\r
+ {\r
+ CV_Assert(masks.size() == trainDescCollection.size());\r
+ \r
+ Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D)));\r
+ Mat maskCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(PtrStep)));\r
+\r
+ for (size_t i = 0; i < trainDescCollection.size(); ++i)\r
+ {\r
+ const GpuMat& trainDescs = trainDescCollection[i];\r
+ const GpuMat& mask = masks[i];\r
+\r
+ CV_Assert(mask.empty() || (mask.type() == CV_8UC1));\r
+\r
+ trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;\r
+\r
+ maskCollectionCPU.ptr<PtrStep>(0)[i] = static_cast<PtrStep>(mask);\r
+ }\r
+\r
+ trainCollection.upload(trainCollectionCPU);\r
+ maskCollection.upload(maskCollectionCPU);\r
+ }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, \r
+ GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection)\r
+{\r
+ using namespace cv::gpu::bfmatcher;\r
+\r
+ typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+ const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, \r
+ const DevMem2Df& distance);\r
+\r
+ static const match_caller_t match_callers[2][8] = \r
+ {\r
+ {\r
+ matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<char>, \r
+ matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>, \r
+ matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0\r
+ },\r
+ {\r
+ matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<char>, \r
+ matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>, \r
+ matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0\r
+ }\r
+ };\r
+ \r
+ CV_Assert(queryDescs.channels() == 1);\r
+ \r
+ const int nQuery = queryDescs.rows;\r
+\r
+ trainIdx.create(1, nQuery, CV_32S);\r
+ imgIdx.create(1, nQuery, CV_32S);\r
+ distance.create(1, nQuery, CV_32F);\r
+\r
+ match_caller_t func = match_callers[distType][queryDescs.depth()];\r
+ CV_Assert(func != 0);\r
+\r
+ func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, \r
+ const GpuMat& distance, vector<DMatch>& matches)\r
+{\r
+ const int nQuery = trainIdx.cols;\r
+\r
+ Mat trainIdxCPU = trainIdx;\r
+ Mat imgIdxCPU = imgIdx;\r
+ Mat distanceCPU = distance;\r
+\r
+ matches.clear();\r
+ matches.reserve(nQuery);\r
+\r
+ const int* trainIdx_ptr = trainIdxCPU.ptr<int>();\r
+ const int* imgIdx_ptr = imgIdxCPU.ptr<int>();\r
+ const float* distance_ptr = distanceCPU.ptr<float>();\r
+ for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)\r
+ {\r
+ int trainIdx = *trainIdx_ptr;\r
+ if (trainIdx == -1)\r
+ continue;\r
+\r
+ int imgIdx = *imgIdx_ptr;\r
+\r
+ float distance = *distance_ptr;\r
+\r
+ DMatch m(queryIdx, trainIdx, imgIdx, distance);\r
+\r
+ matches.push_back(m);\r
+ }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches, \r
+ const vector<GpuMat>& masks)\r
+{ \r
+ GpuMat trainCollection;\r
+ GpuMat maskCollection;\r
+\r
+ makeGpuCollection(trainCollection, maskCollection, masks);\r
+\r
+ GpuMat trainIdx, imgIdx, distance;\r
+\r
+ matchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, maskCollection);\r
+ matchDownload(trainIdx, imgIdx, distance, matches);\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////\r
+// KnnMatch\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+ GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask)\r
+{\r
+ using namespace cv::gpu::bfmatcher;\r
+\r
+ typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
+ static const match_caller_t match_callers[2][8] = \r
+ {\r
+ {\r
+ knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<char>, knnMatchL1_gpu<unsigned short>, \r
+ knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0\r
+ },\r
+ {\r
+ knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<char>, knnMatchL2_gpu<unsigned short>, \r
+ knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0\r
+ }\r
+ };\r
+ \r
+ CV_Assert(queryDescs.channels() == 1);\r
+ \r
+ const int nQuery = queryDescs.rows;\r
+ const int nTrain = trainDescs.rows;\r
+\r
+ trainIdx.create(nQuery, k, CV_32S);\r
+ trainIdx.setTo(Scalar::all(-1));\r
+ distance.create(nQuery, k, CV_32F);\r
+\r
+ allDist.create(nQuery, nTrain, CV_32F);\r
+\r
+ match_caller_t func = match_callers[distType][queryDescs.depth()];\r
+ CV_Assert(func != 0);\r
+\r
+ func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,\r
+ vector< vector<DMatch> >& matches, bool compactResult)\r
+{\r
+ const int nQuery = distance.rows;\r
+ const int k = trainIdx.cols;\r
+\r
+ Mat trainIdxCPU = trainIdx;\r
+ Mat distanceCPU = distance;\r
+\r
+ matches.clear();\r
+ matches.reserve(nQuery);\r
+\r
+ for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)\r
+ {\r
+ matches.push_back(vector<DMatch>());\r
+ vector<DMatch>& curMatches = matches.back();\r
+ curMatches.reserve(k);\r
+\r
+ int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);\r
+ float* distance_ptr = distanceCPU.ptr<float>(queryIdx);\r
+ for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)\r
+ {\r
+ int trainIdx = *trainIdx_ptr;\r
+\r
+ if (trainIdx != -1)\r
+ {\r
+ float distance = *distance_ptr;\r
+\r
+ DMatch m(queryIdx, trainIdx, 0, distance);\r
+\r
+ curMatches.push_back(m);\r
+ }\r
+ }\r
+\r
+ if (compactResult && curMatches.empty())\r
+ matches.pop_back();\r
+ }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+ vector< vector<DMatch> >& matches, int k, const GpuMat& mask, bool compactResult)\r
+{\r
+ GpuMat trainIdx, distance, allDist;\r
+ knnMatch(queryDescs, trainDescs, trainIdx, distance, allDist, k, mask);\r
+ knnMatchDownload(trainIdx, distance, matches, compactResult);\r
+}\r
+\r
+namespace\r
+{\r
+ class ImgIdxSetter\r
+ {\r
+ public:\r
+ ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}\r
+ void operator()(DMatch& m) const {m.imgIdx = imgIdx;}\r
+ private:\r
+ int imgIdx;\r
+ };\r
+}\r
+\r
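+// Multi-image knnMatch: runs the single-image version against each train\r
+// image, tags the local matches with their imgIdx, merges the two sorted\r
+// lists per query, and keeps only the best knn entries.\r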
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, \r
+ vector< vector<DMatch> >& matches, int knn, const vector<GpuMat>& masks, bool compactResult)\r
+{\r
+ vector< vector<DMatch> > curMatches;\r
+ vector<DMatch> temp;\r
+ temp.reserve(2 * knn);\r
+\r
+ matches.resize(queryDescs.rows);\r
+ for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector<DMatch>::reserve), knn));\r
+\r
+ for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx)\r
+ {\r
+ knnMatch(queryDescs, trainDescCollection[imgIdx], curMatches, knn, \r
+ masks.empty() ? GpuMat() : masks[imgIdx]);\r
+\r
+ for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx)\r
+ {\r
+ vector<DMatch>& localMatch = curMatches[queryIdx];\r
+ vector<DMatch>& globalMatch = matches[queryIdx];\r
+\r
+ for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));\r
+\r
+ temp.clear();\r
+ merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));\r
+\r
+ globalMatch.clear();\r
+ const size_t count = std::min((size_t)knn, temp.size());\r
+ copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch));\r
+ }\r
+ }\r
+\r
+ if (compactResult)\r
+ {\r
+ vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), \r
+ mem_fun_ref(&vector<DMatch>::empty));\r
+ matches.erase(new_end, matches.end());\r
+ }\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////\r
+// RadiusMatch\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+ GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask)\r
+{\r
+ using namespace cv::gpu::bfmatcher;\r
+\r
+ typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+\r
+ static const radiusMatch_caller_t radiusMatch_callers[2][8] = \r
+ {\r
+ {\r
+ radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<char>, radiusMatchL1_gpu<unsigned short>, \r
+ radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0\r
+ },\r
+ {\r
+ radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<char>, radiusMatchL2_gpu<unsigned short>, \r
+ radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0\r
+ }\r
+ };\r
+ \r
+ const int nQuery = queryDescs.rows;\r
+ const int nTrain = trainDescs.rows;\r
+ \r
+ CV_Assert(queryDescs.channels() == 1);\r
+ CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);\r
+ CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery); \r
+ \r
+ nMatches.create(1, nQuery, CV_32SC1);\r
+ nMatches.setTo(Scalar::all(0));\r
+ if (trainIdx.empty())\r
+ {\r
+ trainIdx.create(nQuery, nTrain, CV_32SC1);\r
+ distance.create(nQuery, nTrain, CV_32FC1);\r
+ }\r
+\r
+ radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];\r
+ CV_Assert(func != 0);\r
+\r
+ func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr<unsigned int>(), distance);\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, \r
+ const GpuMat& distance, std::vector< std::vector<DMatch> >& matches, bool compactResult)\r
+{\r
+ const int nQuery = trainIdx.rows;\r
+\r
+ Mat trainIdxCPU = trainIdx;\r
+ Mat nMatchesCPU = nMatches;\r
+ Mat distanceCPU = distance;\r
+\r
+ matches.clear();\r
+ matches.reserve(nQuery);\r
+\r
+ const unsigned int* nMatches_ptr = nMatchesCPU.ptr<unsigned int>();\r
+ for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)\r
+ {\r
+ const int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);\r
+ const float* distance_ptr = distanceCPU.ptr<float>(queryIdx);\r
+\r
+ const int nMatches = std::min(static_cast<int>(nMatches_ptr[queryIdx]), trainIdx.cols);\r
+\r
+ if (nMatches == 0)\r
+ {\r
+ if (!compactResult)\r
+ matches.push_back(vector<DMatch>());\r
+ continue;\r
+ }\r
+\r
+ matches.push_back(vector<DMatch>());\r
+ vector<DMatch>& curMatches = matches.back();\r
+ curMatches.reserve(nMatches);\r
+\r
+ for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)\r
+ {\r
+ int trainIdx = *trainIdx_ptr;\r
+\r
+ float distance = *distance_ptr;\r
+ \r
+ DMatch m(queryIdx, trainIdx, 0, distance);\r
+\r
+ curMatches.push_back(m);\r
+ }\r
+ sort(curMatches.begin(), curMatches.end());\r
+ }\r
+}\r
+\r
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, \r
+ vector< vector<DMatch> >& matches, float maxDistance, const GpuMat& mask, bool compactResult)\r
+{\r
+ GpuMat trainIdx, nMatches, distance;\r
+ radiusMatch(queryDescs, trainDescs, trainIdx, nMatches, distance, maxDistance, mask);\r
+ radiusMatchDownload(trainIdx, nMatches, distance, matches, compactResult);\r
+}\r
+\r
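+// Multi-image radiusMatch: per-image results are tagged with imgIdx and\r
+// folded into each query's global list; inplace_merge keeps the list sorted\r
+// by distance, and nothing is truncated since every match within maxDistance\r
+// is kept.\r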
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector< vector<DMatch> >& matches, \r
+ float maxDistance, const vector<GpuMat>& masks, bool compactResult)\r
+{\r
+ matches.resize(queryDescs.rows);\r
+\r
+ vector< vector<DMatch> > curMatches;\r
+\r
+ for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx)\r
+ {\r
+ radiusMatch(queryDescs, trainDescCollection[imgIdx], curMatches, maxDistance, \r
+ masks.empty() ? GpuMat() : masks[imgIdx]);\r
+\r
+ for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx)\r
+ {\r
+ vector<DMatch>& localMatch = curMatches[queryIdx];\r
+ vector<DMatch>& globalMatch = matches[queryIdx];\r
+\r
+ for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));\r
+\r
+ const size_t oldSize = globalMatch.size();\r
+\r
+ copy(localMatch.begin(), localMatch.end(), back_inserter(globalMatch));\r
+ inplace_merge(globalMatch.begin(), globalMatch.begin() + oldSize, globalMatch.end());\r
+ }\r
+ }\r
+\r
+ if (compactResult)\r
+ {\r
+ vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), \r
+ mem_fun_ref(&vector<DMatch>::empty));\r
+ matches.erase(new_end, matches.end());\r
+ }\r
+}\r
+\r
+#endif /* !defined (HAVE_CUDA) */\r
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+// By downloading, copying, installing or using the software you agree to this license.\r
+// If you do not agree to this license, do not download, install,\r
+// copy or use the software.\r
+//\r
+//\r
+// License Agreement\r
+// For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+// * Redistribution's of source code must retain the above copyright notice,\r
+// this list of conditions and the following disclaimer.\r
+//\r
+// * Redistribution's in binary form must reproduce the above copyright notice,\r
+// this list of conditions and the following disclaimer in the documentation\r
+// and/or other materials provided with the distribution.\r
+//\r
+// * The name of the copyright holders may not be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include "cuda_shared.hpp"\r
+#include "limits_gpu.hpp"\r
+\r
+using namespace cv::gpu;\r
+using namespace cv::gpu::device;\r
+\r
+namespace cv { namespace gpu { namespace bfmatcher\r
+{\r
+///////////////////////////////////////////////////////////////////////////////////\r
+////////////////////////////////// General funcs //////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
+ \r
+ template <bool expr> struct StaticAssert;\r
+ template <> struct StaticAssert<true> {static __host__ __device__ void check(){}};\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Mask strategy\r
+\r
+ class SingleMask\r
+ {\r
+ public:\r
+ explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}\r
+ \r
+ __device__ bool operator()(int queryIdx, int trainIdx) const\r
+ { \r
+ return mask.ptr(queryIdx)[trainIdx] != 0;\r
+ }\r
+ private:\r
+ PtrStep mask;\r
+ };\r
+\r
+ class MaskCollection\r
+ {\r
+ public:\r
+ explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}\r
+\r
+ __device__ void nextMask()\r
+ {\r
+ curMask = *maskCollection++;\r
+ }\r
+ \r
+ __device__ bool operator()(int queryIdx, int trainIdx) const\r
+ { \r
+ return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0;\r
+ }\r
+ private:\r
+ PtrStep* maskCollection;\r
+ PtrStep curMask;\r
+ };\r
+\r
+ class WithOutMask\r
+ {\r
+ public:\r
+ __device__ void nextMask()\r
+ {\r
+ }\r
+ __device__ bool operator()(int queryIdx, int trainIdx) const\r
+ {\r
+ return true;\r
+ }\r
+ };\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Reduce Sum\r
+ \r
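+    // Shared-memory tree reduction of one partial sum per thread. The branches\r
+    // on BLOCK_DIM_X are resolved at compile time; the final 32 elements are\r
+    // folded without __syncthreads() through a volatile pointer, relying on\r
+    // warp-synchronous execution of the targeted hardware.\r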
+ template <int BLOCK_DIM_X>\r
+ __device__ void reduceSum(float* sdiff, float mySum, int tid)\r
+ {\r
+ sdiff[tid] = mySum;\r
+ __syncthreads();\r
+\r
+ if (BLOCK_DIM_X == 512) \r
+ {\r
+ if (tid < 256) \r
+ { \r
+ sdiff[tid] = mySum += sdiff[tid + 256]; __syncthreads(); \r
+ sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();\r
+ sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();\r
+ }\r
+ volatile float* smem = sdiff;\r
+ smem[tid] = mySum += smem[tid + 32]; \r
+ smem[tid] = mySum += smem[tid + 16]; \r
+ smem[tid] = mySum += smem[tid + 8]; \r
+ smem[tid] = mySum += smem[tid + 4]; \r
+ smem[tid] = mySum += smem[tid + 2];\r
+ smem[tid] = mySum += smem[tid + 1]; \r
+ }\r
+ if (BLOCK_DIM_X == 256)\r
+ {\r
+ if (tid < 128) \r
+ { \r
+ sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads(); \r
+ sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();\r
+ }\r
+ volatile float* smem = sdiff;\r
+ smem[tid] = mySum += smem[tid + 32]; \r
+ smem[tid] = mySum += smem[tid + 16]; \r
+ smem[tid] = mySum += smem[tid + 8]; \r
+ smem[tid] = mySum += smem[tid + 4]; \r
+ smem[tid] = mySum += smem[tid + 2];\r
+ smem[tid] = mySum += smem[tid + 1];\r
+ }\r
+ if (BLOCK_DIM_X == 128)\r
+ {\r
+ if (tid < 64) \r
+ { \r
+ sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads(); \r
+ }\r
+ volatile float* smem = sdiff;\r
+ smem[tid] = mySum += smem[tid + 32]; \r
+ smem[tid] = mySum += smem[tid + 16]; \r
+ smem[tid] = mySum += smem[tid + 8]; \r
+ smem[tid] = mySum += smem[tid + 4]; \r
+ smem[tid] = mySum += smem[tid + 2];\r
+ smem[tid] = mySum += smem[tid + 1];\r
+ }\r
+ \r
+ volatile float* smem = sdiff;\r
+ if (BLOCK_DIM_X == 64) \r
+ {\r
+ if (tid < 32) \r
+ {\r
+ smem[tid] = mySum += smem[tid + 32]; \r
+ smem[tid] = mySum += smem[tid + 16]; \r
+ smem[tid] = mySum += smem[tid + 8]; \r
+ smem[tid] = mySum += smem[tid + 4]; \r
+ smem[tid] = mySum += smem[tid + 2];\r
+ smem[tid] = mySum += smem[tid + 1]; \r
+ }\r
+ }\r
+ if (BLOCK_DIM_X == 32) \r
+ {\r
+ if (tid < 16) \r
+ {\r
+ smem[tid] = mySum += smem[tid + 16]; \r
+ smem[tid] = mySum += smem[tid + 8]; \r
+ smem[tid] = mySum += smem[tid + 4]; \r
+ smem[tid] = mySum += smem[tid + 2];\r
+ smem[tid] = mySum += smem[tid + 1]; \r
+ }\r
+ }\r
+ if (BLOCK_DIM_X == 16) \r
+ {\r
+ if (tid < 8) \r
+ {\r
+ smem[tid] = mySum += smem[tid + 8]; \r
+ smem[tid] = mySum += smem[tid + 4]; \r
+ smem[tid] = mySum += smem[tid + 2];\r
+ smem[tid] = mySum += smem[tid + 1]; \r
+ }\r
+ }\r
+ if (BLOCK_DIM_X == 8) \r
+ {\r
+ if (tid < 4) \r
+ {\r
+ smem[tid] = mySum += smem[tid + 4]; \r
+ smem[tid] = mySum += smem[tid + 2];\r
+ smem[tid] = mySum += smem[tid + 1]; \r
+ }\r
+ }\r
+ if (BLOCK_DIM_X == 4) \r
+ {\r
+ if (tid < 2) \r
+ {\r
+ smem[tid] = mySum += smem[tid + 2];\r
+ smem[tid] = mySum += smem[tid + 1]; \r
+ }\r
+ }\r
+ if (BLOCK_DIM_X == 2) \r
+ {\r
+ if (tid < 1) \r
+ {\r
+ smem[tid] = mySum += smem[tid + 1]; \r
+ }\r
+ }\r
+ }\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // loadDescsVals\r
+\r
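+    // Stages one query descriptor through shared memory, then copies the\r
+    // strided slice owned by this thread into the queryVals register array so\r
+    // the match loop never re-reads the query row from global memory.\r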
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, typename T> \r
+ __device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals)\r
+ {\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+ if (tid < desc_len)\r
+ {\r
+ smem[tid] = (float)descs[tid];\r
+ }\r
+ __syncthreads();\r
+\r
+ #pragma unroll\r
+ for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)\r
+ {\r
+ *queryVals = smem[i];\r
+ ++queryVals;\r
+ }\r
+ }\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Distance\r
+\r
+ template <int BLOCK_DIM_X>\r
+ class L1Dist\r
+ {\r
+ public:\r
+ __device__ L1Dist() : mySum(0) {}\r
+\r
+ __device__ void reduceIter(float val1, float val2)\r
+ {\r
+ mySum += fabs(val1 - val2);\r
+ }\r
+\r
+ __device__ void reduceAll(float* sdiff, int tid)\r
+ {\r
+ reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);\r
+ }\r
+\r
+ static __device__ float finalResult(float res)\r
+ {\r
+ return res;\r
+ }\r
+ private:\r
+ float mySum;\r
+ };\r
+\r
+ template <int BLOCK_DIM_X>\r
+ class L2Dist\r
+ {\r
+ public:\r
+ __device__ L2Dist() : mySum(0) {}\r
+\r
+ __device__ void reduceIter(float val1, float val2)\r
+ {\r
+ float reg = val1 - val2;\r
+ mySum += reg * reg;\r
+ }\r
+\r
+ __device__ void reduceAll(float* sdiff, int tid)\r
+ {\r
+ reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);\r
+ }\r
+\r
+ static __device__ float finalResult(float res)\r
+ {\r
+ return sqrtf(res);\r
+ }\r
+ private:\r
+ float mySum;\r
+ };\r
+ \r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // reduceDescDiff\r
+\r
+ template <int BLOCK_DIM_X, typename Dist, typename T> \r
+ __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff)\r
+ {\r
+ const int tid = threadIdx.x;\r
+\r
+ Dist dist;\r
+\r
+ for (int i = tid; i < desc_len; i += BLOCK_DIM_X)\r
+ dist.reduceIter(queryDescs[i], trainDescs[i]);\r
+\r
+ dist.reduceAll(sdiff, tid);\r
+ }\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // reduceDescDiff_smem\r
+\r
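+    // UnrollDescDiff unrolls the per-thread accumulation at compile time via\r
+    // template recursion: each level consumes one cached query value and one\r
+    // train element blockDim.x apart, ending at the <0> specialization.\r
+    // calcCheck bounds-checks the column index; calcWithoutCheck assumes the\r
+    // descriptor length equals MAX_DESCRIPTORS_LEN and skips the test.\r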
+ template <int N> struct UnrollDescDiff\r
+ {\r
+ template <typename Dist, typename T>\r
+ static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs, \r
+ int ind, int desc_len)\r
+ {\r
+ if (ind < desc_len)\r
+ dist.reduceIter(*queryVals, trainDescs[ind]);\r
+\r
+ ++queryVals;\r
+\r
+ UnrollDescDiff<N - 1>::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len);\r
+ }\r
+\r
+ template <typename Dist, typename T>\r
+ static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)\r
+ {\r
+ dist.reduceIter(*queryVals, *trainDescs);\r
+\r
+ ++queryVals;\r
+ trainDescs += blockDim.x;\r
+\r
+ UnrollDescDiff<N - 1>::calcWithoutCheck(dist, queryVals, trainDescs);\r
+ }\r
+ };\r
+ template <> struct UnrollDescDiff<0>\r
+ {\r
+ template <typename Dist, typename T>\r
+ static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs, \r
+ int ind, int desc_len)\r
+ {\r
+ }\r
+\r
+ template <typename Dist, typename T>\r
+ static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)\r
+ {\r
+ }\r
+ };\r
+\r
+ template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool WITH_OUT_CHECK> struct DescDiffCalculator;\r
+ template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> \r
+ struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>\r
+ {\r
+ template <typename Dist, typename T>\r
+ static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)\r
+ {\r
+ UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(dist, queryVals, trainDescs, \r
+ threadIdx.x, desc_len);\r
+ }\r
+ };\r
+ template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> \r
+ struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>\r
+ {\r
+ template <typename Dist, typename T>\r
+ static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)\r
+ {\r
+ UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(dist, queryVals, \r
+ trainDescs + threadIdx.x);\r
+ }\r
+ };\r
+\r
+ template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>\r
+ __device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff)\r
+ {\r
+ const int tid = threadIdx.x;\r
+ \r
+ Dist dist;\r
+\r
+ DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(dist, queryVals, \r
+ trainDescs, desc_len);\r
+ \r
+ dist.reduceAll(sdiff, tid);\r
+ }\r
+\r
+///////////////////////////////////////////////////////////////////////////////////\r
+////////////////////////////////////// Match //////////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // warpReduceMin\r
+\r
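+    // Warp-level argmin: whenever a smaller distance is found, the matching\r
+    // trainIdx/imgIdx are moved in lockstep, so element 0 ends up holding the\r
+    // minimum together with its indices. Volatile accesses stand in for\r
+    // __syncthreads() inside the warp.\r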
+ template <int BLOCK_DIM_Y> \r
+ __device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx)\r
+ {\r
+ float minSum = sdata[tid];\r
+\r
+ if (BLOCK_DIM_Y >= 64) \r
+ {\r
+ float reg = sdata[tid + 32];\r
+ if (reg < minSum)\r
+ {\r
+ sdata[tid] = minSum = reg;\r
+ strainIdx[tid] = strainIdx[tid + 32];\r
+ simgIdx[tid] = simgIdx[tid + 32];\r
+ }\r
+ }\r
+ if (BLOCK_DIM_Y >= 32) \r
+ {\r
+ float reg = sdata[tid + 16];\r
+ if (reg < minSum)\r
+ {\r
+ sdata[tid] = minSum = reg;\r
+ strainIdx[tid] = strainIdx[tid + 16];\r
+ simgIdx[tid] = simgIdx[tid + 16];\r
+ }\r
+ }\r
+ if (BLOCK_DIM_Y >= 16) \r
+ {\r
+ float reg = sdata[tid + 8];\r
+ if (reg < minSum)\r
+ {\r
+ sdata[tid] = minSum = reg;\r
+ strainIdx[tid] = strainIdx[tid + 8];\r
+ simgIdx[tid] = simgIdx[tid + 8];\r
+ }\r
+ }\r
+ if (BLOCK_DIM_Y >= 8) \r
+ { \r
+ float reg = sdata[tid + 4];\r
+ if (reg < minSum)\r
+ {\r
+ sdata[tid] = minSum = reg;\r
+ strainIdx[tid] = strainIdx[tid + 4];\r
+ simgIdx[tid] = simgIdx[tid + 4];\r
+ }\r
+ }\r
+ if (BLOCK_DIM_Y >= 4) \r
+ { \r
+ float reg = sdata[tid + 2];\r
+ if (reg < minSum)\r
+ {\r
+ sdata[tid] = minSum = reg;\r
+ strainIdx[tid] = strainIdx[tid + 2];\r
+ simgIdx[tid] = simgIdx[tid + 2];\r
+ }\r
+ }\r
+ if (BLOCK_DIM_Y >= 2) \r
+ {\r
+ float reg = sdata[tid + 1];\r
+ if (reg < minSum)\r
+ {\r
+ sdata[tid] = minSum = reg;\r
+ strainIdx[tid] = strainIdx[tid + 1];\r
+ simgIdx[tid] = simgIdx[tid + 1];\r
+ }\r
+ }\r
+ }\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // findBestMatch\r
+\r
+ template <int BLOCK_DIM_Y, typename Dist>\r
+ __device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx, \r
+ float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance)\r
+ {\r
+ if (threadIdx.x == 0)\r
+ {\r
+ smin[threadIdx.y] = myMin;\r
+ strainIdx[threadIdx.y] = myBestTrainIdx;\r
+ simgIdx[threadIdx.y] = myBestImgIdx;\r
+ }\r
+ __syncthreads();\r
+\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+ if (tid < 32)\r
+ warpReduceMin<BLOCK_DIM_Y>(tid, smin, strainIdx, simgIdx);\r
+\r
+ if (threadIdx.x == 0 && threadIdx.y == 0)\r
+ {\r
+ float minSum = smin[0];\r
+ int bestTrainIdx = strainIdx[0];\r
+ int bestImgIdx = simgIdx[0];\r
+\r
+ imgIdx[queryIdx] = bestImgIdx;\r
+ trainIdx[queryIdx] = bestTrainIdx;\r
+ distance[queryIdx] = Dist::finalResult(minSum);\r
+ }\r
+ }\r
+ \r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // ReduceDescCalculator\r
+\r
+ template <int BLOCK_DIM_X, typename Dist, typename T>\r
+ class ReduceDescCalculatorSimple\r
+ {\r
+ public:\r
+ __device__ void prepare(const T* queryDescs_, int, float*)\r
+ {\r
+ queryDescs = queryDescs_;\r
+ }\r
+\r
+ __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const\r
+ {\r
+ reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, desc_len, sdiff_row);\r
+ }\r
+\r
+ private:\r
+ const T* queryDescs;\r
+ };\r
+\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, \r
+ typename Dist, typename T>\r
+ class ReduceDescCalculatorSmem\r
+ {\r
+ public:\r
+ __device__ void prepare(const T* queryDescs, int desc_len, float* smem)\r
+ {\r
+ loadDescsVals<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, smem, queryVals);\r
+ }\r
+\r
+ __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const\r
+ {\r
+ reduceDescDiff_smem<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist>(queryVals, trainDescs, \r
+ desc_len, sdiff_row);\r
+ }\r
+\r
+ private:\r
+ float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];\r
+ };\r
+ \r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // matchDescs loop\r
+\r
+ template <typename ReduceDescCalculator, typename T, typename Mask>\r
+ __device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_<T>& trainDescs_, \r
+ const Mask& m, const ReduceDescCalculator& reduceDescCalc,\r
+ float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx)\r
+ {\r
+ const T* trainDescs = trainDescs_.ptr(threadIdx.y);\r
+ const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T);\r
+ for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; \r
+ trainIdx += blockDim.y, trainDescs += trainDescsStep)\r
+ {\r
+ if (m(queryIdx, trainIdx))\r
+ {\r
+ reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row);\r
+\r
+ if (threadIdx.x == 0)\r
+ {\r
+ float reg = sdiff_row[0];\r
+ if (reg < myMin)\r
+ {\r
+ myMin = reg;\r
+ myBestTrainIdx = trainIdx;\r
+ myBestImgIdx = imgIdx;\r
+ }\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Train collection loop strategy\r
+\r
+ template <typename T>\r
+ class SingleTrain\r
+ {\r
+ public:\r
+ explicit SingleTrain(const DevMem2D_<T>& trainDescs_) : trainDescs(trainDescs_)\r
+ {\r
+ }\r
+\r
+ template <typename ReduceDescCalculator, typename Mask>\r
+ __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, \r
+ float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const\r
+ {\r
+ matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, \r
+ sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+ }\r
+\r
+ __device__ int desc_len() const\r
+ {\r
+ return trainDescs.cols;\r
+ }\r
+ private:\r
+ DevMem2D_<T> trainDescs;\r
+ };\r
+\r
+ template <typename T>\r
+ class TrainCollection\r
+ {\r
+ public:\r
+ TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) : \r
+ trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)\r
+ {\r
+ }\r
+\r
+ template <typename ReduceDescCalculator, typename Mask>\r
+ __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, \r
+ float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const\r
+ {\r
+ for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)\r
+ {\r
+ DevMem2D_<T> trainDescs = trainCollection[imgIdx];\r
+ m.nextMask();\r
+ matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, \r
+ sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+ }\r
+ }\r
+\r
+ __device__ int desc_len() const\r
+ {\r
+ return desclen;\r
+ }\r
+ private:\r
+ const DevMem2D_<T>* trainCollection;\r
+ int nImg;\r
+ int desclen;\r
+ };\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match kernel\r
+\r
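+    // One block per query descriptor: threads along y stride over the train\r
+    // descriptors while threads along x reduce over descriptor elements;\r
+    // findBestMatch then collapses the per-row minima into the single best\r
+    // (distance, trainIdx, imgIdx) triple for this query.\r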
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, \r
+ typename Train, typename Mask>\r
+ __global__ void match(PtrStep_<T> queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance)\r
+ {\r
+ __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+ __shared__ float smin[64];\r
+ __shared__ int strainIdx[64];\r
+ __shared__ int simgIdx[64];\r
+ \r
+ const int queryIdx = blockIdx.x;\r
+ \r
+ int myBestTrainIdx = -1;\r
+ int myBestImgIdx = -1;\r
+ float myMin = numeric_limits_gpu<float>::max();\r
+\r
+ {\r
+ float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+ Mask m = mask;\r
+ ReduceDescCalculator reduceDescCalc;\r
+ reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff);\r
+ \r
+ train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+ }\r
+\r
+ findBestMatch<BLOCK_DIM_Y, Dist>(queryIdx, myMin, myBestTrainIdx, myBestImgIdx, \r
+ smin, strainIdx, simgIdx, trainIdx, imgIdx, distance);\r
+ }\r
+ \r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match kernel callers\r
+\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, \r
+ typename Train, typename Mask>\r
+ void match_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
+ const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ {\r
+        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible by a single warp\r
+\r
+ dim3 grid(queryDescs.rows, 1, 1);\r
+ dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+\r
+ match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, Dist<BLOCK_DIM_X>, T>, \r
+ Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
+ imgIdx.data, distance.data);\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+ }\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, \r
+ template <int> class Dist, typename T, typename Train, typename Mask>\r
+ void match_smem_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
+ const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ {\r
+        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible by a single warp\r
+        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // the block size must be at least the descriptor length\r
+        StaticAssert<MAX_DESCRIPTORS_LEN % BLOCK_DIM_X == 0>::check(); // the maximum descriptor length must be divisible by BLOCK_DIM_X\r
+\r
+ dim3 grid(queryDescs.rows, 1, 1);\r
+ dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+\r
+ match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSmem<BLOCK_DIM_X, BLOCK_DIM_Y, \r
+ MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist<BLOCK_DIM_X>, T>, \r
+ Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
+ imgIdx.data, distance.data);\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+ }\r
+ \r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Match kernel chooser\r
+\r
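+    // Chooses a kernel from the descriptor length: lengths up to 256 take the\r
+    // shared-memory path (the exact-length specialization when possible),\r
+    // while longer descriptors fall back to the uncached match_caller kernel.\r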
+ template <template <int> class Dist, typename T, typename Train, typename Mask>\r
+ void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train, \r
+ const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ {\r
+ if (queryDescs.cols < 64)\r
+ match_smem_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ else if (queryDescs.cols == 64)\r
+ match_smem_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ else if (queryDescs.cols < 128)\r
+ match_smem_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ else if (queryDescs.cols == 128)\r
+ match_smem_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ else if (queryDescs.cols < 256)\r
+ match_smem_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ else if (queryDescs.cols == 256)\r
+ match_smem_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ else\r
+ match_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+ }\r
+\r
+ template <typename T>\r
+ void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ {\r
+ SingleTrain<T> train((DevMem2D_<T>)trainDescs);\r
+ if (mask.data)\r
+ {\r
+ SingleMask m(mask);\r
+ match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);\r
+ }\r
+ else\r
+ {\r
+ match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+ }\r
+ }\r
+\r
+ template void matchSingleL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+\r
+ template <typename T>\r
+ void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, \r
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ {\r
+ SingleTrain<T> train((DevMem2D_<T>)trainDescs);\r
+ if (mask.data)\r
+ {\r
+ SingleMask m(mask);\r
+ match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);\r
+ }\r
+ else\r
+ {\r
+ match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+ }\r
+ }\r
+\r
+ template void matchSingleL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+\r
+ template <typename T>\r
+ void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+ const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ {\r
+ TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);\r
+ if (maskCollection.data)\r
+ {\r
+ MaskCollection mask(maskCollection.data);\r
+ match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ }\r
+ else\r
+ {\r
+ match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+ }\r
+ }\r
+\r
+ template void matchCollectionL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+\r
+ template <typename T>\r
+ void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, \r
+ const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
+ {\r
+ TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);\r
+ if (maskCollection.data)\r
+ {\r
+ MaskCollection mask(maskCollection.data);\r
+ match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+ }\r
+ else\r
+ {\r
+ match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);\r
+ }\r
+ }\r
+\r
+ template void matchCollectionL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);\r
+ \r
+///////////////////////////////////////////////////////////////////////////////////\r
+//////////////////////////////////// Knn Match ////////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
+ \r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Calc distance kernel\r
+\r
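+    // knnMatch first materializes the full nQuery x nTrain distance matrix;\r
+    // masked-out pairs keep numeric_limits_gpu<float>::max() so the selection\r
+    // passes below never pick them.\r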
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
+ __global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)\r
+ {\r
+ __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+\r
+ float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+ \r
+ const int queryIdx = blockIdx.x;\r
+ const T* queryDescs = queryDescs_.ptr(queryIdx);\r
+\r
+ const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
+\r
+ if (trainIdx < trainDescs_.rows)\r
+ {\r
+ const T* trainDescs = trainDescs_.ptr(trainIdx);\r
+\r
+ float dist = numeric_limits_gpu<float>::max();\r
+\r
+ if (mask(queryIdx, trainIdx))\r
+ {\r
+ reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);\r
+\r
+ if (threadIdx.x == 0)\r
+ {\r
+ dist = Dist::finalResult(sdiff_row[0]);\r
+ }\r
+ }\r
+ \r
+ if (threadIdx.x == 0)\r
+ distance.ptr(queryIdx)[trainIdx] = dist;\r
+ }\r
+ }\r
+\r
+ ///////////////////////////////////////////////////////////////////////////////\r
+ // Calc distance kernel caller\r
+\r
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>\r
+ void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, \r
+ const Mask& mask, const DevMem2Df& distance)\r
+ {\r
+ dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+ dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);\r
+\r
+ calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(\r
+ queryDescs, trainDescs, mask, distance);\r
+\r
+ cudaSafeCall( cudaThreadSynchronize() );\r
+ }\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // reduceMin\r
+\r
+    template <int BLOCK_SIZE>\r
+    __device__ void warpReduceMinIdx(volatile float* sdist, volatile int* strainIdx, float& myMin, int tid)\r
+    {\r
+        if (tid < 32)\r
+        {\r
+            if (BLOCK_SIZE >= 64)\r
+            {\r
+                float reg = sdist[tid + 32];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 32];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 32)\r
+            {\r
+                float reg = sdist[tid + 16];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 16];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 16)\r
+            {\r
+                float reg = sdist[tid + 8];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 8];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 8)\r
+            {\r
+                float reg = sdist[tid + 4];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 4];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 4)\r
+            {\r
+                float reg = sdist[tid + 2];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 2];\r
+                }\r
+            }\r
+            if (BLOCK_SIZE >= 2)\r
+            {\r
+                float reg = sdist[tid + 1];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 1];\r
+                }\r
+            }\r
+        }\r
+    }\r
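+\r
+    // The six steps above run without __syncthreads(): only the first 32\r
+    // threads (one warp) are active, and a warp executes in lockstep on the\r
+    // hardware this code targets. The volatile qualifiers on sdist and\r
+    // strainIdx keep the compiler from caching the shared-memory values in\r
+    // registers between steps, which would break that implicit ordering.\r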
+\r
+    template <int BLOCK_SIZE>\r
+    __device__ void reduceMinIdx(const float* dist, int n, float* sdist, int* strainIdx)\r
+    {\r
+        const int tid = threadIdx.x;\r
+\r
+        float myMin = numeric_limits_gpu<float>::max();\r
+        int myMinIdx = -1;\r
+\r
+        for (int i = tid; i < n; i += BLOCK_SIZE)\r
+        {\r
+            float reg = dist[i];\r
+            if (reg < myMin)\r
+            {\r
+                myMin = reg;\r
+                myMinIdx = i;\r
+            }\r
+        }\r
+\r
+        sdist[tid] = myMin;\r
+        strainIdx[tid] = myMinIdx;\r
+        __syncthreads();\r
+\r
+        // each barrier has to be reached by every thread in the block, so it\r
+        // sits outside the tid guard at every step\r
+        if (BLOCK_SIZE >= 512)\r
+        {\r
+            if (tid < 256)\r
+            {\r
+                float reg = sdist[tid + 256];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 256];\r
+                }\r
+            }\r
+            __syncthreads();\r
+        }\r
+        if (BLOCK_SIZE >= 256)\r
+        {\r
+            if (tid < 128)\r
+            {\r
+                float reg = sdist[tid + 128];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 128];\r
+                }\r
+            }\r
+            __syncthreads();\r
+        }\r
+        if (BLOCK_SIZE >= 128)\r
+        {\r
+            if (tid < 64)\r
+            {\r
+                float reg = sdist[tid + 64];\r
+\r
+                if (reg < myMin)\r
+                {\r
+                    sdist[tid] = myMin = reg;\r
+                    strainIdx[tid] = strainIdx[tid + 64];\r
+                }\r
+            }\r
+            __syncthreads();\r
+        }\r
+\r
+        warpReduceMinIdx<BLOCK_SIZE>(sdist, strainIdx, myMin, tid);\r
+    }\r
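+\r
+    // Access pattern sketch for BLOCK_SIZE = 256 and n = 1000: thread 3 scans\r
+    // dist[3], dist[259], dist[515] and dist[771], keeping the smallest value\r
+    // and its index; the 256 per-thread minima are then folded 256 -> 128 -> 64\r
+    // by the synchronized steps and 64 -> 1 by warpReduceMinIdx, leaving the\r
+    // row minimum and its train index in sdist[0] / strainIdx[0].\r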
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // find knn match kernel\r
+\r
+    template <int BLOCK_SIZE>\r
+    __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)\r
+    {\r
+        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;\r
+        __shared__ float sdist[SMEM_SIZE];\r
+        __shared__ int strainIdx[SMEM_SIZE];\r
+\r
+        const int queryIdx = blockIdx.x;\r
+\r
+        float* allDist = allDist_.ptr(queryIdx);\r
+        int* trainIdx = trainIdx_.ptr(queryIdx);\r
+        float* distance = distance_.ptr(queryIdx);\r
+\r
+        reduceMinIdx<BLOCK_SIZE>(allDist, allDist_.cols, sdist, strainIdx);\r
+\r
+        if (threadIdx.x == 0)\r
+        {\r
+            float dist = sdist[0];\r
+            if (dist < numeric_limits_gpu<float>::max())\r
+            {\r
+                int bestIdx = strainIdx[0];\r
+                allDist[bestIdx] = numeric_limits_gpu<float>::max();\r
+                trainIdx[i] = bestIdx;\r
+                distance[i] = dist;\r
+            }\r
+        }\r
+    }\r
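+\r
+    // Selection by repeated extraction: pass i locates the row minimum, records\r
+    // it at trainIdx[i] / distance[i], and overwrites that slot of allDist with\r
+    // numeric_limits_gpu<float>::max() so the next pass finds the next-best\r
+    // match. Masked-out pairs already hold the sentinel and are never reported.\r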
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // find knn match kernel caller\r
+\r
+    template <int BLOCK_SIZE>\r
+    void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)\r
+    {\r
+        dim3 threads(BLOCK_SIZE, 1, 1);\r
+        dim3 grid(trainIdx.rows, 1, 1);\r
+\r
+        for (int i = 0; i < knn; ++i)\r
+            findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);\r
+\r
+        cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
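+\r
+    // One block per query and knn sequential launches: each findBestMatch call\r
+    // extracts one neighbour for every query in parallel, so the cost grows\r
+    // linearly with knn instead of requiring a per-query sort of allDist.\r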
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // knn match caller\r
+\r
+    template <typename T>\r
+    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)\r
+    {\r
+        if (mask.data)\r
+        {\r
+            calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,\r
+                SingleMask(mask), allDist);\r
+        }\r
+        else\r
+        {\r
+            calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,\r
+                WithOutMask(), allDist);\r
+        }\r
+\r
+        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);\r
+    }\r
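+\r
+    // A minimal host-side sketch of the buffer contract (the GpuMat names here\r
+    // are hypothetical, not part of this file): for nQuery query rows, nTrain\r
+    // train rows and k neighbours the caller would allocate\r
+    //\r
+    //     GpuMat trainIdx(nQuery, k, CV_32S);      // best train indices per query\r
+    //     GpuMat distance(nQuery, k, CV_32F);      // corresponding distances\r
+    //     GpuMat allDist(nQuery, nTrain, CV_32F);  // scratch distance matrix\r
+    //\r
+    //     knnMatchL1_gpu<float>(query, train, k, GpuMat(), trainIdx, distance, allDist);\r
+    //\r
+    // allDist is consumed (overwritten with sentinels) by findKnnMatch_caller,\r
+    // so it is scratch space only.\r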
+\r
+    template void knnMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
+    template <typename T>\r
+    void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)\r
+    {\r
+        if (mask.data)\r
+        {\r
+            calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,\r
+                SingleMask(mask), allDist);\r
+        }\r
+        else\r
+        {\r
+            calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,\r
+                WithOutMask(), allDist);\r
+        }\r
+\r
+        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);\r
+    }\r
+\r
+    template void knnMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+    template void knnMatchL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);\r
+\r
+///////////////////////////////////////////////////////////////////////////////////\r
+/////////////////////////////////// Radius Match //////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
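+\r
+    // Radius match is a single pass: every (query, train) pair whose distance\r
+    // falls below maxDistance claims an output slot in the query's row of\r
+    // trainIdx/distance by bumping a per-query counter in nMatches with\r
+    // atomicInc, so matches are stored in no particular order.\r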
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Radius Match kernel\r
+\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
+    __global__ void radiusMatch(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_,\r
+        float maxDistance, Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)\r
+    {\r
+        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+\r
+        float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+\r
+        const int queryIdx = blockIdx.x;\r
+        const T* queryDescs = queryDescs_.ptr(queryIdx);\r
+\r
+        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
+        if (trainIdx < trainDescs_.rows)\r
+        {\r
+            const T* trainDescs = trainDescs_.ptr(trainIdx);\r
+\r
+            if (mask(queryIdx, trainIdx))\r
+            {\r
+                reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);\r
+\r
+                if (threadIdx.x == 0)\r
+                {\r
+                    float dist = Dist::finalResult(sdiff_row[0]);\r
+                    if (dist < maxDistance)\r
+                    {\r
+                        unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
+                        if (i < trainIdx_.cols)\r
+                        {\r
+                            distance.ptr(queryIdx)[i] = dist;\r
+                            trainIdx_.ptr(queryIdx)[i] = trainIdx;\r
+                        }\r
+                    }\r
+                }\r
+            }\r
+        }\r
+    }\r
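+\r
+    // nMatches[queryIdx] keeps counting past the end of the output row (the\r
+    // slot is claimed before the bounds check), so after download the host has\r
+    // to clamp it before reading results, along the lines of this sketch\r
+    // (nMatchesHost is a hypothetical name):\r
+    //\r
+    //     int valid = std::min((int)nMatchesHost[queryIdx], trainIdx.cols);\r
+    //\r
+    // Only the first valid entries of the row are meaningful.\r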
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Radius Match kernel caller\r
+\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>\r
+    void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,\r
+        float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,\r
+        const DevMem2Df& distance)\r
+    {\r
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
+        dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);\r
+\r
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(\r
+            queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);\r
+\r
+        cudaSafeCall( cudaThreadSynchronize() );\r
+    }\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Radius Match kernel chooser\r
+\r
+    template <typename T>\r
+    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)\r
+    {\r
+        if (mask.data)\r
+        {\r
+            radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,\r
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);\r
+        }\r
+        else\r
+        {\r
+            radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,\r
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance);\r
+        }\r
+    }\r
+\r
+    template void radiusMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL1_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+\r
+    template <typename T>\r
+    void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,\r
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)\r
+    {\r
+        if (mask.data)\r
+        {\r
+            radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,\r
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);\r
+        }\r
+        else\r
+        {\r
+            radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,\r
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance);\r
+        }\r
+    }\r
+\r
+    template void radiusMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<char          >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<short         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<int           >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+    template void radiusMatchL2_gpu<float         >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
+}}}\r