From 8891acb67a12c50d42750b65ff32b1aa0cb26d4e Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 6 Dec 2010 12:06:51 +0000 Subject: [PATCH] added BruteForceMatcher_GPU --- modules/gpu/CMakeLists.txt | 2 +- modules/gpu/include/opencv2/gpu/gpu.hpp | 148 +++- modules/gpu/src/brute_force_matcher.cpp | 605 ++++++++++++++ modules/gpu/src/cuda/brute_force_matcher.cu | 1205 +++++++++++++++++++++++++++ tests/gpu/src/brute_force_matcher.cpp | 175 ++++ tests/gpu/src/gputest.hpp | 3 +- 6 files changed, 2135 insertions(+), 3 deletions(-) create mode 100644 modules/gpu/src/brute_force_matcher.cpp create mode 100644 modules/gpu/src/cuda/brute_force_matcher.cu create mode 100644 tests/gpu/src/brute_force_matcher.cpp diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 8181c3d..f41817b 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -1,6 +1,6 @@ set(name "gpu") -set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect") +set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann") set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index a8d3594..53a1a74 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -48,6 +48,7 @@ #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/objdetect/objdetect.hpp" #include "opencv2/gpu/devmem2d.hpp" +#include "opencv2/features2d/features2d.hpp" namespace cv { @@ -1118,7 +1119,152 @@ namespace cv // Gradients conputation results GpuMat grad, qangle; - }; + }; + + + ////////////////////////////////// BruteForceMatcher ////////////////////////////////// + + class CV_EXPORTS BruteForceMatcher_GPU_base + { + public: + enum DistType {L1Dist = 0, L2Dist}; + + explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist); + + // Add descriptors to train descriptor collection. + void add(const std::vector& descCollection); + + // Get train descriptors collection. + const std::vector& getTrainDescriptors() const; + + // Clear train descriptors collection. + void clear(); + + // Return true if there are not train descriptors in collection. + bool empty() const; + + // Return true if the matcher supports mask in match methods. + bool isMaskSupported() const; + + // Find one best match for each query descriptor. + // trainIdx.at(0, queryIdx) will contain best train index for queryIdx + // distance.at(0, queryIdx) will contain distance + void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, + const GpuMat& mask = GpuMat()); + + // Download trainIdx and distance to CPU vector with DMatch + static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector& matches); + + // Find one best match for each query descriptor. + void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector& matches, + const GpuMat& mask = GpuMat()); + + // Make gpu collection of trains and masks in suitable format for matchCollection function + void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, + const vector& masks = std::vector()); + + // Find one best match from train collection for each query descriptor. 
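        // A minimal usage sketch of the single-train path declared above (illustrative only, not
        // part of this patch). It assumes the query/train descriptors are CV_32F rows that have
        // already been uploaded to the GPU, and uses the cv::L2<float> functor from features2d:
        //
        //     cv::gpu::GpuMat queryDescsGPU(queryDescsCPU);   // nQuery x descSize, CV_32F
        //     cv::gpu::GpuMat trainDescsGPU(trainDescsCPU);   // nTrain x descSize, CV_32F
        //
        //     cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
        //     std::vector<cv::DMatch> matches;
        //     matcher.match(queryDescsGPU, trainDescsGPU, matches);  // one best match per query row
        //
        // The two-step form (matchSingle followed by matchDownload) gives the same result while
        // keeping trainIdx and distance on the GPU between the calls.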
+ // trainIdx.at(0, queryIdx) will contain best train index for queryIdx + // imgIdx.at(0, queryIdx) will contain best image index for queryIdx + // distance.at(0, queryIdx) will contain distance + void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, + const GpuMat& maskCollection); + + // Download trainIdx, imgIdx and distance to CPU vector with DMatch + static void matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, const GpuMat& distance, + std::vector& matches); + + // Find one best match from train collection for each query descriptor. + void match(const GpuMat& queryDescs, std::vector& matches, + const std::vector& masks = std::vector()); + + // Find k best matches for each query descriptor (in increasing order of distances). + // trainIdx.at(queryIdx, i) will contain index of i'th best trains (i < k). + // distance.at(queryIdx, i) will contain distance. + // allDist is a buffer to store all distance between query descriptors and train descriptors + // it have size (nQuery,nTrain) and CV_32F type + // allDist.at(queryIdx, trainIdx) will contain FLT_MAX, if trainIdx is one from k best, + // otherwise it will contain distance between queryIdx and trainIdx descriptors + void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat()); + + // Download trainIdx and distance to CPU vector with DMatch + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + + // Find k best matches for each query descriptor (in increasing order of distances). + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + std::vector< std::vector >& matches, int k, const GpuMat& mask = GpuMat(), + bool compactResult = false); + + // Find k best matches for each query descriptor (in increasing order of distances). + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + void knnMatch(const GpuMat& queryDescs, std::vector< std::vector >& matches, int knn, + const std::vector& masks = std::vector(), bool compactResult = false ); + + // Find best matches for each query descriptor which have distance less than maxDistance. + // nMatches.at(0, queruIdx) will contain matches count for queryIdx. + // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches, + // because it didn't have enough memory. 
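        // Illustrative k-NN usage (a sketch, not part of this patch): request the two nearest
        // neighbours per query descriptor and keep only matches that pass a distance-ratio test.
        // queryDescsGPU and trainDescsGPU are the same hypothetical inputs as in the sketch above.
        //
        //     cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
        //     std::vector< std::vector<cv::DMatch> > knn;
        //     matcher.knnMatch(queryDescsGPU, trainDescsGPU, knn, 2);
        //
        //     std::vector<cv::DMatch> good;
        //     for (size_t i = 0; i < knn.size(); ++i)
        //         if (knn[i].size() == 2 && knn[i][0].distance < 0.8f * knn[i][1].distance)
        //             good.push_back(knn[i][0]);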
+ // trainIdx.at(queruIdx, i) will contain ith train index (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) + // distance.at(queruIdx, i) will contain ith distance (i < min(nMatches.at(0, queruIdx), trainIdx.cols)) + // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x nTrain, + // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches + // Matches doesn't sorted. + void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, + const GpuMat& mask = GpuMat()); + + // Download trainIdx, nMatches and distance to CPU vector with DMatch. + // matches will be sorted in increasing order of distances. + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, const GpuMat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + + // Find best matches for each query descriptor which have distance less than maxDistance + // in increasing order of distances). + void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + std::vector< std::vector >& matches, float maxDistance, + const GpuMat& mask = GpuMat(), bool compactResult = false); + + // Find best matches from train collection for each query descriptor which have distance less than + // maxDistance (in increasing order of distances). + void radiusMatch(const GpuMat& queryDescs, std::vector< std::vector >& matches, float maxDistance, + const std::vector& masks = std::vector(), bool compactResult = false); + + private: + DistType distType; + + std::vector trainDescCollection; + }; + + template + class CV_EXPORTS BruteForceMatcher_GPU; + + template + class CV_EXPORTS BruteForceMatcher_GPU< L1 > : public BruteForceMatcher_GPU_base + { + public: + explicit BruteForceMatcher_GPU(L1 d = L1()) : BruteForceMatcher_GPU_base(L1Dist) {} + }; + template + class CV_EXPORTS BruteForceMatcher_GPU< L2 > : public BruteForceMatcher_GPU_base + { + public: + explicit BruteForceMatcher_GPU(L2 d = L2()) : BruteForceMatcher_GPU_base(L2Dist) {} + }; } diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp new file mode 100644 index 0000000..1dd3c0a --- /dev/null +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -0,0 +1,605 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::gpu; +using namespace std; + +#if !defined (HAVE_CUDA) + +cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::add(const vector&) { throw_nogpu(); } +const vector& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const { throw_nogpu(); return trainDescCollection; } +void cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); } +bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return true; } +bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; } +void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, GpuMat&, const GpuMat&, std::vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector&, const std::vector&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, int, const GpuMat&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector >&, int, const std::vector&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&) { throw_nogpu(); } +void 
cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, float, const GpuMat&, bool) { throw_nogpu(); } +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector >&, float, const std::vector&, bool) { throw_nogpu(); } + +#else /* !defined (HAVE_CUDA) */ + +namespace cv { namespace gpu { namespace bfmatcher +{ + template + void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + template + void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + template + void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance); + template + void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance); + + template + void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template + void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + + template + void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template + void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); +}}} + +cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_) +{ +} + +//////////////////////////////////////////////////////////////////// +// Train collection + +void cv::gpu::BruteForceMatcher_GPU_base::add(const vector& descCollection) +{ + trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end()); +} + +const vector& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const +{ + return trainDescCollection; +} + +void cv::gpu::BruteForceMatcher_GPU_base::clear() +{ + trainDescCollection.clear(); +} + +bool cv::gpu::BruteForceMatcher_GPU_base::empty() const +{ + return trainDescCollection.empty(); +} + +bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const +{ + return true; +} + +//////////////////////////////////////////////////////////////////// +// Match + +void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask) +{ + using namespace cv::gpu::bfmatcher; + + typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + + static const match_caller_t 
match_callers[2][8] = + { + { + matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, + matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, 0, 0 + }, + { + matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, + matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, 0, 0 + } + }; + + CV_Assert(queryDescs.channels() == 1); + CV_Assert(trainDescs.cols == queryDescs.cols && trainDescs.type() == queryDescs.type()); + + const int nQuery = queryDescs.rows; + + trainIdx.create(1, nQuery, CV_32S); + distance.create(1, nQuery, CV_32F); + + match_caller_t func = match_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + // For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx. + // trainIdx store after imgIdx, so we doesn't lose it value. + func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance); +} + +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, + vector& matches) +{ + const int nQuery = trainIdx.cols; + + Mat trainIdxCPU = trainIdx; + Mat distanceCPU = distance; + + matches.clear(); + matches.reserve(nQuery); + + const int* trainIdx_ptr = trainIdxCPU.ptr(); + const float* distance_ptr = distanceCPU.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + if (trainIdx == -1) + continue; + + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, 0, distance); + + matches.push_back(m); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, const GpuMat& trainDescs, + vector& matches, const GpuMat& mask) +{ + GpuMat trainIdx, distance; + matchSingle(queryDescs, trainDescs, trainIdx, distance, mask); + matchDownload(trainIdx, distance, matches); +} + +void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, + const vector& masks) +{ + if (masks.empty()) + { + Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D))); + + for (size_t i = 0; i < trainDescCollection.size(); ++i) + { + const GpuMat& trainDescs = trainDescCollection[i]; + + trainCollectionCPU.ptr(0)[i] = trainDescs; + } + + trainCollection.upload(trainCollectionCPU); + } + else + { + CV_Assert(masks.size() == trainDescCollection.size()); + + Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D))); + Mat maskCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(PtrStep))); + + for (size_t i = 0; i < trainDescCollection.size(); ++i) + { + const GpuMat& trainDescs = trainDescCollection[i]; + const GpuMat& mask = masks[i]; + + CV_Assert(mask.empty() || (mask.type() == CV_8UC1)); + + trainCollectionCPU.ptr(0)[i] = trainDescs; + + maskCollectionCPU.ptr(0)[i] = static_cast(mask); + } + + trainCollection.upload(trainCollectionCPU); + maskCollection.upload(maskCollectionCPU); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection) +{ + using namespace cv::gpu::bfmatcher; + + typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance); + + static const match_caller_t match_callers[2][8] = + { + { + matchCollectionL1_gpu, matchCollectionL1_gpu, + matchCollectionL1_gpu, matchCollectionL1_gpu, + matchCollectionL1_gpu, 
matchCollectionL1_gpu, 0, 0 + }, + { + matchCollectionL2_gpu, matchCollectionL2_gpu, + matchCollectionL2_gpu, matchCollectionL2_gpu, + matchCollectionL2_gpu, matchCollectionL2_gpu, 0, 0 + } + }; + + CV_Assert(queryDescs.channels() == 1); + + const int nQuery = queryDescs.rows; + + trainIdx.create(1, nQuery, CV_32S); + imgIdx.create(1, nQuery, CV_32S); + distance.create(1, nQuery, CV_32F); + + match_caller_t func = match_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance); +} + +void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, + const GpuMat& distance, vector& matches) +{ + const int nQuery = trainIdx.cols; + + Mat trainIdxCPU = trainIdx; + Mat imgIdxCPU = imgIdx; + Mat distanceCPU = distance; + + matches.clear(); + matches.reserve(nQuery); + + const int* trainIdx_ptr = trainIdxCPU.ptr(); + const int* imgIdx_ptr = imgIdxCPU.ptr(); + const float* distance_ptr = distanceCPU.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + if (trainIdx == -1) + continue; + + int imgIdx = *imgIdx_ptr; + + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, imgIdx, distance); + + matches.push_back(m); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector& matches, + const vector& masks) +{ + GpuMat trainCollection; + GpuMat maskCollection; + + makeGpuCollection(trainCollection, maskCollection, masks); + + GpuMat trainIdx, imgIdx, distance; + + matchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, maskCollection); + matchDownload(trainIdx, imgIdx, distance, matches); +} + +//////////////////////////////////////////////////////////////////// +// KnnMatch + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask) +{ + using namespace cv::gpu::bfmatcher; + + typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + + static const match_caller_t match_callers[2][8] = + { + { + knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, + knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, 0, 0 + }, + { + knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, + knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, 0, 0 + } + }; + + CV_Assert(queryDescs.channels() == 1); + + const int nQuery = queryDescs.rows; + const int nTrain = trainDescs.rows; + + trainIdx.create(nQuery, k, CV_32S); + trainIdx.setTo(Scalar::all(-1)); + distance.create(nQuery, k, CV_32F); + + allDist.create(nQuery, nTrain, CV_32F); + + match_caller_t func = match_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist); +} + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, + vector< vector >& matches, bool compactResult) +{ + const int nQuery = distance.rows; + const int k = trainIdx.cols; + + Mat trainIdxCPU = trainIdx; + Mat distanceCPU = distance; + + matches.clear(); + matches.reserve(nQuery); + + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + matches.push_back(vector()); + vector& curMatches = matches.back(); + curMatches.reserve(k); + + int* trainIdx_ptr = 
trainIdxCPU.ptr(queryIdx); + float* distance_ptr = distanceCPU.ptr(queryIdx); + for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + + if (trainIdx != -1) + { + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, 0, distance); + + curMatches.push_back(m); + } + } + + if (compactResult && curMatches.empty()) + matches.pop_back(); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + vector< vector >& matches, int k, const GpuMat& mask, bool compactResult) +{ + GpuMat trainIdx, distance, allDist; + knnMatch(queryDescs, trainDescs, trainIdx, distance, allDist, k, mask); + knnMatchDownload(trainIdx, distance, matches, compactResult); +} + +namespace +{ + class ImgIdxSetter + { + public: + ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {} + void operator()(DMatch& m) const {m.imgIdx = imgIdx;} + private: + int imgIdx; + }; +} + +void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, + vector< vector >& matches, int knn, const vector& masks, bool compactResult) +{ + vector< vector > curMatches; + vector temp; + temp.reserve(2 * knn); + + matches.resize(queryDescs.rows); + for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector::reserve), knn)); + + for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx) + { + knnMatch(queryDescs, trainDescCollection[imgIdx], curMatches, knn, + masks.empty() ? GpuMat() : masks[imgIdx]); + + for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx) + { + vector& localMatch = curMatches[queryIdx]; + vector& globalMatch = matches[queryIdx]; + + for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx)); + + temp.clear(); + merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp)); + + globalMatch.clear(); + const size_t count = std::min((size_t)knn, temp.size()); + copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch)); + } + } + + if (compactResult) + { + vector< vector >::iterator new_end = remove_if(matches.begin(), matches.end(), + mem_fun_ref(&vector::empty)); + matches.erase(new_end, matches.end()); + } +} + +//////////////////////////////////////////////////////////////////// +// RadiusMatch + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask) +{ + using namespace cv::gpu::bfmatcher; + + typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + + static const radiusMatch_caller_t radiusMatch_callers[2][8] = + { + { + radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, + radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, 0, 0 + }, + { + radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, + radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, 0, 0 + } + }; + + const int nQuery = queryDescs.rows; + const int nTrain = trainDescs.rows; + + CV_Assert(queryDescs.channels() == 1); + CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols); + CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery); + + nMatches.create(1, nQuery, CV_32SC1); + nMatches.setTo(Scalar::all(0)); + if (trainIdx.empty()) + { + trainIdx.create(nQuery, nTrain, CV_32SC1); + distance.create(nQuery, nTrain, 
CV_32FC1); + } + + radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()]; + CV_Assert(func != 0); + + func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr(), distance); +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, + const GpuMat& distance, std::vector< std::vector >& matches, bool compactResult) +{ + const int nQuery = trainIdx.rows; + + Mat trainIdxCPU = trainIdx; + Mat nMatchesCPU = nMatches; + Mat distanceCPU = distance; + + matches.clear(); + matches.reserve(nQuery); + + const unsigned int* nMatches_ptr = nMatchesCPU.ptr(); + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + const int* trainIdx_ptr = trainIdxCPU.ptr(queryIdx); + const float* distance_ptr = distanceCPU.ptr(queryIdx); + + const int nMatches = std::min(static_cast(nMatches_ptr[queryIdx]), trainIdx.cols); + + if (nMatches == 0) + { + if (!compactResult) + matches.push_back(vector()); + continue; + } + + matches.push_back(vector()); + vector& curMatches = matches.back(); + curMatches.reserve(nMatches); + + for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr) + { + int trainIdx = *trainIdx_ptr; + + float distance = *distance_ptr; + + DMatch m(queryIdx, trainIdx, 0, distance); + + curMatches.push_back(m); + } + sort(curMatches.begin(), curMatches.end()); + } +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs, + vector< vector >& matches, float maxDistance, const GpuMat& mask, bool compactResult) +{ + GpuMat trainIdx, nMatches, distance; + radiusMatch(queryDescs, trainDescs, trainIdx, nMatches, distance, maxDistance, mask); + radiusMatchDownload(trainIdx, nMatches, distance, matches, compactResult); +} + +void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector< vector >& matches, + float maxDistance, const vector& masks, bool compactResult) + +{ + matches.resize(queryDescs.rows); + + vector< vector > curMatches; + + for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx) + { + radiusMatch(queryDescs, trainDescCollection[imgIdx], curMatches, maxDistance, + masks.empty() ? GpuMat() : masks[imgIdx]); + + for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx) + { + vector& localMatch = curMatches[queryIdx]; + vector& globalMatch = matches[queryIdx]; + + for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx)); + + const size_t oldSize = globalMatch.size(); + + copy(localMatch.begin(), localMatch.end(), back_inserter(globalMatch)); + inplace_merge(globalMatch.begin(), globalMatch.begin() + oldSize, globalMatch.end()); + } + } + + if (compactResult) + { + vector< vector >::iterator new_end = remove_if(matches.begin(), matches.end(), + mem_fun_ref(&vector::empty)); + matches.erase(new_end, matches.end()); + } +} + +#endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu new file mode 100644 index 0000000..c2e2c1c --- /dev/null +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -0,0 +1,1205 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "cuda_shared.hpp" +#include "limits_gpu.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace cv { namespace gpu { namespace bfmatcher +{ +/////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////// General funcs ////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// + + template struct StaticAssert; + template <> struct StaticAssert {static __host__ __device__ void check(){}}; + + /////////////////////////////////////////////////////////////////////////////// + // Mask strategy + + class SingleMask + { + public: + explicit SingleMask(const PtrStep& mask_) : mask(mask_) {} + + __device__ bool operator()(int queryIdx, int trainIdx) const + { + return mask.ptr(queryIdx)[trainIdx] != 0; + } + private: + PtrStep mask; + }; + + class MaskCollection + { + public: + explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {} + + __device__ void nextMask() + { + curMask = *maskCollection++; + } + + __device__ bool operator()(int queryIdx, int trainIdx) const + { + return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0; + } + private: + PtrStep* maskCollection; + PtrStep curMask; + }; + + class WithOutMask + { + public: + __device__ void nextMask() + { + } + __device__ bool operator()(int queryIdx, int trainIdx) const + { + return true; + } + }; + + /////////////////////////////////////////////////////////////////////////////// + // Reduce Sum + + template + __device__ void reduceSum(float* sdiff, float mySum, int tid) + { + sdiff[tid] = mySum; + __syncthreads(); + + if (BLOCK_DIM_X == 512) + { + if (tid < 256) + { + sdiff[tid] = mySum += sdiff[tid + 256]; 
__syncthreads(); + sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads(); + sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads(); + } + volatile float* smem = sdiff; + smem[tid] = mySum += smem[tid + 32]; + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + if (BLOCK_DIM_X == 256) + { + if (tid < 128) + { + sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads(); + sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads(); + } + volatile float* smem = sdiff; + smem[tid] = mySum += smem[tid + 32]; + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + if (BLOCK_DIM_X == 128) + { + if (tid < 64) + { + sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads(); + } + volatile float* smem = sdiff; + smem[tid] = mySum += smem[tid + 32]; + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + + volatile float* smem = sdiff; + if (BLOCK_DIM_X == 64) + { + if (tid < 32) + { + smem[tid] = mySum += smem[tid + 32]; + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 32) + { + if (tid < 16) + { + smem[tid] = mySum += smem[tid + 16]; + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 16) + { + if (tid < 8) + { + smem[tid] = mySum += smem[tid + 8]; + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 8) + { + if (tid < 4) + { + smem[tid] = mySum += smem[tid + 4]; + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 4) + { + if (tid < 2) + { + smem[tid] = mySum += smem[tid + 2]; + smem[tid] = mySum += smem[tid + 1]; + } + } + if (BLOCK_DIM_X == 2) + { + if (tid < 1) + { + smem[tid] = mySum += smem[tid + 1]; + } + } + } + + /////////////////////////////////////////////////////////////////////////////// + // loadDescsVals + + template + __device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals) + { + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < desc_len) + { + smem[tid] = (float)descs[tid]; + } + __syncthreads(); + + #pragma unroll + for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X) + { + *queryVals = smem[i]; + ++queryVals; + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Distance + + template + class L1Dist + { + public: + __device__ L1Dist() : mySum(0) {} + + __device__ void reduceIter(float val1, float val2) + { + mySum += fabs(val1 - val2); + } + + __device__ void reduceAll(float* sdiff, int tid) + { + reduceSum(sdiff, mySum, tid); + } + + static __device__ float finalResult(float res) + { + return res; + } + private: + float mySum; + }; + + template + class L2Dist + { + public: + __device__ L2Dist() : mySum(0) {} + + __device__ void reduceIter(float val1, float val2) + { + float reg = val1 - val2; + mySum += reg * reg; + } + + 
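        // Fold the per-thread partial sums (one per lane of the BLOCK_DIM_X threads that share
        // this descriptor) into sdiff[0] using the shared-memory tree reduction in reduceSum().
        // After the call, thread 0 reads the accumulated squared-L2 sum from sdiff[0];
        // finalResult() applies the square root later, in findBestMatch(), only to the winning
        // minimum.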
__device__ void reduceAll(float* sdiff, int tid) + { + reduceSum(sdiff, mySum, tid); + } + + static __device__ float finalResult(float res) + { + return sqrtf(res); + } + private: + float mySum; + }; + + /////////////////////////////////////////////////////////////////////////////// + // reduceDescDiff + + template + __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff) + { + const int tid = threadIdx.x; + + Dist dist; + + for (int i = tid; i < desc_len; i += BLOCK_DIM_X) + dist.reduceIter(queryDescs[i], trainDescs[i]); + + dist.reduceAll(sdiff, tid); + } + + /////////////////////////////////////////////////////////////////////////////// + // reduceDescDiff_smem + + template struct UnrollDescDiff + { + template + static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs, + int ind, int desc_len) + { + if (ind < desc_len) + dist.reduceIter(*queryVals, trainDescs[ind]); + + ++queryVals; + + UnrollDescDiff::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len); + } + + template + static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs) + { + dist.reduceIter(*queryVals, *trainDescs); + + ++queryVals; + trainDescs += blockDim.x; + + UnrollDescDiff::calcWithoutCheck(dist, queryVals, trainDescs); + } + }; + template <> struct UnrollDescDiff<0> + { + template + static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs, + int ind, int desc_len) + { + } + + template + static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs) + { + } + }; + + template struct DescDiffCalculator; + template + struct DescDiffCalculator + { + template + static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len) + { + UnrollDescDiff::calcCheck(dist, queryVals, trainDescs, + threadIdx.x, desc_len); + } + }; + template + struct DescDiffCalculator + { + template + static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len) + { + UnrollDescDiff::calcWithoutCheck(dist, queryVals, + trainDescs + threadIdx.x); + } + }; + + template + __device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff) + { + const int tid = threadIdx.x; + + Dist dist; + + DescDiffCalculator::calc(dist, queryVals, + trainDescs, desc_len); + + dist.reduceAll(sdiff, tid); + } + +/////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////// Match ////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////////////////////////////////////////////////////////// + // warpReduceMin + + template + __device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx) + { + float minSum = sdata[tid]; + + if (BLOCK_DIM_Y >= 64) + { + float reg = sdata[tid + 32]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 32]; + simgIdx[tid] = simgIdx[tid + 32]; + } + } + if (BLOCK_DIM_Y >= 32) + { + float reg = sdata[tid + 16]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 16]; + simgIdx[tid] = simgIdx[tid + 16]; + } + } + if (BLOCK_DIM_Y >= 16) + { + float reg = sdata[tid + 8]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 8]; + simgIdx[tid] 
= simgIdx[tid + 8]; + } + } + if (BLOCK_DIM_Y >= 8) + { + float reg = sdata[tid + 4]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 4]; + simgIdx[tid] = simgIdx[tid + 4]; + } + } + if (BLOCK_DIM_Y >= 4) + { + float reg = sdata[tid + 2]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 2]; + simgIdx[tid] = simgIdx[tid + 2]; + } + } + if (BLOCK_DIM_Y >= 2) + { + float reg = sdata[tid + 1]; + if (reg < minSum) + { + sdata[tid] = minSum = reg; + strainIdx[tid] = strainIdx[tid + 1]; + simgIdx[tid] = simgIdx[tid + 1]; + } + } + } + + /////////////////////////////////////////////////////////////////////////////// + // findBestMatch + + template + __device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx, + float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance) + { + if (threadIdx.x == 0) + { + smin[threadIdx.y] = myMin; + strainIdx[threadIdx.y] = myBestTrainIdx; + simgIdx[threadIdx.y] = myBestImgIdx; + } + __syncthreads(); + + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + if (tid < 32) + warpReduceMin(tid, smin, strainIdx, simgIdx); + + if (threadIdx.x == 0 && threadIdx.y == 0) + { + float minSum = smin[0]; + int bestTrainIdx = strainIdx[0]; + int bestImgIdx = simgIdx[0]; + + imgIdx[queryIdx] = bestImgIdx; + trainIdx[queryIdx] = bestTrainIdx; + distance[queryIdx] = Dist::finalResult(minSum); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // ReduceDescCalculator + + template + class ReduceDescCalculatorSimple + { + public: + __device__ void prepare(const T* queryDescs_, int, float*) + { + queryDescs = queryDescs_; + } + + __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const + { + reduceDescDiff(queryDescs, trainDescs, desc_len, sdiff_row); + } + + private: + const T* queryDescs; + }; + + template + class ReduceDescCalculatorSmem + { + public: + __device__ void prepare(const T* queryDescs, int desc_len, float* smem) + { + loadDescsVals(queryDescs, desc_len, smem, queryVals); + } + + __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const + { + reduceDescDiff_smem(queryVals, trainDescs, + desc_len, sdiff_row); + } + + private: + float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X]; + }; + + /////////////////////////////////////////////////////////////////////////////// + // matchDescs loop + + template + __device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_& trainDescs_, + const Mask& m, const ReduceDescCalculator& reduceDescCalc, + float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) + { + const T* trainDescs = trainDescs_.ptr(threadIdx.y); + const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T); + for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; + trainIdx += blockDim.y, trainDescs += trainDescsStep) + { + if (m(queryIdx, trainIdx)) + { + reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row); + + if (threadIdx.x == 0) + { + float reg = sdiff_row[0]; + if (reg < myMin) + { + myMin = reg; + myBestTrainIdx = trainIdx; + myBestImgIdx = imgIdx; + } + } + } + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Train collection loop strategy + + template + class SingleTrain + { + public: + explicit SingleTrain(const DevMem2D_& trainDescs_) : trainDescs(trainDescs_) + { + } + + template + __device__ void loop(int queryIdx, Mask& m, 
const ReduceDescCalculator& reduceDescCalc, + float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const + { + matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, + sdiff_row, myMin, myBestTrainIdx, myBestImgIdx); + } + + __device__ int desc_len() const + { + return trainDescs.cols; + } + private: + DevMem2D_ trainDescs; + }; + + template + class TrainCollection + { + public: + TrainCollection(const DevMem2D_* trainCollection_, int nImg_, int desclen_) : + trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_) + { + } + + template + __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, + float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const + { + for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) + { + DevMem2D_ trainDescs = trainCollection[imgIdx]; + m.nextMask(); + matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, + sdiff_row, myMin, myBestTrainIdx, myBestImgIdx); + } + } + + __device__ int desc_len() const + { + return desclen; + } + private: + const DevMem2D_* trainCollection; + int nImg; + int desclen; + }; + + /////////////////////////////////////////////////////////////////////////////// + // Match kernel + + template + __global__ void match(PtrStep_ queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance) + { + __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ float smin[64]; + __shared__ int strainIdx[64]; + __shared__ int simgIdx[64]; + + const int queryIdx = blockIdx.x; + + int myBestTrainIdx = -1; + int myBestImgIdx = -1; + float myMin = numeric_limits_gpu::max(); + + { + float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; + Mask m = mask; + ReduceDescCalculator reduceDescCalc; + reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff); + + train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx); + } + + findBestMatch(queryIdx, myMin, myBestTrainIdx, myBestImgIdx, + smin, strainIdx, simgIdx, trainIdx, imgIdx, distance); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match kernel callers + + template class Dist, typename T, + typename Train, typename Mask> + void match_caller(const DevMem2D_& queryDescs, const Train& train, + const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + { + StaticAssert::check(); // blockDimY vals must reduce by warp + + dim3 grid(queryDescs.rows, 1, 1); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + match, T>, + Dist, T><<>>(queryDescs, train, mask, trainIdx.data, + imgIdx.data, distance.data); + + cudaSafeCall( cudaThreadSynchronize() ); + } + template class Dist, typename T, typename Train, typename Mask> + void match_smem_caller(const DevMem2D_& queryDescs, const Train& train, + const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + { + StaticAssert::check(); // blockDimY vals must reduce by warp + StaticAssert= MAX_DESCRIPTORS_LEN>::check(); // block size must be greter than descriptors length + StaticAssert::check(); // max descriptors length must divide to blockDimX + + dim3 grid(queryDescs.rows, 1, 1); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + match, T>, + Dist, T><<>>(queryDescs, train, mask, trainIdx.data, + imgIdx.data, distance.data); + + cudaSafeCall( cudaThreadSynchronize() ); + } + + /////////////////////////////////////////////////////////////////////////////// + // Match kernel chooser + + template