From f11efdced384a894b3988596956135748b0756bb Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 16 May 2011 08:38:27 +0000 Subject: [PATCH] added support of Hamming distance to BruteForceMatcher_GPU --- modules/gpu/include/opencv2/gpu/gpu.hpp | 14 +- modules/gpu/src/brute_force_matcher.cpp | 46 ++- modules/gpu/src/cuda/brute_force_matcher.cu | 441 ++++++++++++++++++---------- 3 files changed, 335 insertions(+), 166 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 874666c..7e1c226 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1382,7 +1382,7 @@ namespace cv class CV_EXPORTS BruteForceMatcher_GPU_base { public: - enum DistType {L1Dist = 0, L2Dist}; + enum DistType {L1Dist = 0, L2Dist, HammingDist}; explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist); @@ -1522,6 +1522,18 @@ namespace cv explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {} explicit BruteForceMatcher_GPU(L2 /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {} }; + template <> class CV_EXPORTS BruteForceMatcher_GPU< HammingLUT > : public BruteForceMatcher_GPU_base + { + public: + explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {} + explicit BruteForceMatcher_GPU(HammingLUT /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {} + }; + template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BruteForceMatcher_GPU_base + { + public: + explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {} + explicit BruteForceMatcher_GPU(Hamming /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {} + }; ////////////////////////////////// CascadeClassifier_GPU ////////////////////////////////////////// // The cascade classifier class for object detection. diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index 648522d..9166bcb 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -83,14 +83,20 @@ namespace cv { namespace gpu { namespace bfmatcher const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template + void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12); + template + void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template @@ -99,6 +105,9 @@ namespace cv { namespace gpu { namespace bfmatcher template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template + void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, @@ -106,6 +115,9 @@ namespace cv { namespace gpu { namespace bfmatcher template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template + void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); }}} namespace @@ -167,7 +179,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - static const match_caller_t match_callers[2][8] = + static const match_caller_t match_callers[3][8] = { { matchSingleL1_gpu, matchSingleL1_gpu, @@ -178,6 +190,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, 0, 0 + }, + { + matchSingleHamming_gpu, matchSingleHamming_gpu, + matchSingleHamming_gpu, matchSingleHamming_gpu, + matchSingleHamming_gpu, 0, 0, 0 } }; @@ -295,7 +312,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); - static const match_caller_t match_callers[2][8] = + static const match_caller_t match_callers[3][8] = { { matchCollectionL1_gpu, matchCollectionL1_gpu, @@ -306,6 +323,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes matchCollectionL2_gpu, matchCollectionL2_gpu, matchCollectionL2_gpu, matchCollectionL2_gpu, matchCollectionL2_gpu, matchCollectionL2_gpu, 0, 0 + }, + { + matchCollectionHamming_gpu, matchCollectionHamming_gpu, + matchCollectionHamming_gpu, matchCollectionHamming_gpu, + matchCollectionHamming_gpu, 0, 0, 0 } }; @@ -391,7 +413,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - static const match_caller_t match_callers[2][8] = + static const match_caller_t match_callers[3][8] = { { knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, @@ -400,6 +422,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con { knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, 0, 0 + }, + { + knnMatchHamming_gpu, knnMatchHamming_gpu, knnMatchHamming_gpu, + knnMatchHamming_gpu, knnMatchHamming_gpu, 0, 0, 0 } }; @@ -531,7 +557,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - static const radiusMatch_caller_t radiusMatch_callers[2][8] = + static const radiusMatch_caller_t radiusMatch_callers[3][8] = { { radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, @@ -540,6 +566,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, { radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, 0, 0 + }, + { + radiusMatchHamming_gpu, radiusMatchHamming_gpu, radiusMatchHamming_gpu, + radiusMatchHamming_gpu, radiusMatchHamming_gpu, 0, 0, 0 } }; diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu index 0e9752e..f0a8995 100644 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -103,30 +103,61 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // Reduce Sum - - template __device__ void reduceSum(float* sdiff_row, float& mySum); - template <> __device__ void reduceSum<16>(float* sdiff_row, float& mySum) + template struct SumReductor; + template <> struct SumReductor<16> { - volatile float* smem = sdiff_row; - - smem[threadIdx.x] = mySum; - - if (threadIdx.x < 8) + template static __device__ void reduce(T* sdiff_row, T& mySum) { - smem[threadIdx.x] = mySum += smem[threadIdx.x + 8]; - smem[threadIdx.x] = mySum += smem[threadIdx.x + 4]; - smem[threadIdx.x] = mySum += smem[threadIdx.x + 2]; - smem[threadIdx.x] = mySum += smem[threadIdx.x + 1]; + volatile T* smem = sdiff_row; + + smem[threadIdx.x] = mySum; + + if (threadIdx.x < 8) + { + smem[threadIdx.x] = mySum += smem[threadIdx.x + 8]; + smem[threadIdx.x] = mySum += smem[threadIdx.x + 4]; + smem[threadIdx.x] = mySum += smem[threadIdx.x + 2]; + smem[threadIdx.x] = mySum += smem[threadIdx.x + 1]; + } } - } + }; /////////////////////////////////////////////////////////////////////////////// // Distance - class L1Dist + template class L1Dist { public: + typedef int ResultType; + typedef int ValueType; + + __device__ L1Dist() : mySum(0) {} + + __device__ void reduceIter(int val1, int val2) + { + mySum = __sad(val1, val2, mySum); + } + + template __device__ void reduceAll(int* sdiff_row) + { + SumReductor::reduce(sdiff_row, mySum); + } + + __device__ operator int() const + { + return mySum; + } + + private: + int mySum; + }; + template <> class L1Dist + { + public: + typedef float ResultType; + typedef float ValueType; + __device__ L1Dist() : mySum(0.0f) {} __device__ void reduceIter(float val1, float val2) @@ -134,10 +165,9 @@ namespace cv { namespace gpu { namespace bfmatcher mySum += fabs(val1 - val2); } - template - __device__ void reduceAll(float* sdiff_row) + template __device__ void reduceAll(float* sdiff_row) { - reduceSum(sdiff_row, mySum); + SumReductor::reduce(sdiff_row, mySum); } __device__ operator float() const @@ -152,6 +182,9 @@ namespace cv { namespace gpu { namespace bfmatcher class L2Dist { public: + typedef float ResultType; + typedef float ValueType; + __device__ L2Dist() : mySum(0.0f) {} __device__ void reduceIter(float val1, float val2) @@ -160,10 +193,9 @@ namespace cv { namespace gpu { namespace bfmatcher mySum += reg * reg; } - template - __device__ void reduceAll(float* sdiff_row) + template __device__ void reduceAll(float* sdiff_row) { - reduceSum(sdiff_row, mySum); + SumReductor::reduce(sdiff_row, mySum); } __device__ operator float() const @@ -174,13 +206,39 @@ namespace cv { namespace gpu { namespace bfmatcher private: float mySum; }; + + class HammingDist + { + public: + typedef int ResultType; + typedef int ValueType; + + __device__ HammingDist() : mySum(0) {} + + __device__ void reduceIter(int val1, int val2) + { + mySum += __popc(val1 ^ val2); + } + + template __device__ void reduceAll(int* sdiff_row) + { + SumReductor::reduce(sdiff_row, mySum); + } + + __device__ operator int() const + { + return mySum; + } + + private: + int mySum; + }; /////////////////////////////////////////////////////////////////////////////// // reduceDescDiff template - __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, - float* sdiff_row) + __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) { for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X) dist.reduceIter(queryDescs[i], trainDescs[i]); @@ -195,14 +253,14 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // loadDescsVals - template - __device__ void loadDescsVals(const T* descs, int desc_len, float* queryVals, float* smem) + template + __device__ void loadDescsVals(const T* descs, int desc_len, U* queryVals, U* smem) { const int tid = threadIdx.y * blockDim.x + threadIdx.x; if (tid < desc_len) { - smem[tid] = (float)descs[tid]; + smem[tid] = descs[tid]; } __syncthreads(); @@ -220,8 +278,7 @@ namespace cv { namespace gpu { namespace bfmatcher template struct UnrollDescDiff { template - static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len, - Dist& dist, int ind) + static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, int ind) { if (ind < desc_len) { @@ -234,7 +291,7 @@ namespace cv { namespace gpu { namespace bfmatcher } template - static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist) + static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) { dist.reduceIter(*queryVals, *trainDescs); @@ -247,13 +304,13 @@ namespace cv { namespace gpu { namespace bfmatcher template <> struct UnrollDescDiff<0> { template - static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len, + static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, int ind) { } template - static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist) + static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) { } }; @@ -263,29 +320,25 @@ namespace cv { namespace gpu { namespace bfmatcher struct DescDiffCalculator { template - static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist) + static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) { - UnrollDescDiff::calcCheck(queryVals, trainDescs, desc_len, - dist, threadIdx.x); + UnrollDescDiff::calcCheck(queryVals, trainDescs, desc_len, dist, threadIdx.x); } }; template struct DescDiffCalculator { template - static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist) + static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) { - UnrollDescDiff::calcWithoutCheck(queryVals, - trainDescs + threadIdx.x, dist); + UnrollDescDiff::calcWithoutCheck(queryVals, trainDescs + threadIdx.x, dist); } }; template - __device__ void reduceDescDiffCached(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist, - float* sdiff_row) + __device__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) { - DescDiffCalculator::calc(queryVals, - trainDescs, desc_len, dist); + DescDiffCalculator::calc(queryVals, trainDescs, desc_len, dist); dist.reduceAll(sdiff_row); } @@ -293,62 +346,60 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // warpReduceMinIdxIdx - template - __device__ void warpReduceMinIdxIdx(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, - volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx); - - template <> - __device__ void warpReduceMinIdxIdx<16>(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, - volatile float* smin, volatile int* strainIdx, volatile int* simgIdx) + template struct MinIdxIdxWarpReductor; + template <> struct MinIdxIdxWarpReductor<16> { - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - if (tid < 8) + template + static __device__ void reduce(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, volatile T* smin, volatile int* strainIdx, volatile int* simgIdx) { - myMin = smin[tid]; - myBestTrainIdx = strainIdx[tid]; - myBestImgIdx = simgIdx[tid]; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; - float reg = smin[tid + 8]; - if (reg < myMin) + if (tid < 8) { - smin[tid] = myMin = reg; - strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8]; - simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8]; - } + myMin = smin[tid]; + myBestTrainIdx = strainIdx[tid]; + myBestImgIdx = simgIdx[tid]; - reg = smin[tid + 4]; - if (reg < myMin) - { - smin[tid] = myMin = reg; - strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4]; - simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4]; - } - - reg = smin[tid + 2]; - if (reg < myMin) - { - smin[tid] = myMin = reg; - strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2]; - simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2]; - } - - reg = smin[tid + 1]; - if (reg < myMin) - { - smin[tid] = myMin = reg; - strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1]; - simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1]; + float reg = smin[tid + 8]; + if (reg < myMin) + { + smin[tid] = myMin = reg; + strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8]; + simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8]; + } + + reg = smin[tid + 4]; + if (reg < myMin) + { + smin[tid] = myMin = reg; + strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4]; + simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4]; + } + + reg = smin[tid + 2]; + if (reg < myMin) + { + smin[tid] = myMin = reg; + strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2]; + simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2]; + } + + reg = smin[tid + 1]; + if (reg < myMin) + { + smin[tid] = myMin = reg; + strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1]; + simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1]; + } } } - } + }; /////////////////////////////////////////////////////////////////////////////// // findBestMatch - template - __device__ void findBestMatch(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, - float* smin, int* strainIdx, int* simgIdx) + template + __device__ void findBestMatch(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, T* smin, int* strainIdx, int* simgIdx) { if (threadIdx.x == 0) { @@ -358,7 +409,7 @@ namespace cv { namespace gpu { namespace bfmatcher } __syncthreads(); - warpReduceMinIdxIdx(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx); + MinIdxIdxWarpReductor::reduce(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx); } /////////////////////////////////////////////////////////////////////////////// @@ -368,13 +419,13 @@ namespace cv { namespace gpu { namespace bfmatcher class ReduceDescCalculatorSimple { public: - __device__ void prepare(const T* queryDescs_, int, float*) + __device__ void prepare(const T* queryDescs_, int, void*) { queryDescs = queryDescs_; } template - __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const + __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const { reduceDescDiff(queryDescs, trainDescs, desc_len, dist, sdiff_row); } @@ -383,24 +434,23 @@ namespace cv { namespace gpu { namespace bfmatcher const T* queryDescs; }; - template + template class ReduceDescCalculatorCached { public: - __device__ void prepare(const T* queryDescs, int desc_len, float* smem) + __device__ void prepare(const T* queryDescs, int desc_len, U* smem) { loadDescsVals(queryDescs, desc_len, queryVals, smem); } template - __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const + __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const { - reduceDescDiffCached(queryVals, trainDescs, - desc_len, dist, sdiff_row); + reduceDescDiffCached(queryVals, trainDescs, desc_len, dist, sdiff_row); } private: - float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X]; + U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X]; }; /////////////////////////////////////////////////////////////////////////////// @@ -409,7 +459,7 @@ namespace cv { namespace gpu { namespace bfmatcher template __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_& trainDescs_, const Mask& m, const ReduceDescCalculator& reduceDescCalc, - float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) + typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) { for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y) { @@ -447,10 +497,9 @@ namespace cv { namespace gpu { namespace bfmatcher template __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, - float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const + typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const { - matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, - myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); + matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); } __device__ int desc_len() const @@ -473,14 +522,13 @@ namespace cv { namespace gpu { namespace bfmatcher template __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, - float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const + typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const { for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) { DevMem2D_ trainDescs = trainCollection[imgIdx]; m.nextMask(); - matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, - myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); + matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); } } @@ -498,38 +546,35 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // Match kernel - template - __global__ void match(const PtrStep_ queryDescs_, const Train train, const Mask mask, - int* trainIdx, int* imgIdx, float* distance) + template + __global__ void match(const PtrStep_ queryDescs_, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance) { - __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y]; const int queryIdx = blockIdx.x; int myBestTrainIdx = -1; int myBestImgIdx = -1; - float myMin = numeric_limits_gpu::max(); + typename Dist::ResultType myMin = numeric_limits_gpu::max(); { - float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; + typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; Mask m = mask; ReduceDescCalculator reduceDescCalc; - reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), smem); + reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), (typename Dist::ValueType*)smem); train.template loop(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); } __syncthreads(); - float* smin = smem; + typename Dist::ResultType* smin = smem; int* strainIdx = (int*)(smin + BLOCK_DIM_Y); int* simgIdx = strainIdx + BLOCK_DIM_Y; - findBestMatch(myMin, myBestTrainIdx, myBestImgIdx, - smin, strainIdx, simgIdx); + findBestMatch(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx); if (threadIdx.x == 0 && threadIdx.y == 0) { @@ -542,8 +587,7 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // Match kernel callers - template + template void matchSimple_caller(const DevMem2D_& queryDescs, const Train& train, const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) { @@ -553,14 +597,12 @@ namespace cv { namespace gpu { namespace bfmatcher dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); match, Dist, T> - <<>>(queryDescs, train, mask, trainIdx.data, - imgIdx.data, distance.data); + <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); } - template + template void matchCached_caller(const DevMem2D_& queryDescs, const Train& train, const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) { @@ -571,11 +613,8 @@ namespace cv { namespace gpu { namespace bfmatcher dim3 grid(queryDescs.rows, 1, 1); dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - match, - Dist, T> - <<>>(queryDescs, train, mask, trainIdx.data, - imgIdx.data, distance.data); + match, Dist, T> + <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); @@ -616,11 +655,11 @@ namespace cv { namespace gpu { namespace bfmatcher if (mask.data) { SingleMask m(mask); - matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); + matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); } else { - matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } @@ -656,6 +695,29 @@ namespace cv { namespace gpu { namespace bfmatcher template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template + void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12) + { + SingleTrain train((DevMem2D_)trainDescs); + if (mask.data) + { + SingleMask m(mask); + matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); + } + else + { + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + } + } + + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12) @@ -664,11 +726,11 @@ namespace cv { namespace gpu { namespace bfmatcher if (maskCollection.data) { MaskCollection mask(maskCollection.data); - matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); + matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); } else { - matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } @@ -702,6 +764,29 @@ namespace cv { namespace gpu { namespace bfmatcher template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + + template + void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance, bool cc_12) + { + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); + if (maskCollection.data) + { + MaskCollection mask(maskCollection.data); + matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); + } + else + { + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + } + } + + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); /////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// Knn Match //////////////////////////////////// @@ -713,9 +798,9 @@ namespace cv { namespace gpu { namespace bfmatcher template __global__ void calcDistance(PtrStep_ queryDescs_, DevMem2D_ trainDescs_, Mask mask, PtrStepf distance) { - __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; - float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; + typename Dist::ResultType* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; const int queryIdx = blockIdx.x; const T* queryDescs = queryDescs_.ptr(queryIdx); @@ -726,7 +811,7 @@ namespace cv { namespace gpu { namespace bfmatcher { const T* trainDescs = trainDescs_.ptr(trainIdx); - float myDist = numeric_limits_gpu::max(); + typename Dist::ResultType myDist = numeric_limits_gpu::max(); if (mask(queryIdx, trainIdx)) { @@ -763,14 +848,14 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // warpReduceMinIdx - template - __device__ void warpReduceMinIdx(volatile float* sdist, volatile int* strainIdx, float& myMin, int tid) + template + __device__ void warpReduceMinIdx(volatile T* sdist, volatile int* strainIdx, T& myMin, int tid) { if (tid < 32) { if (BLOCK_SIZE >= 64) { - float reg = sdist[tid + 32]; + T reg = sdist[tid + 32]; if (reg < myMin) { @@ -780,7 +865,7 @@ namespace cv { namespace gpu { namespace bfmatcher } if (BLOCK_SIZE >= 32) { - float reg = sdist[tid + 16]; + T reg = sdist[tid + 16]; if (reg < myMin) { @@ -790,7 +875,7 @@ namespace cv { namespace gpu { namespace bfmatcher } if (BLOCK_SIZE >= 16) { - float reg = sdist[tid + 8]; + T reg = sdist[tid + 8]; if (reg < myMin) { @@ -800,7 +885,7 @@ namespace cv { namespace gpu { namespace bfmatcher } if (BLOCK_SIZE >= 8) { - float reg = sdist[tid + 4]; + T reg = sdist[tid + 4]; if (reg < myMin) { @@ -810,7 +895,7 @@ namespace cv { namespace gpu { namespace bfmatcher } if (BLOCK_SIZE >= 4) { - float reg = sdist[tid + 2]; + T reg = sdist[tid + 2]; if (reg < myMin) { @@ -820,7 +905,7 @@ namespace cv { namespace gpu { namespace bfmatcher } if (BLOCK_SIZE >= 2) { - float reg = sdist[tid + 1]; + T reg = sdist[tid + 1]; if (reg < myMin) { @@ -831,17 +916,17 @@ namespace cv { namespace gpu { namespace bfmatcher } } - template - __device__ void reduceMinIdx(const float* dist, int n, float* sdist, int* strainIdx) + template + __device__ void reduceMinIdx(const T* dist, int n, T* sdist, int* strainIdx) { const int tid = threadIdx.x; - float myMin = numeric_limits_gpu::max(); + T myMin = numeric_limits_gpu::max(); int myMinIdx = -1; for (int i = tid; i < n; i += BLOCK_SIZE) { - float reg = dist[i]; + T reg = dist[i]; if (reg < myMin) { myMin = reg; @@ -855,7 +940,7 @@ namespace cv { namespace gpu { namespace bfmatcher if (BLOCK_SIZE >= 512 && tid < 256) { - float reg = sdist[tid + 256]; + T reg = sdist[tid + 256]; if (reg < myMin) { @@ -866,7 +951,7 @@ namespace cv { namespace gpu { namespace bfmatcher } if (BLOCK_SIZE >= 256 && tid < 128) { - float reg = sdist[tid + 128]; + T reg = sdist[tid + 128]; if (reg < myMin) { @@ -877,7 +962,7 @@ namespace cv { namespace gpu { namespace bfmatcher } if (BLOCK_SIZE >= 128 && tid < 64) { - float reg = sdist[tid + 64]; + T reg = sdist[tid + 64]; if (reg < myMin) { @@ -943,14 +1028,12 @@ namespace cv { namespace gpu { namespace bfmatcher // knn match caller template - void calcDistanceDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, - const Mask& mask, const DevMem2Df& allDist) + void calcDistanceDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, const Mask& mask, const DevMem2Df& allDist) { calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist); } - void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, - const DevMem2Df& allDist) + void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist) { findKnnMatch_caller<256>(knn, trainIdx, distance, allDist); } @@ -961,13 +1044,11 @@ namespace cv { namespace gpu { namespace bfmatcher { if (mask.data) { - calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - SingleMask(mask), allDist); + calcDistanceDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, SingleMask(mask), allDist); } else { - calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - WithOutMask(), allDist); + calcDistanceDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, WithOutMask(), allDist); } findKnnMatchDispatcher(knn, trainIdx, distance, allDist); @@ -1005,6 +1086,30 @@ namespace cv { namespace gpu { namespace bfmatcher template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template + void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist) + { + if (mask.data) + { + calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + SingleMask(mask), allDist); + } + else + { + calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + WithOutMask(), allDist); + } + + findKnnMatchDispatcher(knn, trainIdx, distance, allDist); + } + + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + /////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////// Radius Match ////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// @@ -1018,9 +1123,9 @@ namespace cv { namespace gpu { namespace bfmatcher { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; + typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; const int queryIdx = blockIdx.x; const T* queryDescs = queryDescs_.ptr(queryIdx); @@ -1091,12 +1196,12 @@ namespace cv { namespace gpu { namespace bfmatcher { if (mask.data) { - radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + radiusMatchDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, maxDistance, SingleMask(mask), trainIdx, nMatches, distance); } else { - radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + radiusMatchDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, maxDistance, WithOutMask(), trainIdx, nMatches, distance); } } @@ -1130,4 +1235,26 @@ namespace cv { namespace gpu { namespace bfmatcher template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + + template + void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, + const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance) + { + if (mask.data) + { + radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + maxDistance, SingleMask(mask), trainIdx, nMatches, distance); + } + else + { + radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + maxDistance, WithOutMask(), trainIdx, nMatches, distance); + } + } + + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); }}} -- 2.7.4