From: Vladislav Vinogradov
Date: Mon, 31 Jan 2011 13:20:52 +0000 (+0000)
Subject: fixed gpu tests (BruteForceMatcher_GPU, divide, phase, cartToPolar, async)
X-Git-Tag: accepted/2.0/20130307.220821~3585
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8274ed22e42c7b7c0ff6c05708208a3b6cb75259;p=profile%2Fivi%2Fopencv.git

fixed gpu tests (BruteForceMatcher_GPU, divide, phase, cartToPolar, async)

minor code refactoring
---

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index b719417..2531d1e 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -671,10 +671,12 @@ namespace cv
         //! output will have CV_32FC1 type
         CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect);
 
-        //! applies Canny edge detector and produces the edge map
-        //! supprots only CV_8UC1 source type
-        //! disabled until fix crash
-        CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+        // applies Canny edge detector and produces the edge map
+        // disabled until fix crash
+        //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+        //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);
+        //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
+        //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);
 
         //! computes Harris cornerness criteria at each image pixel
         CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 3447121..4806e67 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -104,6 +104,18 @@ namespace cv { namespace gpu { namespace bfmatcher
         const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
 }}}
 
+namespace
+{
+    class ImgIdxSetter
+    {
+    public:
+        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
+        void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
+    private:
+        int imgIdx;
+    };
+}
+
 cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
 {
 }
@@ -185,7 +197,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
         return;
 
     CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.size().area() == trainIdx.size().area());
+    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.cols == trainIdx.cols);
 
     const int nQuery = trainIdx.cols;
@@ -309,8 +321,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
         return;
 
     CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous());
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous());
+    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous() && imgIdx.cols == trainIdx.cols);
+    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && imgIdx.cols == trainIdx.cols);
 
    const int nQuery = trainIdx.cols;
@@ -390,7 +402,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
         trainIdx.setTo(Scalar::all(-1));
 
     distance.create(nQuery, k, CV_32F);
-    allDist.create(nQuery, nTrain, CV_32F);
+    ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
 
     match_caller_t func = match_callers[distType][queryDescs.depth()];
     CV_Assert(func != 0);
@@ -451,18 +463,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     knnMatchDownload(trainIdx, distance, matches, compactResult);
 }
 
-namespace
-{
-    class ImgIdxSetter
-    {
-    public:
-        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
-        void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
-    private:
-        int imgIdx;
-    };
-}
-
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, vector< vector<DMatch> >& matches, int knn,
     const vector<GpuMat>& masks, bool compactResult)
 {
@@ -538,9 +538,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
     CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
     CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);
-    CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery);
+    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));
 
-    nMatches.create(1, nQuery, CV_32SC1);
+    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
     nMatches.setTo(Scalar::all(0));
 
     if (trainIdx.empty())
     {
@@ -561,7 +561,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
         return;
 
     CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.size().area() == trainIdx.rows);
+    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
 
     const int nQuery = trainIdx.rows;
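The recurring change in brute_force_matcher.cpp is swapping GpuMat::create for ensureSizeIsEnough on scratch buffers (allDist, nMatches), so a buffer is reallocated only when a request outgrows it. A minimal host-side sketch of that pattern, with a hypothetical Buffer type standing in for cv::gpu::GpuMat:

    #include <cstddef>
    #include <vector>

    // Reallocate only on growth; otherwise keep and reuse the old storage.
    // Buffer is a hypothetical stand-in for cv::gpu::GpuMat.
    struct Buffer
    {
        std::vector<float> storage;

        void ensureSizeIsEnough(std::size_t n)
        {
            if (storage.size() < n)
                storage.resize(n); // grows; later requests of size <= n are no-ops
        }
    };

    int main()
    {
        Buffer allDist;
        allDist.ensureSizeIsEnough(1000); // first call allocates
        allDist.ensureSizeIsEnough(500);  // reuses the existing allocation
        return 0;
    }

This is also why the radiusMatchDownload assert above checks nMatches.cols >= trainIdx.rows rather than exact equality: a reused buffer may be larger than the current request.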
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
index b28aee1..44f823d 100644
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -64,6 +64,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             return mask.ptr(queryIdx)[trainIdx] != 0;
         }
+
     private:
         PtrStep mask;
     };
@@ -82,6 +83,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0;
         }
+
     private:
         PtrStep* maskCollection;
         PtrStep curMask;
@@ -102,172 +104,55 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // Reduce Sum
 
-    template <int BLOCK_DIM_X>
-    __device__ void reduceSum(float* sdiff, float mySum, int tid)
-    {
-        sdiff[tid] = mySum;
-        __syncthreads();
+    template <int BLOCK_DIM_X> __device__ void reduceSum(float* sdiff_row, float& mySum);
 
-        if (BLOCK_DIM_X == 512)
-        {
-            if (tid < 256)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 256]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
-        if (BLOCK_DIM_X == 256)
-        {
-            if (tid < 128)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
-        if (BLOCK_DIM_X == 128)
-        {
-            if (tid < 64)
-            {
-                sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
-            }
-            volatile float* smem = sdiff;
-            smem[tid] = mySum += smem[tid + 32];
-            smem[tid] = mySum += smem[tid + 16];
-            smem[tid] = mySum += smem[tid + 8];
-            smem[tid] = mySum += smem[tid + 4];
-            smem[tid] = mySum += smem[tid + 2];
-            smem[tid] = mySum += smem[tid + 1];
-        }
-
-        volatile float* smem = sdiff;
-        if (BLOCK_DIM_X == 64)
-        {
-            if (tid < 32)
-            {
-                smem[tid] = mySum += smem[tid + 32];
-                smem[tid] = mySum += smem[tid + 16];
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 32)
-        {
-            if (tid < 16)
-            {
-                smem[tid] = mySum += smem[tid + 16];
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 16)
-        {
-            if (tid < 8)
-            {
-                smem[tid] = mySum += smem[tid + 8];
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 8)
-        {
-            if (tid < 4)
-            {
-                smem[tid] = mySum += smem[tid + 4];
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 4)
-        {
-            if (tid < 2)
-            {
-                smem[tid] = mySum += smem[tid + 2];
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-        if (BLOCK_DIM_X == 2)
-        {
-            if (tid < 1)
-            {
-                smem[tid] = mySum += smem[tid + 1];
-            }
-        }
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // loadDescsVals
-
-    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T>
-    __device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals)
+    template <> __device__ void reduceSum<16>(float* sdiff_row, float& mySum)
     {
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        volatile float* smem = sdiff_row;
 
-        if (tid < desc_len)
+        smem[threadIdx.x] = mySum;
+
+        if (threadIdx.x < 8)
         {
-            smem[tid] = (float)descs[tid];
-        }
-        __syncthreads();
-
-        #pragma unroll
-        for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)
-        {
-            *queryVals = smem[i];
-            ++queryVals;
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 8];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 4];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 1];
        }
    }
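The new reduceSum<16> replaces the generic block-size switch with a warp-synchronous walk: with a 16-wide thread row, all active threads sit inside one warp, so the halving steps can drop __syncthreads() and rely on volatile shared-memory stores (valid on the pre-Volta GPUs this code targeted). For contrast, on CUDA 9+ the same 16-lane sum can be written with warp shuffles, no shared memory or volatile needed; a sketch, not part of this commit, assuming all 32 lanes of the warp are converged:

    #include <cuda_runtime.h>

    // 16-lane segmented sum via warp shuffles (CUDA 9+). Each group of 16
    // consecutive lanes reduces independently; lane 0 of the group ends up
    // holding the group's total.
    __device__ float rowSum16Shuffle(float mySum)
    {
        for (int offset = 8; offset > 0; offset /= 2)
            mySum += __shfl_down_sync(0xffffffffu, mySum, offset, 16); // width = 16
        return mySum;
    }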
 
     ///////////////////////////////////////////////////////////////////////////////
     // Distance
 
-    template <int BLOCK_DIM_X>
     class L1Dist
     {
     public:
-        __device__ L1Dist() : mySum(0) {}
+        __device__ L1Dist() : mySum(0.0f) {}
 
         __device__ void reduceIter(float val1, float val2)
         {
             mySum += fabs(val1 - val2);
         }
 
-        __device__ void reduceAll(float* sdiff, int tid)
+        template <int BLOCK_DIM_X>
+        __device__ void reduceAll(float* sdiff_row)
         {
-            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);
+            reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);
         }
 
-        static __device__ float finalResult(float res)
+        __device__ operator float() const
         {
-            return res;
+            return mySum;
         }
+
     private:
         float mySum;
     };
 
-    template <int BLOCK_DIM_X>
     class L2Dist
     {
     public:
-        __device__ L2Dist() : mySum(0) {}
+        __device__ L2Dist() : mySum(0.0f) {}
 
         __device__ void reduceIter(float val1, float val2)
         {
@@ -275,15 +160,17 @@ namespace cv { namespace gpu { namespace bfmatcher
             mySum += reg * reg;
         }
 
-        __device__ void reduceAll(float* sdiff, int tid)
+        template <int BLOCK_DIM_X>
+        __device__ void reduceAll(float* sdiff_row)
         {
-            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);
+            reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);
         }
 
-        static __device__ float finalResult(float res)
+        __device__ operator float() const
         {
-            return sqrtf(res);
+            return sqrtf(mySum);
         }
+
    private:
        float mySum;
    };
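After this change a distance is an accumulator object rather than a class with a static post-processing hook: reduceIter folds in one pair of descriptor components, reduceAll merges the per-thread partials, and operator float() applies the final transform (sqrtf for L2) exactly once, which is what lets matchDescs later compare dist < myMin directly. A host-side analogue of the accumulator protocol:

    #include <cmath>
    #include <cstdio>

    // Host analogue of L2Dist: accumulate squared differences, apply sqrt
    // only when the value is read through the conversion operator.
    class L2DistHost
    {
    public:
        L2DistHost() : mySum(0.0f) {}

        void reduceIter(float val1, float val2)
        {
            float reg = val1 - val2;
            mySum += reg * reg;
        }

        operator float() const { return std::sqrt(mySum); }

    private:
        float mySum;
    };

    int main()
    {
        const float a[3] = {0.0f, 0.0f, 0.0f};
        const float b[3] = {1.0f, 2.0f, 2.0f};

        L2DistHost dist;
        for (int i = 0; i < 3; ++i)
            dist.reduceIter(a[i], b[i]);

        std::printf("%.1f\n", (float)dist); // prints 3.0
        return 0;
    }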
 
     ///////////////////////////////////////////////////////////////////////////////
     // reduceDescDiff
 
     template <int BLOCK_DIM_X, typename Dist, typename T>
-    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff)
+    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist,
+        float* sdiff_row)
     {
-        const int tid = threadIdx.x;
+        for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X)
+            dist.reduceIter(queryDescs[i], trainDescs[i]);
 
-        Dist dist;
+        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
+    }
 
-        for (int i = tid; i < desc_len; i += BLOCK_DIM_X)
-            dist.reduceIter(queryDescs[i], trainDescs[i]);
+///////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////// Match //////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // loadDescsVals
+
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T>
+    __device__ void loadDescsVals(const T* descs, int desc_len, float* queryVals, float* smem)
+    {
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        if (tid < desc_len)
+        {
+            smem[tid] = (float)descs[tid];
+        }
+        __syncthreads();
 
-        dist.reduceAll(sdiff, tid);
+        #pragma unroll
+        for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)
+        {
+            *queryVals = smem[i];
+            ++queryVals;
+        }
     }
 
     ///////////////////////////////////////////////////////////////////////////////
-    // reduceDescDiff_smem
+    // reduceDescDiffCached
 
     template <int N> struct UnrollDescDiff
     {
         template <typename Dist, typename T>
-        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,
-            int ind, int desc_len)
+        static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len,
+            Dist& dist, int ind)
         {
             if (ind < desc_len)
+            {
                 dist.reduceIter(*queryVals, trainDescs[ind]);
 
-            ++queryVals;
+                ++queryVals;
 
-            UnrollDescDiff<N - 1>::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len);
+                UnrollDescDiff<N - 1>::calcCheck(queryVals, trainDescs, desc_len, dist, ind + blockDim.x);
+            }
         }
 
         template <typename Dist, typename T>
-        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)
+        static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)
         {
             dist.reduceIter(*queryVals, *trainDescs);
 
             ++queryVals;
             trainDescs += blockDim.x;
 
-            UnrollDescDiff<N - 1>::calcWithoutCheck(dist, queryVals, trainDescs);
+            UnrollDescDiff<N - 1>::calcWithoutCheck(queryVals, trainDescs, dist);
         }
     };
     template <> struct UnrollDescDiff<0>
     {
         template <typename Dist, typename T>
-        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,
-            int ind, int desc_len)
+        static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len,
+            Dist& dist, int ind)
         {
         }
 
         template <typename Dist, typename T>
-        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)
+        static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)
         {
         }
     };
@@ -351,106 +263,82 @@ namespace cv { namespace gpu { namespace bfmatcher
     template <int N, bool WITH_OUT_CHECK> struct DescDiffCalculator;
 
     template <int N> struct DescDiffCalculator<N, false>
     {
         template <typename Dist, typename T>
-        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)
+        static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)
         {
-            UnrollDescDiff<N>::calcCheck(dist, queryVals, trainDescs,
-                threadIdx.x, desc_len);
+            UnrollDescDiff<N>::calcCheck(queryVals, trainDescs, desc_len,
+                dist, threadIdx.x);
         }
     };
     template <int N> struct DescDiffCalculator<N, true>
     {
         template <typename Dist, typename T>
-        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)
+        static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)
        {
-            UnrollDescDiff<N>::calcWithoutCheck(dist, queryVals,
-                trainDescs + threadIdx.x);
+            UnrollDescDiff<N>::calcWithoutCheck(queryVals,
+                trainDescs + threadIdx.x, dist);
         }
     };
 
     template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool WITH_OUT_CHECK, typename Dist, typename T>
-    __device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff)
-    {
-        const int tid = threadIdx.x;
+    __device__ void reduceDescDiffCached(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist,
+        float* sdiff_row)
+    {
+        DescDiffCalculator<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X, WITH_OUT_CHECK>::calc(queryVals,
+            trainDescs, desc_len, dist);
 
-        Dist dist;
-
-        DescDiffCalculator<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X, WITH_OUT_CHECK>::calc(dist, queryVals,
-            trainDescs, desc_len);
-
-        dist.reduceAll(sdiff, tid);
+        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
     }
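UnrollDescDiff peels the cached-descriptor loop at compile time: every instantiation consumes one strided element and recurses with N - 1, and the UnrollDescDiff<0> specialization provides the empty base case, so the compiler emits straight-line code with no loop counter. The same trick in miniature, summing a fixed-length array:

    #include <cstdio>

    // Recursive-template unrolling in the style of UnrollDescDiff: the primary
    // template handles one element and recurses, the <0> case stops.
    template <int N> struct UnrollSum
    {
        static float run(const float* data)
        {
            return data[0] + UnrollSum<N - 1>::run(data + 1);
        }
    };

    template <> struct UnrollSum<0> // terminates the recursion
    {
        static float run(const float*) { return 0.0f; }
    };

    int main()
    {
        const float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        std::printf("%.1f\n", UnrollSum<4>::run(v)); // prints 10.0
        return 0;
    }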
 
     ///////////////////////////////////////////////////////////////////////////////
-    // warpReduceMin
+    // warpReduceMinIdxIdx
 
     template <int BLOCK_DIM_Y>
-    __device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx)
+    __device__ void warpReduceMinIdxIdx(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx);
+
+    template <>
+    __device__ void warpReduceMinIdxIdx<16>(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        volatile float* smin, volatile int* strainIdx, volatile int* simgIdx)
     {
-        float minSum = sdata[tid];
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
-        if (BLOCK_DIM_Y >= 64)
-        {
-            float reg = sdata[tid + 32];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 32];
-                simgIdx[tid] = simgIdx[tid + 32];
-            }
-        }
-        if (BLOCK_DIM_Y >= 32)
+        if (tid < 8)
         {
-            float reg = sdata[tid + 16];
-            if (reg < minSum)
-            {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 16];
-                simgIdx[tid] = simgIdx[tid + 16];
-            }
-        }
-        if (BLOCK_DIM_Y >= 16)
-        {
-            float reg = sdata[tid + 8];
-            if (reg < minSum)
+            myMin = smin[tid];
+            myBestTrainIdx = strainIdx[tid];
+            myBestImgIdx = simgIdx[tid];
+
+            float reg = smin[tid + 8];
+            if (reg < myMin)
             {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 8];
-                simgIdx[tid] = simgIdx[tid + 8];
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8];
             }
-        }
-        if (BLOCK_DIM_Y >= 8)
-        {
-            float reg = sdata[tid + 4];
-            if (reg < minSum)
+
+            reg = smin[tid + 4];
+            if (reg < myMin)
             {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 4];
-                simgIdx[tid] = simgIdx[tid + 4];
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4];
             }
-        }
-        if (BLOCK_DIM_Y >= 4)
-        {
-            float reg = sdata[tid + 2];
-            if (reg < minSum)
+
+            reg = smin[tid + 2];
+            if (reg < myMin)
             {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 2];
-                simgIdx[tid] = simgIdx[tid + 2];
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2];
             }
-        }
-        if (BLOCK_DIM_Y >= 2)
-        {
-            float reg = sdata[tid + 1];
-            if (reg < minSum)
+
+            reg = smin[tid + 1];
+            if (reg < myMin)
             {
-                sdata[tid] = minSum = reg;
-                strainIdx[tid] = strainIdx[tid + 1];
-                simgIdx[tid] = simgIdx[tid + 1];
+                smin[tid] = myMin = reg;
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1];
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1];
             }
         }
     }
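warpReduceMinIdxIdx is the same halving scheme as reduceSum<16>, except that three values travel together: whenever a slot holds a smaller distance, its train and image indices are copied along with it, so the surviving minimum still knows where it came from. A serial model of that payload-carrying argmin (plain C++, with the loop standing in for the thread lanes):

    #include <cstdio>

    // One halving step of an argmin reduction that carries payload indices,
    // mirroring warpReduceMinIdxIdx: if the partner slot holds a smaller
    // distance, take its value AND its indices.
    static void minStep(float* smin, int* strainIdx, int* simgIdx, int tid, int offset)
    {
        if (smin[tid + offset] < smin[tid])
        {
            smin[tid] = smin[tid + offset];
            strainIdx[tid] = strainIdx[tid + offset];
            simgIdx[tid] = simgIdx[tid + offset];
        }
    }

    int main()
    {
        float smin[4]   = {5.f, 1.f, 7.f, 3.f};
        int   strain[4] = {10, 11, 12, 13};
        int   simg[4]   = {0, 1, 2, 3};

        for (int offset = 2; offset > 0; offset /= 2)  // 4 -> 2 -> 1 slots
            for (int tid = 0; tid < offset; ++tid)     // serial stand-in for lanes
                minStep(smin, strain, simg, tid, offset);

        std::printf("min=%.1f trainIdx=%d imgIdx=%d\n", smin[0], strain[0], simg[0]);
        // prints min=1.0 trainIdx=11 imgIdx=1
        return 0;
    }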
 
@@ -458,9 +346,9 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // findBestMatch
 
-    template <int BLOCK_DIM_Y, typename Dist>
-    __device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx,
-        float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance)
+    template <int BLOCK_DIM_Y>
+    __device__ void findBestMatch(float& myMin, int& myBestTrainIdx, int& myBestImgIdx,
+        float* smin, int* strainIdx, int* simgIdx)
     {
         if (threadIdx.x == 0)
         {
@@ -470,27 +358,13 @@ namespace cv { namespace gpu { namespace bfmatcher
         }
         __syncthreads();
 
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        if (tid < 32)
-            warpReduceMin<BLOCK_DIM_Y>(tid, smin, strainIdx, simgIdx);
-
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-        {
-            float minSum = smin[0];
-            int bestTrainIdx = strainIdx[0];
-            int bestImgIdx = simgIdx[0];
-
-            imgIdx[queryIdx] = bestImgIdx;
-            trainIdx[queryIdx] = bestTrainIdx;
-            distance[queryIdx] = Dist::finalResult(minSum);
-        }
+        warpReduceMinIdxIdx<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);
     }
 
     ///////////////////////////////////////////////////////////////////////////////
     // ReduceDescCalculator
 
-    template <int BLOCK_DIM_X, typename Dist, typename T>
+    template <int BLOCK_DIM_X, typename T>
     class ReduceDescCalculatorSimple
     {
     public:
         __device__ void prepare(const T* queryDescs_, int, float*)
         {
             queryDescs = queryDescs_;
         }
 
-        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const
+        template <typename Dist>
+        __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const
         {
-            reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, desc_len, sdiff_row);
+            reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);
         }
 
     private:
         const T* queryDescs;
     };
 
-    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>
-    class ReduceDescCalculatorSmem
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T>
+    class ReduceDescCalculatorCached
     {
     public:
         __device__ void prepare(const T* queryDescs, int desc_len, float* smem)
         {
-            loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, smem, queryVals);
+            loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);
         }
 
-        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const
+        template <typename Dist>
+        __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const
         {
-            reduceDescDiff_smem<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist>(queryVals, trainDescs,
-                desc_len, sdiff_row);
+            reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs,
+                desc_len, dist, sdiff_row);
         }
 
     private:
         float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];
     };
 
     ///////////////////////////////////////////////////////////////////////////////
     // matchDescs loop
 
-    template <typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
-    __device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_<T>& trainDescs_,
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
+    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& trainDescs_,
         const Mask& m, const ReduceDescCalculator& reduceDescCalc,
-        float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx)
+        float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row)
     {
-        const T* trainDescs = trainDescs_.ptr(threadIdx.y);
-        const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T);
-        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows;
-            trainIdx += blockDim.y, trainDescs += trainDescsStep)
+        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y)
         {
             if (m(queryIdx, trainIdx))
             {
-                reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row);
+                const T* trainDescs = trainDescs_.ptr(trainIdx);
+
+                Dist dist;
+
+                reduceDescCalc.calc(trainDescs, trainDescs_.cols, dist, sdiff_row);
 
                 if (threadIdx.x == 0)
                 {
-                    float reg = sdiff_row[0];
-                    if (reg < myMin)
+                    if (dist < myMin)
                     {
-                        myMin = reg;
+                        myMin = dist;
                         myBestTrainIdx = trainIdx;
                         myBestImgIdx = imgIdx;
                     }
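In the reworked matchDescs loop, thread row threadIdx.y walks train descriptors with stride blockDim.y and fetches each row pointer via trainDescs_.ptr(trainIdx) instead of stepping a raw pointer manually; the rows therefore partition the train set without overlap. A serial model of that assignment:

    #include <cstdio>

    // Serial model of the matchDescs stride: thread row y of a block with
    // blockDimY rows visits train descriptors y, y + blockDimY, y + 2*blockDimY, ...
    int main()
    {
        const int blockDimY = 4;   // rows of threads per block (assumed)
        const int nTrain    = 10;  // number of train descriptors

        for (int y = 0; y < blockDimY; ++y)            // each "thread row"
        {
            std::printf("row %d:", y);
            for (int trainIdx = y; trainIdx < nTrain; trainIdx += blockDimY)
                std::printf(" %d", trainIdx);          // descriptors this row handles
            std::printf("\n");
        }
        return 0;
    }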
@@ -570,18 +445,19 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
         }
 
-        template <typename Dist, typename ReduceDescCalculator, typename Mask>
+        template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename ReduceDescCalculator, typename Mask>
         __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
-            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const
+            float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const
         {
-            matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc,
-                sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
+            matchDescs<BLOCK_DIM_X, BLOCK_DIM_Y, Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc,
+                myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
         }
 
         __device__ int desc_len() const
         {
             return trainDescs.cols;
         }
+
     private:
         DevMem2D_<T> trainDescs;
     };
@@ -595,16 +471,16 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
         }
 
-        template <typename Dist, typename ReduceDescCalculator, typename Mask>
+        template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename ReduceDescCalculator, typename Mask>
         __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
-            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const
+            float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const
         {
             for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
             {
                 DevMem2D_<T> trainDescs = trainCollection[imgIdx];
                 m.nextMask();
-                matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc,
-                    sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
+                matchDescs<BLOCK_DIM_X, BLOCK_DIM_Y, Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc,
+                    myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
             }
         }
 
@@ -612,6 +488,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             return desclen;
         }
+
     private:
         const DevMem2D_<T>* trainCollection;
         int nImg;
@@ -623,12 +500,10 @@ namespace cv { namespace gpu { namespace bfmatcher
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T,
        typename Train, typename Mask>
-    __global__ void match(PtrStep_<T> queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance)
+    __global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask,
+        int* trainIdx, int* imgIdx, float* distance)
     {
-        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
-        __shared__ float smin[64];
-        __shared__ int strainIdx[64];
-        __shared__ int simgIdx[64];
+        __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y];
 
         const int queryIdx = blockIdx.x;
 
@@ -637,24 +512,39 @@ namespace cv { namespace gpu { namespace bfmatcher
         float myMin = numeric_limits_gpu<float>::max();
 
         {
-            float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
+            float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
+
             Mask m = mask;
+
             ReduceDescCalculator reduceDescCalc;
-            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff);
+
+            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), smem);
 
-            train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
+            train.template loop<BLOCK_DIM_X, BLOCK_DIM_Y, Dist>(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
         }
+        __syncthreads();
+
+        float* smin = smem;
+        int* strainIdx = (int*)(smin + BLOCK_DIM_Y);
+        int* simgIdx = strainIdx + BLOCK_DIM_Y;
 
-        findBestMatch<BLOCK_DIM_Y, Dist>(queryIdx, myMin, myBestTrainIdx, myBestImgIdx,
-            smin, strainIdx, simgIdx, trainIdx, imgIdx, distance);
+        findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx,
+            smin, strainIdx, simgIdx);
+
+        if (threadIdx.x == 0 && threadIdx.y == 0)
+        {
+            imgIdx[queryIdx] = myBestImgIdx;
+            trainIdx[queryIdx] = myBestTrainIdx;
+            distance[queryIdx] = myMin;
+        }
     }
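The kernel's work distribution, made explicit: the callers below launch one block per query descriptor (grid(queryDescs.rows, 1, 1)); within a block, the x-dimension cooperates on a single distance while the y-dimension splits the train set, and a final block-wide reduction picks the best candidate. A compilable skeleton with the same launch shape (illustrative names, not the library kernel):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void shapeDemo()
    {
        // One block per query descriptor; a 16x16 thread tile inside.
        if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0)
            printf("block 0 handles query 0 with %d x %d threads\n",
                   blockDim.x, blockDim.y);
    }

    int main()
    {
        const int nQuery = 128;      // number of query descriptors (assumed)
        dim3 grid(nQuery, 1, 1);     // grid(queryDescs.rows, 1, 1) in the callers
        dim3 threads(16, 16, 1);     // BLOCK_DIM_X = BLOCK_DIM_Y = 16
        shapeDemo<<<grid, threads>>>();
        cudaDeviceSynchronize();
        return 0;
    }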
 
     ///////////////////////////////////////////////////////////////////////////////
     // Match kernel callers
 
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T,
-        typename Train, typename Mask>
-    void match_caller(const DevMem2D_<T>& queryDescs, const Train& train,
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
+    void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train,
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
     {
         StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
 
         dim3 grid(queryDescs.rows, 1, 1);
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, Dist<BLOCK_DIM_X>, T>,
-            Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
+            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
             imgIdx.data, distance.data);
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
 
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN,
-        template <int> class Dist, typename T, typename Train, typename Mask>
-    void match_smem_caller(const DevMem2D_<T>& queryDescs, const Train& train,
+        typename Dist, typename T, typename Train, typename Mask>
+    void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train,
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
     {
         StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
@@ -680,9 +570,10 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 grid(queryDescs.rows, 1, 1);
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
 
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSmem<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist<BLOCK_DIM_X>, T>,
-            Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T>,
+            Dist, T>
+            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
             imgIdx.data, distance.data);
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
 
     ///////////////////////////////////////////////////////////////////////////////
     // Match kernel chooser
 
-    template
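For orientation, this is roughly how the matcher these kernels implement was driven from the host in the OpenCV 2.x gpu module (exact headers and template arguments may differ between 2.x releases):

    #include <vector>
    #include <opencv2/core/core.hpp>
    #include <opencv2/features2d/features2d.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // 100 query and 500 train descriptors of length 64, CV_32F.
        cv::Mat query(100, 64, CV_32F), train(500, 64, CV_32F);
        cv::randu(query, 0, 1);
        cv::randu(train, 0, 1);

        // Upload to the device.
        cv::gpu::GpuMat queryGpu(query), trainGpu(train);

        // L2 brute-force matcher from the 2.x gpu module.
        cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

        std::vector<cv::DMatch> matches;
        matcher.match(queryGpu, trainGpu, matches); // one best match per query
        return 0;
    }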