From 5f9e47a9cd5a94e19499b4662b057ab6916afda5 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 29 Aug 2011 11:31:52 +0000 Subject: [PATCH] refactored gpu::BruteForceMatcher (moved some utility functions to device layer) --- modules/gpu/src/brute_force_matcher.cpp | 136 +- modules/gpu/src/cuda/brute_force_matcher.cu | 1403 +++++++------------- modules/gpu/src/cuda/surf.cu | 4 +- modules/gpu/src/opencv2/gpu/device/color.hpp | 2 +- .../gpu/src/opencv2/gpu/device/datamov_utils.hpp | 9 +- .../device/detail/{color.hpp => color_detail.hpp} | 0 .../detail/{transform.hpp => transform_detail.hpp} | 0 .../gpu/device/detail/type_traits_detail.hpp | 186 +++ .../opencv2/gpu/device/detail/utility_detail.hpp | 576 ++++++++ modules/gpu/src/opencv2/gpu/device/emulation.hpp | 2 +- modules/gpu/src/opencv2/gpu/device/functional.hpp | 290 +++- .../gpu/src/opencv2/gpu/device/saturate_cast.hpp | 74 +- modules/gpu/src/opencv2/gpu/device/transform.hpp | 22 +- modules/gpu/src/opencv2/gpu/device/type_traits.hpp | 80 ++ modules/gpu/src/opencv2/gpu/device/utility.hpp | 319 +++-- modules/gpu/src/opencv2/gpu/device/vec_math.hpp | 106 +- modules/gpu/test/test_imgproc.cpp | 17 +- samples/gpu/performance/tests.cpp | 4 +- 18 files changed, 1990 insertions(+), 1240 deletions(-) rename modules/gpu/src/opencv2/gpu/device/detail/{color.hpp => color_detail.hpp} (100%) rename modules/gpu/src/opencv2/gpu/device/detail/{transform.hpp => transform_detail.hpp} (100%) create mode 100644 modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp create mode 100644 modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp create mode 100644 modules/gpu/src/opencv2/gpu/device/type_traits.hpp diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index d6bbb7e..f4f53fc 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -78,60 +78,53 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector namespace cv { namespace gpu { namespace bfmatcher { - template - void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream); - template - void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream); - template - void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream); - template - void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream); - template - void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream); - template - void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream); - - template - void 
knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template - void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template - void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - - template - void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template - void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template - void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + + template void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); + + template void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream); + + template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, 
+ cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream); }}} namespace { - class ImgIdxSetter + struct ImgIdxSetter { - public: - ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {} - void operator()(DMatch& m) const {m.imgIdx = imgIdx;} - private: + explicit inline ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {} + inline void operator()(DMatch& m) const {m.imgIdx = imgIdx;} int imgIdx; }; } @@ -179,9 +172,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, using namespace cv::gpu::bfmatcher; - typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream); + typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); static const match_caller_t match_callers[3][8] = { @@ -213,11 +206,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, match_caller_t func = match_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); - bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12); + DeviceInfo info; + int cc = info.majorVersion() * 10 + info.minorVersion(); - // For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx. - // trainIdx store after imgIdx, so we doesn't lose it value. 
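A minimal illustrative sketch (not part of the patch) of the new host-side compute-capability handling shown in this hunk: the boolean cc_12 flag is replaced by an integer "major * 10 + minor" value that is passed down to the device layer, where the dispatchers can compare it against any threshold (for example, cc >= 12 selects the wider cached kernels). Only the DeviceInfo API already used in this hunk is assumed.

    #include <opencv2/gpu/gpu.hpp>

    // Returns the compute capability of the active CUDA device encoded as an int,
    // e.g. 1.3 -> 13, 2.1 -> 21, matching the "cc" parameter of the new callers.
    static int currentComputeCapability()
    {
        cv::gpu::DeviceInfo info;
        return info.majorVersion() * 10 + info.minorVersion();
    }
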
- func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12, StreamAccessor::getStream(stream)); + func(queryDescs, trainDescs, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector& matches) @@ -319,9 +311,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes using namespace cv::gpu::bfmatcher; - typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream); static const match_caller_t match_callers[3][8] = { @@ -353,9 +345,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes match_caller_t func = match_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); - bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12); + DeviceInfo info; + int cc = info.majorVersion() * 10 + info.minorVersion(); - func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12, StreamAccessor::getStream(stream)); + func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector& matches) @@ -427,8 +420,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con using namespace cv::gpu::bfmatcher; - typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream); static const match_caller_t match_callers[3][8] = { @@ -473,9 +467,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con match_caller_t func = match_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); - bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12); + DeviceInfo info; + int cc = info.majorVersion() * 10 + info.minorVersion(); - func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc_12, StreamAccessor::getStream(stream)); + func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, @@ -563,7 +558,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, vector& localMatch = curMatches[queryIdx]; vector& globalMatch = matches[queryIdx]; - for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx)); + for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast(imgIdx))); temp.clear(); merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), 
back_inserter(temp)); @@ -593,8 +588,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, using namespace cv::gpu::bfmatcher; - typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream); static const radiusMatch_caller_t radiusMatch_callers[3][8] = { @@ -636,7 +632,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); - func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr(), distance, StreamAccessor::getStream(stream)); + func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, @@ -728,7 +724,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector& localMatch = curMatches[queryIdx]; vector& globalMatch = matches[queryIdx]; - for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx)); + for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast(imgIdx))); const size_t oldSize = globalMatch.size(); diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu index 4cd1142..c2c7317 100644 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -42,461 +42,83 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/datamov_utils.hpp" +#include "opencv2/gpu/device/utility.hpp" using namespace cv::gpu; using namespace cv::gpu::device; namespace cv { namespace gpu { namespace bfmatcher { -/////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////// General funcs ////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////// - - /////////////////////////////////////////////////////////////////////////////// - // Mask strategy - - struct SingleMask - { - explicit SingleMask(const PtrStep& mask_) : mask(mask_) {} - - __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const - { - return mask.ptr(queryIdx)[trainIdx] != 0; - } - - const PtrStep mask; - }; - - struct MaskCollection - { - explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {} - - __device__ __forceinline__ void nextMask() - { - curMask = *maskCollection++; - } - - __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const - { - uchar val; - return curMask.data == 0 || (ForceGlob::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0)); - } - - const PtrStep* maskCollection; - PtrStep curMask; - }; - - struct WithOutMask - { - __device__ __forceinline__ void nextMask() const - { - } - __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const - { - return true; - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // Reduce Sum - - template struct SumReductor; - 
template <> struct SumReductor<16> - { - template static __device__ void reduce(volatile T* sdiff_row, T& mySum) - { - sdiff_row[threadIdx.x] = mySum; - - if (threadIdx.x < 8) - { - sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 8]; - sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 4]; - sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 2]; - sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 1]; - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // Distance - - template struct L1Dist - { - typedef int ResultType; - typedef int ValueType; - - __device__ __forceinline__ L1Dist() : mySum(0) {} - - __device__ __forceinline__ void reduceIter(int val1, int val2) - { - mySum = __sad(val1, val2, mySum); - } - - template __device__ __forceinline__ void reduceAll(int* sdiff_row) - { - SumReductor::reduce(sdiff_row, mySum); - } - - __device__ __forceinline__ operator int() const - { - return mySum; - } - - int mySum; - }; - template <> struct L1Dist - { - typedef float ResultType; - typedef float ValueType; - - __device__ __forceinline__ L1Dist() : mySum(0.0f) {} - - __device__ __forceinline__ void reduceIter(float val1, float val2) - { - mySum += fabs(val1 - val2); - } - - template __device__ __forceinline__ void reduceAll(float* sdiff_row) - { - SumReductor::reduce(sdiff_row, mySum); - } - - __device__ __forceinline__ operator float() const - { - return mySum; - } - - float mySum; - }; - - struct L2Dist - { - typedef float ResultType; - typedef float ValueType; - - __device__ __forceinline__ L2Dist() : mySum(0.0f) {} - - __device__ __forceinline__ void reduceIter(float val1, float val2) - { - float reg = val1 - val2; - mySum += reg * reg; - } - - template __device__ __forceinline__ void reduceAll(float* sdiff_row) - { - SumReductor::reduce(sdiff_row, mySum); - } - - __device__ __forceinline__ operator float() const - { - return sqrtf(mySum); - } - - float mySum; - }; - - struct HammingDist - { - typedef int ResultType; - typedef int ValueType; - - __device__ __forceinline__ HammingDist() : mySum(0) {} - - __device__ __forceinline__ void reduceIter(int val1, int val2) - { - mySum += __popc(val1 ^ val2); - } - - template __device__ __forceinline__ void reduceAll(int* sdiff_row) - { - SumReductor::reduce(sdiff_row, mySum); - } - - __device__ __forceinline__ operator int() const - { - return mySum; - } - - int mySum; - }; - - /////////////////////////////////////////////////////////////////////////////// - // reduceDescDiff - - template - __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) - { - for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X) - { - T trainVal; - ForceGlob::Load(trainDescs, i, trainVal); - dist.reduceIter(queryDescs[i], trainVal); - } - - dist.reduceAll(sdiff_row); - } /////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////// Match ////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - - /////////////////////////////////////////////////////////////////////////////// - // loadDescsVals - - template - __device__ void loadDescsVals(const T* descs, int desc_len, U* queryVals, U* smem) - { - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - if (tid < desc_len) - { - smem[tid] = descs[tid]; - } - __syncthreads(); - - #pragma unroll - for (int i = threadIdx.x; i < 
MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X) - { - *queryVals = smem[i]; - ++queryVals; - } - } - - /////////////////////////////////////////////////////////////////////////////// - // reduceDescDiffCached - - template struct UnrollDescDiff - { - template - static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, int ind) - { - if (ind < desc_len) - { - T trainVal; - ForceGlob::Load(trainDescs, ind, trainVal); - dist.reduceIter(*queryVals, trainVal); - - ++queryVals; - - UnrollDescDiff::calcCheck(queryVals, trainDescs, desc_len, dist, ind + blockDim.x); - } - } - - template - static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) - { - T trainVal; - ForceGlob::Load(trainDescs, 0, trainVal); - dist.reduceIter(*queryVals, trainVal); - - ++queryVals; - trainDescs += blockDim.x; - - UnrollDescDiff::calcWithoutCheck(queryVals, trainDescs, dist); - } - }; - template <> struct UnrollDescDiff<0> - { - template - static __device__ __forceinline__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, - Dist& dist, int ind) - { - } - - template - static __device__ __forceinline__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) - { - } - }; - - template struct DescDiffCalculator; - template - struct DescDiffCalculator - { - template - static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) - { - UnrollDescDiff::calcCheck(queryVals, trainDescs, desc_len, dist, threadIdx.x); - } - }; - template - struct DescDiffCalculator - { - template - static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) - { - UnrollDescDiff::calcWithoutCheck(queryVals, trainDescs + threadIdx.x, dist); - } - }; - - template - __device__ __forceinline__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) - { - DescDiffCalculator::calc(queryVals, trainDescs, desc_len, dist); - - dist.reduceAll(sdiff_row); - } - - /////////////////////////////////////////////////////////////////////////////// - // warpReduceMinIdxIdx - - template struct MinIdxIdxWarpReductor; - template <> struct MinIdxIdxWarpReductor<16> - { - template - static __device__ void reduce(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, volatile T* smin, volatile int* strainIdx, volatile int* simgIdx) - { - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - if (tid < 8) - { - myMin = smin[tid]; - myBestTrainIdx = strainIdx[tid]; - myBestImgIdx = simgIdx[tid]; - - float reg = smin[tid + 8]; - if (reg < myMin) - { - smin[tid] = myMin = reg; - strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8]; - simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8]; - } - - reg = smin[tid + 4]; - if (reg < myMin) - { - smin[tid] = myMin = reg; - strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4]; - simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4]; - } - - reg = smin[tid + 2]; - if (reg < myMin) - { - smin[tid] = myMin = reg; - strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2]; - simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2]; - } - - reg = smin[tid + 1]; - if (reg < myMin) - { - smin[tid] = myMin = reg; - strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1]; - simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1]; - } - } - } - 
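A minimal sketch (illustrative only, not the patch's implementation) of the generic value-plus-index reduction that replaces the hand-rolled MinIdxIdxWarpReductor removed above. The real utility introduced by this patch, reducePredVal in opencv2/gpu/device/utility.hpp, is additionally parameterized by the comparison predicate and reduces an int2 index so the image index travels with the train index; the sketch below shows only the core idea with a float distance and a single int index.

    // Shared-memory tree reduction: after the call, sval[0]/sidx[0] (and
    // myVal/myIdx in thread 0) hold the smallest value and its paired index.
    template <int N>
    __device__ void reduceMinValIdxSketch(volatile float* sval, float& myVal,
                                          volatile int* sidx, int& myIdx, int tid)
    {
        sval[tid] = myVal;
        sidx[tid] = myIdx;
        __syncthreads();

        #pragma unroll
        for (int s = N / 2; s > 0; s >>= 1)
        {
            if (tid < s && sval[tid + s] < sval[tid])
            {
                sval[tid] = myVal = sval[tid + s];
                sidx[tid] = myIdx = sidx[tid + s];
            }
            __syncthreads();
        }
    }
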
}; - - /////////////////////////////////////////////////////////////////////////////// - // findBestMatch template - __device__ void findBestMatch(T& myMin, int& myBestTrainIdx, int& myBestImgIdx, T* smin, int* strainIdx, int* simgIdx) + __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx) { if (threadIdx.x == 0) { - smin[threadIdx.y] = myMin; - strainIdx[threadIdx.y] = myBestTrainIdx; - simgIdx[threadIdx.y] = myBestImgIdx; + smin[threadIdx.y] = myDist; + sIdx[threadIdx.y] = myIdx; } __syncthreads(); - MinIdxIdxWarpReductor::reduce(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx); + reducePredVal(smin, myDist, sIdx, myIdx, threadIdx.y * blockDim.x + threadIdx.x, less()); } - - /////////////////////////////////////////////////////////////////////////////// - // ReduceDescCalculator - - template struct ReduceDescCalculatorSimple - { - __device__ __forceinline__ void prepare(const T* queryDescs_, int, void*) - { - queryDescs = queryDescs_; - } - - template - __device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const - { - reduceDescDiff(queryDescs, trainDescs, desc_len, dist, sdiff_row); - } - - const T* queryDescs; - }; - template - struct ReduceDescCalculatorCached + template + __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_& train, const Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) { - __device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem) - { - loadDescsVals(queryDescs, desc_len, queryVals, smem); - __syncthreads(); - } - - template - __device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const - { - reduceDescDiffCached(queryVals, trainDescs, desc_len, dist, sdiff_row); - } - - U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X]; - }; - - /////////////////////////////////////////////////////////////////////////////// - // matchDescs loop - - template - __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_& trainDescs_, - const Mask& m, const ReduceDescCalculator& reduceDescCalc, - typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) - { - for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y) + for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) { if (m(queryIdx, trainIdx)) { - const T* trainDescs = trainDescs_.ptr(trainIdx); + const T* trainDescs = train.ptr(trainIdx); Dist dist; - reduceDescCalc.calc(trainDescs, trainDescs_.cols, dist, sdiff_row); + vecDiff.calc(trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - if (threadIdx.x == 0) + const typename Dist::result_type res = dist; + + if (res < myDist) { - if (dist < myMin) - { - myMin = dist; - myBestTrainIdx = trainIdx; - myBestImgIdx = imgIdx; - } + myDist = res; + myIdx.x = trainIdx; + myIdx.y = imgIdx; } } } } - /////////////////////////////////////////////////////////////////////////////// - // Train collection loop strategy - template struct SingleTrain { - explicit SingleTrain(const DevMem2D_& trainDescs_) : trainDescs(trainDescs_) + explicit SingleTrain(const DevMem2D_& train_) : train(train_) { } - template - __device__ __forceinline__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, - typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename 
Dist::ResultType* sdiff_row) const + template + __device__ __forceinline__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const { - matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); + matchDescs(queryIdx, 0, train, m, vecDiff, myDist, myIdx, sdiff_row); } __device__ __forceinline__ int desc_len() const { - return trainDescs.cols; + return train.cols; } - const DevMem2D_ trainDescs; + static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, + float myDist, const int2& myIdx, int queryIdx) + { + trainIdx[queryIdx] = myIdx.x; + distance[queryIdx] = myDist; + } + + const DevMem2D_ train; }; template struct TrainCollection @@ -506,15 +128,15 @@ namespace cv { namespace gpu { namespace bfmatcher { } - template - __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, - typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const + template + __device__ void loop(int queryIdx, Mask& m, const VecDiff& vecDiff, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* sdiff_row) const { for (int imgIdx = 0; imgIdx < nImg; ++imgIdx) { - const DevMem2D_ trainDescs = trainCollection[imgIdx]; - m.nextMask(); - matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); + const DevMem2D_ train = trainCollection[imgIdx]; + m.next(); + matchDescs(queryIdx, imgIdx, train, m, vecDiff, myDist, myIdx, sdiff_row); } } @@ -523,84 +145,93 @@ namespace cv { namespace gpu { namespace bfmatcher return desclen; } + static __device__ __forceinline__ void storeResult(float* distance, int* trainIdx, int* imgIdx, + float myDist, const int2& myIdx, int queryIdx) + { + trainIdx[queryIdx] = myIdx.x; + imgIdx[queryIdx] = myIdx.y; + distance[queryIdx] = myDist; + } + const DevMem2D_* trainCollection; - int nImg; - int desclen; + const int nImg; + const int desclen; }; - /////////////////////////////////////////////////////////////////////////////// - // Match kernel - - template - __global__ void match(const PtrStep_ queryDescs_, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance) + template + __device__ void distanceCalcLoop(const PtrStep_& query, const Train& train, const Mask& mask, int queryIdx, + typename Dist::result_type& myDist, int2& myIdx, typename Dist::result_type* smem) { - __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - - const int queryIdx = blockIdx.x; - - int myBestTrainIdx = -1; - int myBestImgIdx = -1; - typename Dist::ResultType myMin = numeric_limits::max(); + const VecDiff vecDiff(query.ptr(queryIdx), train.desc_len(), (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); + + typename Dist::result_type* sdiff_row = smem + blockDim.x * threadIdx.y; - { - typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; + Mask m = mask; - Mask m = mask; + myIdx.x = -1; + myIdx.y = -1; + myDist = numeric_limits::max(); - ReduceDescCalculator reduceDescCalc; + train.template loop(queryIdx, m, vecDiff, myDist, myIdx, sdiff_row); + } - reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), (typename Dist::ValueType*)smem); + template + __global__ void match(const PtrStep_ query, const Train train, const Mask mask, int* trainIdx, int* imgIdx, float* distance) 
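    /* For reference, an illustrative sketch of the distance-functor concept the
       refactored kernels below rely on (typename Dist::value_type /
       Dist::result_type, per-element reduceIter accumulation, implicit
       conversion to the final result). The concrete L1Dist/L2Dist/HammingDist
       functors were moved to the device layer by this patch; this sketch only
       restates the core of the removed L2Dist with the new lower-case typedef
       names (the reduceAll step that folds per-thread partial sums is omitted)
       and is not part of the patch. */
    struct L2DistSketch
    {
        typedef float value_type;
        typedef float result_type;

        __device__ __forceinline__ L2DistSketch() : mySum(0.0f) {}

        // called once per pair of descriptor elements
        __device__ __forceinline__ void reduceIter(float v1, float v2)
        {
            const float d = v1 - v2;
            mySum += d * d;
        }

        // the kernels read the final distance through this conversion
        __device__ __forceinline__ operator float() const { return sqrtf(mySum); }

        float mySum;
    };
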
+ { + __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - train.template loop(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); - } + const int queryIdx = blockIdx.x; + + int2 myIdx; + typename Dist::result_type myDist; + + distanceCalcLoop(query, train, mask, queryIdx, myDist, myIdx, smem); __syncthreads(); - typename Dist::ResultType* smin = smem; - int* strainIdx = (int*)(smin + BLOCK_DIM_Y); - int* simgIdx = strainIdx + BLOCK_DIM_Y; + typename Dist::result_type* smin = smem; + int2* sIdx = (int2*)(smin + BLOCK_DIM_Y); - findBestMatch(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx); + findBestMatch(myDist, myIdx, smin, sIdx); if (threadIdx.x == 0 && threadIdx.y == 0) - { - imgIdx[queryIdx] = myBestImgIdx; - trainIdx[queryIdx] = myBestTrainIdx; - distance[queryIdx] = myMin; - } + Train::storeResult(distance, trainIdx, imgIdx, myDist, myIdx, queryIdx); } - + /////////////////////////////////////////////////////////////////////////////// - // Match kernel callers + // Match kernel caller template - void matchSimple_caller(const DevMem2D_& queryDescs, const Train& train, - const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream) + void matchSimple_caller(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) { StaticAssert::check(); // blockDimY vals must reduce by warp - dim3 grid(queryDescs.rows, 1, 1); - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - match, Dist, T> - <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); + match, Dist, T> + <<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - template - void matchCached_caller(const DevMem2D_& queryDescs, const Train& train, - const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream) + + template + void matchCached_caller(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) { - StaticAssert::check(); // blockDimY vals must reduce by warp - StaticAssert= MAX_DESCRIPTORS_LEN>::check(); // block size must be greter than descriptors length - StaticAssert::check(); // max descriptors length must divide to blockDimX + StaticAssert::check(); // blockDimY vals must reduce by warp + StaticAssert= MAX_LEN>::check(); // block size must be greter than descriptors length + StaticAssert::check(); // max descriptors length must divide to blockDimX - dim3 grid(queryDescs.rows, 1, 1); - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - match, Dist, T> - <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); + match, Dist, T> + <<>>(query, train, mask, trainIdx.data, imgIdx.data, distance.data); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -608,187 +239,193 @@ namespace cv { namespace gpu { namespace bfmatcher } /////////////////////////////////////////////////////////////////////////////// - // Match caller + // Match Dispatcher template - void matchDispatcher(const DevMem2D_& queryDescs, const Train& train, - const Mask& 
mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream) + void matchDispatcher(const DevMem2D_& query, const Train& train, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) { - if (queryDescs.cols < 64) - matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); - else if (queryDescs.cols == 64) - matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); - else if (queryDescs.cols < 128) - matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); - else if (queryDescs.cols == 128 && cc_12) - matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); - else if (queryDescs.cols < 256 && cc_12) - matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); - else if (queryDescs.cols == 256 && cc_12) - matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); + if (query.cols < 64) + { + matchCached_caller<16, 16, 64, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 64) + { + matchCached_caller<16, 16, 64, true, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols < 128) + { + matchCached_caller<16, 16, 128, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 128 && cc >= 12) + { + matchCached_caller<16, 16, 128, true, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols < 256 && cc >= 12) + { + matchCached_caller<16, 16, 256, false, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } + else if (query.cols == 256 && cc >= 12) + { + matchCached_caller<16, 16, 256, true, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } else - matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream); + { + matchSimple_caller<16, 16, Dist>( + query, train, mask, + static_cast(trainIdx), static_cast(imgIdx), static_cast(distance), + stream); + } } + + /////////////////////////////////////////////////////////////////////////////// + // Match caller - template - void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream) + template void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) { - SingleTrain train((DevMem2D_)trainDescs); + SingleTrain train(static_cast< DevMem2D_ >(train_)); if (mask.data) - { - SingleMask m(mask); - matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); else - { - matchDispatcher< L1Dist >((DevMem2D_)queryDescs, 
train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); } - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - - template - void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream) + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) { - SingleTrain train((DevMem2D_)trainDescs); + SingleTrain train(static_cast< DevMem2D_ >(train_)); if (mask.data) - { - SingleMask m(mask); - matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); else - { - matchDispatcher((DevMem2D_)queryDescs, train, 
WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); } - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - - template - void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - bool cc_12, cudaStream_t stream) + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) { - SingleTrain train((DevMem2D_)trainDescs); + SingleTrain train(static_cast< DevMem2D_ >(train_)); if (mask.data) - { - SingleMask m(mask); - matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), train, SingleMask(mask), trainIdx, DevMem2D(), distance, cc, stream); else - { - matchDispatcher((DevMem2D_)queryDescs, train, 
WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream); } - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream); - template - void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, bool cc_12, cudaStream_t stream) + template void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); if (maskCollection.data) - { - MaskCollection mask(maskCollection.data); - matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); else - { - matchDispatcher< L1Dist >((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), train, WithOutMask(), 
trainIdx, imgIdx, distance, cc, stream); } - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - - template - void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, bool cc_12, cudaStream_t stream) + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); + TrainCollection 
train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); if (maskCollection.data) - { - MaskCollection mask(maskCollection.data); - matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); else - { - matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); } - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - - template - void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance, bool cc_12, cudaStream_t stream) + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template 
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + + template void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, + const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) { - TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); + TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, query.cols); if (maskCollection.data) - { - MaskCollection mask(maskCollection.data); - matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), train, MaskCollection(maskCollection.data), trainIdx, imgIdx, distance, cc, stream); else - { - matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream); - } + matchDispatcher(static_cast< DevMem2D_ >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); } - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); - template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); + template void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& 
maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); /////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// Knn Match //////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - template + template __device__ void distanceCalcLoop(const PtrStep_& query, const DevMem2D_& train, const Mask& m, int queryIdx, - typename Dist::ResultType& distMin1, typename Dist::ResultType& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, - typename Dist::ResultType* smem) + typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, + typename Dist::result_type* smem) { - ReduceDescCalculator reduceDescCalc; - - reduceDescCalc.prepare(query.ptr(queryIdx), train.cols, (typename Dist::ValueType*)smem); + const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x); + + typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y; - typename Dist::ResultType* sdiffRow = smem + BLOCK_DIM_X * threadIdx.y; + distMin1 = numeric_limits::max(); + distMin2 = numeric_limits::max(); - for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += BLOCK_DIM_Y) + bestTrainIdx1 = -1; + bestTrainIdx2 = -1; + + for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y) { if (m(queryIdx, trainIdx)) { @@ -796,48 +433,44 @@ namespace cv { namespace gpu { namespace bfmatcher const T* trainRow = train.ptr(trainIdx); - reduceDescCalc.calc(trainRow, train.cols, dist, sdiffRow); + vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); - if (threadIdx.x == 0) - { - typename Dist::ResultType val = dist; + const typename Dist::result_type val = dist; - if (val < distMin1) - { - distMin1 = val; - bestTrainIdx1 = trainIdx; - } - else if (val < distMin2) - { - distMin2 = val; - bestTrainIdx2 = trainIdx; - } + if (val < distMin1) + { + distMin1 = val; + bestTrainIdx1 = trainIdx; + } + else if (val < distMin2) + { + distMin2 = val; + bestTrainIdx2 = trainIdx; } } } } - template + template __global__ void knnMatch2(const PtrStep_ query, const DevMem2D_ train, const Mask m, PtrStep_ trainIdx, PtrStep_ distance) { - typedef typename Dist::ResultType ResultType; - typedef typename Dist::ValueType ValueType; + typedef typename Dist::result_type result_type; + typedef typename Dist::value_type value_type; - __shared__ ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; const int queryIdx = blockIdx.x; - ResultType distMin1 = numeric_limits::max(); - ResultType distMin2 = numeric_limits::max(); + result_type distMin1; + result_type distMin2; - int bestTrainIdx1 = -1; - int bestTrainIdx2 = -1; + int bestTrainIdx1; + int bestTrainIdx2; - distanceCalcLoop(query, train, m, queryIdx, - distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); + distanceCalcLoop(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem); __syncthreads(); - volatile ResultType* sdistMinRow = smem; + volatile result_type* sdistMinRow = smem; volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y); if (threadIdx.x == 0) @@ -852,8 +485,8 @@ namespace cv { namespace gpu { namespace bfmatcher if (threadIdx.x == 0 && threadIdx.y == 0) { - distMin1 = numeric_limits::max(); - distMin2 = numeric_limits::max(); + 
distMin1 = numeric_limits::max(); + distMin2 = numeric_limits::max(); bestTrainIdx1 = -1; bestTrainIdx2 = -1; @@ -861,7 +494,7 @@ namespace cv { namespace gpu { namespace bfmatcher #pragma unroll for (int i = 0; i < BLOCK_DIM_Y; ++i) { - ResultType val = sdistMinRow[i]; + result_type val = sdistMinRow[i]; if (val < distMin1) { @@ -878,7 +511,7 @@ namespace cv { namespace gpu { namespace bfmatcher #pragma unroll for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i) { - ResultType val = sdistMinRow[i]; + result_type val = sdistMinRow[i]; if (val < distMin2) { @@ -892,87 +525,131 @@ namespace cv { namespace gpu { namespace bfmatcher } } + /////////////////////////////////////////////////////////////////////////////// + // Knn 2 Match kernel caller + template - void knnMatch2Simple_caller(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, cudaStream_t stream) + void knnMatch2Simple_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) { - dim3 grid(queryDescs.rows, 1, 1); - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - knnMatch2, T> - <<>>(queryDescs, trainDescs, mask, trainIdx, distance); + knnMatch2, Dist, T> + <<>>(query, train, mask, trainIdx, distance); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - template - void knnMatch2Cached_caller(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, cudaStream_t stream) + + template + void knnMatch2Cached_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) { - StaticAssert= MAX_DESCRIPTORS_LEN>::check(); // block size must be greter than descriptors length - StaticAssert::check(); // max descriptors length must divide to blockDimX + StaticAssert= MAX_LEN>::check(); // block size must be greter than descriptors length + StaticAssert::check(); // max descriptors length must divide to blockDimX - dim3 grid(queryDescs.rows, 1, 1); - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, 1, 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - knnMatch2, T> - <<>>(queryDescs, trainDescs, mask, trainIdx, distance); + knnMatch2, Dist, T> + <<>>(query, train, mask, trainIdx, distance); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } + + /////////////////////////////////////////////////////////////////////////////// + // Knn 2 Match Dispatcher template void knnMatch2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, bool cc_12, cudaStream_t stream) + const DevMem2D& trainIdx, const DevMem2D& distance, + int cc, cudaStream_t stream) { if (query.cols < 64) - knnMatch2Cached_caller<16, 16, 64, false, Dist>(query, train, mask, trainIdx, distance, stream); + { + knnMatch2Cached_caller<16, 16, 64, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } else if (query.cols == 64) - knnMatch2Cached_caller<16, 16, 64, true, Dist>(query, train, mask, trainIdx, distance, stream); + { + knnMatch2Cached_caller<16, 16, 64, true, Dist>( + query, train, mask, + 
static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } else if (query.cols < 128) - knnMatch2Cached_caller<16, 16, 128, false, Dist>(query, train, mask, trainIdx, distance, stream); - else if (query.cols == 128 && cc_12) - knnMatch2Cached_caller<16, 16, 128, true, Dist>(query, train, mask, trainIdx, distance, stream); - else if (query.cols < 256 && cc_12) - knnMatch2Cached_caller<16, 16, 256, false, Dist>(query, train, mask, trainIdx, distance, stream); - else if (query.cols == 256 && cc_12) - knnMatch2Cached_caller<16, 16, 256, true, Dist>(query, train, mask, trainIdx, distance, stream); + { + knnMatch2Cached_caller<16, 16, 128, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols == 128 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 128, true, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols < 256 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 256, false, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } + else if (query.cols == 256 && cc >= 12) + { + knnMatch2Cached_caller<16, 16, 256, true, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } else - knnMatch2Simple_caller<16, 16, Dist>(query, train, mask, trainIdx, distance, stream); + { + knnMatch2Simple_caller<16, 16, Dist>( + query, train, mask, + static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(distance), + stream); + } } /////////////////////////////////////////////////////////////////////////////// // Calc distance kernel template - __global__ void calcDistance(const PtrStep_ queryDescs_, const DevMem2D_ trainDescs_, const Mask mask, PtrStepf distance) + __global__ void calcDistance(const PtrStep_ query, const DevMem2D_ train, const Mask mask, PtrStepf distance) { - __shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y]; - typename Dist::ResultType* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; + typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y; const int queryIdx = blockIdx.x; - const T* queryDescs = queryDescs_.ptr(queryIdx); + const T* queryDescs = query.ptr(queryIdx); const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - if (trainIdx < trainDescs_.rows) + if (trainIdx < train.rows) { - const T* trainDescs = trainDescs_.ptr(trainIdx); + const T* trainDescs = train.ptr(trainIdx); - typename Dist::ResultType myDist = numeric_limits::max(); + typename Dist::result_type myDist = numeric_limits::max(); if (mask(queryIdx, trainIdx)) { Dist dist; - reduceDescDiff(queryDescs, trainDescs, trainDescs_.cols, dist, sdiff_row); + calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); - if (threadIdx.x == 0) - myDist = dist; + myDist = dist; } if (threadIdx.x == 0) @@ -984,150 +661,24 @@ namespace cv { namespace gpu { namespace bfmatcher // Calc distance kernel caller template - void calcDistance_caller(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, - const Mask& mask, const DevMem2Df& distance, cudaStream_t stream) + void calcDistance_caller(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream) { - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - dim3 
grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); - calcDistance<<>>( - queryDescs, trainDescs, mask, distance); + calcDistance<<>>(query, train, mask, distance); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - - /////////////////////////////////////////////////////////////////////////////// - // warpReduceMinIdx - template - __device__ void warpReduceMinIdx(volatile T* sdist, volatile int* strainIdx, T& myMin, int tid) + template + void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream) { - if (tid < 32) - { - if (BLOCK_SIZE >= 64) - { - T reg = sdist[tid + 32]; - - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 32]; - } - } - if (BLOCK_SIZE >= 32) - { - T reg = sdist[tid + 16]; - - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 16]; - } - } - if (BLOCK_SIZE >= 16) - { - T reg = sdist[tid + 8]; - - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 8]; - } - } - if (BLOCK_SIZE >= 8) - { - T reg = sdist[tid + 4]; - - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 4]; - } - } - if (BLOCK_SIZE >= 4) - { - T reg = sdist[tid + 2]; - - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 2]; - } - } - if (BLOCK_SIZE >= 2) - { - T reg = sdist[tid + 1]; - - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 1]; - } - } - } + calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast(allDist), stream); } - - template - __device__ void reduceMinIdx(const T* dist, int n, T* sdist, int* strainIdx) - { - const int tid = threadIdx.x; - - T myMin = numeric_limits::max(); - int myMinIdx = -1; - - for (int i = tid; i < n; i += BLOCK_SIZE) - { - T reg = dist[i]; - if (reg < myMin) - { - myMin = reg; - myMinIdx = i; - } - } - - sdist[tid] = myMin; - strainIdx[tid] = myMinIdx; - __syncthreads(); - - if (BLOCK_SIZE >= 512 && tid < 256) - { - T reg = sdist[tid + 256]; - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 256]; - } - __syncthreads(); - } - if (BLOCK_SIZE >= 256 && tid < 128) - { - T reg = sdist[tid + 128]; - - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 128]; - } - __syncthreads(); - } - if (BLOCK_SIZE >= 128 && tid < 64) - { - T reg = sdist[tid + 64]; - - if (reg < myMin) - { - sdist[tid] = myMin = reg; - strainIdx[tid] = strainIdx[tid + 64]; - } - __syncthreads(); - } - - warpReduceMinIdx(sdist, strainIdx, myMin, tid); - } - /////////////////////////////////////////////////////////////////////////////// // find knn match kernel @@ -1143,14 +694,29 @@ namespace cv { namespace gpu { namespace bfmatcher int* trainIdx = trainIdx_.ptr(queryIdx); float* distance = distance_.ptr(queryIdx); - reduceMinIdx(allDist, allDist_.cols, sdist, strainIdx); + float dist = numeric_limits::max(); + int bestIdx = -1; + + for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE) + { + float reg = allDist[i]; + if (reg < dist) + { + dist = reg; + bestIdx = i; + } + } + + sdist[threadIdx.x] = dist; + strainIdx[threadIdx.x] = bestIdx; + __syncthreads(); + + reducePredVal(sdist, dist, strainIdx, bestIdx, threadIdx.x, less()); if (threadIdx.x == 0) { - float dist = sdist[0]; 
             if (dist < numeric_limits<float>::max())
             {
-                int bestIdx = strainIdx[0];
                 allDist[bestIdx] = numeric_limits<float>::max();
                 trainIdx[i] = bestIdx;
                 distance[i] = dist;
@@ -1161,13 +727,12 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////
     // find knn match kernel caller
 
-    template <int BLOCK_SIZE>
-    void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
+    template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
     {
-        dim3 threads(BLOCK_SIZE, 1, 1);
-        dim3 grid(trainIdx.rows, 1, 1);
+        const dim3 threads(BLOCK_SIZE, 1, 1);
+        const dim3 grid(trainIdx.rows, 1, 1);
 
-        for (int i = 0; i < knn; ++i)
+        for (int i = 0; i < k; ++i)
         {
             findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
             cudaSafeCall( cudaGetLastError() );
@@ -1176,121 +741,116 @@ namespace cv { namespace gpu { namespace bfmatcher
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // knn match caller
-
-    template <typename Dist, typename T, typename Mask>
-    void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
-    {
-        calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist, stream);
-    }
-    void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
+    void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)
     {
-        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist, stream);
+        findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);
     }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // knn match Dispatcher
 
-    template < typename Dist, typename T >
-    void knnMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
+    template <typename Dist, typename T>
+    void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream)
     {
         if (mask.data)
         {
-            if (knn == 2)
+            if (k == 2)
             {
-                knnMatch2Dispatcher<Dist>(queryDescs, trainDescs, SingleMask(mask), (DevMem2D_<int>)trainIdx, (DevMem2D_<float>)distance, cc_12, stream);
+                knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);
                 return;
             }
 
-            calcDistanceDispatcher<Dist>(queryDescs, trainDescs, SingleMask(mask), allDist, stream);
+            calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);
         }
         else
         {
-            if (knn == 2)
+            if (k == 2)
             {
-                knnMatch2Dispatcher<Dist>(queryDescs, trainDescs, WithOutMask(), (DevMem2D_<int>)trainIdx, (DevMem2D_<float>)distance, cc_12, stream);
+                knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);
                 return;
             }
 
-            calcDistanceDispatcher<Dist>(queryDescs, trainDescs, WithOutMask(), allDist, stream);
+            calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);
         }
 
-        findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
+        findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
     }
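For orientation, the k != 2 path above is a two-pass scheme: calcDistance fills the full query-by-train distance matrix (allDist), and findBestMatch is then launched k times, each launch extracting the current per-query minimum and overwriting it with FLT_MAX so the next launch yields the next-best neighbour. The host-side C++ sketch below mirrors that scheme on the CPU purely as an illustration; the names it uses (knnTwoPassCPU and its parameters) are not part of this patch, and it assumes a plain L2 distance where the GPU code is templated on Dist.

// Illustrative CPU reference for the two-pass kNN scheme used above:
// pass 1 computes the full distance matrix, pass 2 extracts the best
// match k times per query, masking each winner with FLT_MAX.
// All names here are hypothetical; only the algorithm mirrors the patch.
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <vector>

static void knnTwoPassCPU(const std::vector<float>& query, const std::vector<float>& train,
                          int queryRows, int trainRows, int cols, int k,
                          std::vector<int>& trainIdx, std::vector<float>& distance)
{
    std::vector<float> allDist(queryRows * trainRows);

    // Pass 1: distance matrix (L2 here; the GPU code is parameterised on Dist).
    for (int q = 0; q < queryRows; ++q)
        for (int t = 0; t < trainRows; ++t)
        {
            float sum = 0.f;
            for (int c = 0; c < cols; ++c)
            {
                const float d = query[q * cols + c] - train[t * cols + c];
                sum += d * d;
            }
            allDist[q * trainRows + t] = std::sqrt(sum);
        }

    // Pass 2: k extractions per query, like k launches of findBestMatch.
    trainIdx.assign(queryRows * k, -1);
    distance.assign(queryRows * k, FLT_MAX);
    for (int q = 0; q < queryRows; ++q)
        for (int i = 0; i < k; ++i)
        {
            int bestIdx = -1;
            float best = FLT_MAX;
            for (int t = 0; t < trainRows; ++t)
                if (allDist[q * trainRows + t] < best)
                {
                    best = allDist[q * trainRows + t];
                    bestIdx = t;
                }
            if (bestIdx >= 0)
            {
                allDist[q * trainRows + bestIdx] = FLT_MAX; // mask the winner for the next pass
                trainIdx[q * k + i] = bestIdx;
                distance[q * k + i] = best;
            }
        }
}

int main()
{
    const float q[] = { 0.f, 0.f,   1.f, 1.f };             // 2 queries, 2 dims
    const float t[] = { 0.f, 1.f,   2.f, 2.f,   1.f, 0.f }; // 3 train descriptors
    std::vector<int> idx;
    std::vector<float> dist;
    knnTwoPassCPU(std::vector<float>(q, q + 4), std::vector<float>(t, t + 6), 2, 3, 2, 2, idx, dist);
    for (int i = 0; i < 4; ++i)
        printf("%d %f\n", idx[i], dist[i]);
    return 0;
}

Masking each winner with FLT_MAX instead of re-sorting keeps every pass linear in trainRows per query, which is why the k sequential findBestMatch launches stay cheap next to the distance pass.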
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // knn match caller
 
-    template <typename T>
-    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
+    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, 
+        int cc, cudaStream_t stream)
     {
-        knnMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream);
+        knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
     }
 
-    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
-    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
+    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
 
-    template <typename T>
-    void knnMatchL2_gpu(const DevMem2D& queryDescs, const
DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream) + template void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream) { - knnMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream); + knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); } - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); - template - void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& 
distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream) + template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, + int cc, cudaStream_t stream) { - knnMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream); + knnMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, mask, trainIdx, distance, allDist, cc, stream); } - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); - template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); + template void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); /////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////// Radius Match ////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////// - - /////////////////////////////////////////////////////////////////////////////// - // Radius Match kernel template - __global__ void radiusMatch(const PtrStep_ queryDescs_, const DevMem2D_ trainDescs_, - float maxDistance, const Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance) + __global__ void radiusMatch(const PtrStep_ query, const DevMem2D_ train, float maxDistance, const Mask mask, + 
DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance) { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + #if __CUDA_ARCH__ >= 110 - __shared__ typename Dist::ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; - typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; + typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; const int queryIdx = blockIdx.x; - const T* queryDescs = queryDescs_.ptr(queryIdx); + const T* queryDescs = query.ptr(queryIdx); const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - if (trainIdx < trainDescs_.rows) + if (trainIdx < train.rows) { - const T* trainDescs = trainDescs_.ptr(trainIdx); + const T* trainDescs = train.ptr(trainIdx); if (mask(queryIdx, trainIdx)) { Dist dist; - reduceDescDiff(queryDescs, trainDescs, trainDescs_.cols, dist, sdiff_row); + calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); if (threadIdx.x == 0) { @@ -1314,15 +874,14 @@ namespace cv { namespace gpu { namespace bfmatcher // Radius Match kernel caller template - void radiusMatch_caller(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, - float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, - const DevMem2Df& distance, cudaStream_t stream) + void radiusMatch_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2D_& nMatches, const DevMem2Df& distance, + cudaStream_t stream) { - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1); + const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); - radiusMatch<<>>( - queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance); + radiusMatch<<>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -1330,82 +889,92 @@ namespace cv { namespace gpu { namespace bfmatcher } /////////////////////////////////////////////////////////////////////////////// - // Radius Match caller + // Radius Match Dispatcher template - void radiusMatchDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, - float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, - const DevMem2Df& distance, cudaStream_t stream) + void radiusMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) { - radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask, - trainIdx, nMatches, distance, stream); + radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask, + static_cast(trainIdx), static_cast< const DevMem2D_ >(nMatches), static_cast(distance), + stream); } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match caller - template - void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream) + template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + 
cudaStream_t stream) { if (mask.data) { - radiusMatchDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream); + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); } else { - radiusMatchDispatcher< L1Dist >((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream); + radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, nMatches, distance, + stream); } } - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template - void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& 
trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream) + template void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) { if (mask.data) { - radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream); + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); } else { - radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream); + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, nMatches, distance, + stream); } } - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t 
stream); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); - template - void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, - const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream) + template void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, + const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + cudaStream_t stream) { if (mask.data) { - radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream); + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, nMatches, distance, + stream); } else { - radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, - maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream); + radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, nMatches, distance, + stream); } } - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); - template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); + template void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& 
trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); }}} diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index a22077e..709d62b 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -566,8 +566,8 @@ namespace cv { namespace gpu { namespace surf float* s_sum_row = s_sum + threadIdx.y * 32; - warpReduce32(s_sum_row, sumx, threadIdx.x, plus()); - warpReduce32(s_sum_row, sumy, threadIdx.x, plus()); + reduce<32>(s_sum_row, sumx, threadIdx.x, plus()); + reduce<32>(s_sum_row, sumy, threadIdx.x, plus()); const float temp_mod = sumx * sumx + sumy * sumy; if (temp_mod > best_mod) diff --git a/modules/gpu/src/opencv2/gpu/device/color.hpp b/modules/gpu/src/opencv2/gpu/device/color.hpp index d620ead..f6bdde9 100644 --- a/modules/gpu/src/opencv2/gpu/device/color.hpp +++ b/modules/gpu/src/opencv2/gpu/device/color.hpp @@ -43,7 +43,7 @@ #ifndef __OPENCV_GPU_COLOR_HPP__ #define __OPENCV_GPU_COLOR_HPP__ -#include "detail/color.hpp" +#include "detail/color_detail.hpp" namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp index 407aea2..c8937c1 100644 --- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp +++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp @@ -44,7 +44,14 @@ #define __OPENCV_GPU_DATAMOV_UTILS_HPP__ #include "internal_shared.hpp" -#include "utility.hpp" + +#if defined(_WIN64) || defined(__LP64__) + // 64-bit register modifier for inlined asm + #define OPENCV_GPU_ASM_PTR "l" +#else + // 32-bit register modifier for inlined asm + #define OPENCV_GPU_ASM_PTR "r" +#endif namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/src/opencv2/gpu/device/detail/color.hpp b/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp similarity index 100% rename from modules/gpu/src/opencv2/gpu/device/detail/color.hpp rename to modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp similarity index 100% rename from modules/gpu/src/opencv2/gpu/device/detail/transform.hpp rename to modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp diff --git a/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp new file mode 100644 index 0000000..f6acce1 --- /dev/null +++ b/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp @@ -0,0 +1,186 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__ +#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__ + +#include "../vec_traits.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace detail + { + template struct Select { typedef T1 type; }; + template struct Select { typedef T2 type; }; + + template struct IsSignedIntergral { enum {value = 0}; }; + template <> struct IsSignedIntergral { enum {value = 1}; }; + template <> struct IsSignedIntergral { enum {value = 1}; }; + template <> struct IsSignedIntergral { enum {value = 1}; }; + template <> struct IsSignedIntergral { enum {value = 1}; }; + template <> struct IsSignedIntergral { enum {value = 1}; }; + template <> struct IsSignedIntergral { enum {value = 1}; }; + + template struct IsUnsignedIntegral { enum {value = 0}; }; + template <> struct IsUnsignedIntegral { enum {value = 1}; }; + template <> struct IsUnsignedIntegral { enum {value = 1}; }; + template <> struct IsUnsignedIntegral { enum {value = 1}; }; + template <> struct IsUnsignedIntegral { enum {value = 1}; }; + template <> struct IsUnsignedIntegral { enum {value = 1}; }; + template <> struct IsUnsignedIntegral { enum {value = 1}; }; + + template struct IsIntegral { enum {value = IsSignedIntergral::value || IsUnsignedIntegral::value}; }; + template <> struct IsIntegral { enum {value = 1}; }; + template <> struct IsIntegral { enum {value = 1}; }; + + template struct IsFloat { enum {value = 0}; }; + template <> struct IsFloat { enum {value = 1}; }; + template <> struct IsFloat { enum {value = 1}; }; + + template struct IsVec { enum {value = 0}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> 
struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + template <> struct IsVec { enum {value = 1}; }; + + template struct AddParameterType { typedef const U& type; }; + template struct AddParameterType { typedef U& type; }; + template <> struct AddParameterType { typedef void type; }; + + template struct ReferenceTraits + { + enum { value = false }; + typedef U type; + }; + template struct ReferenceTraits + { + enum { value = true }; + typedef U type; + }; + + template struct PointerTraits + { + enum { value = false }; + typedef void type; + }; + template struct PointerTraits + { + enum { value = true }; + typedef U type; + }; + template struct PointerTraits + { + enum { value = true }; + typedef U type; + }; + + template struct UnConst + { + typedef U type; + enum { value = 0 }; + }; + template struct UnConst + { + typedef U type; + enum { value = 1 }; + }; + template struct UnConst + { + typedef U& type; + enum { value = 1 }; + }; + + template struct UnVolatile + { + typedef U type; + enum { value = 0 }; + }; + template struct UnVolatile + { + typedef U type; + enum { value = 1 }; + }; + template struct UnVolatile + { + typedef U& type; + enum { value = 1 }; + }; + } +}}} + +#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp new file mode 100644 index 0000000..de3f3f7 --- /dev/null +++ b/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp @@ -0,0 +1,576 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__ +#define __OPENCV_GPU_UTILITY_DETAIL_HPP__ + +namespace cv { namespace gpu { namespace device +{ + namespace detail + { + /////////////////////////////////////////////////////////////////////////////// + // Reduction + + template struct WarpReductor + { + template static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + { + if (tid < n) + data[tid] = partial_reduction; + if (n > 32) __syncthreads(); + + if (n > 32) + { + if (tid < n - 32) + data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); + if (tid < 16) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); + } + } + else if (n > 16) + { + if (tid < n - 16) + data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); + if (tid < 8) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); + } + } + else if (n > 8) + { + if (tid < n - 8) + data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); + if (tid < 4) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); + } + } + else if (n > 4) + { + if (tid < n - 4) + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); + if (tid < 2) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 
2]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); + } + } + else if (n > 2) + { + if (tid < n - 2) + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); + if (tid < 2) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); + } + } + } + }; + template <> struct WarpReductor<64> + { + template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + { + data[tid] = partial_reduction; + __syncthreads(); + + if (tid < 32) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + } + } + }; + template <> struct WarpReductor<32> + { + template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + { + data[tid] = partial_reduction; + + if (tid < 16) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + } + } + }; + template <> struct WarpReductor<16> + { + template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + { + data[tid] = partial_reduction; + + if (tid < 8) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + } + } + }; + template <> struct WarpReductor<8> + { + template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + { + data[tid] = partial_reduction; + + if (tid < 4) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + } + } + }; + + template struct ReductionDispatcher; + template <> struct ReductionDispatcher + { + template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + { + WarpReductor::reduce(data, partial_reduction, tid, op); + } + }; + template <> struct ReductionDispatcher + { + template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + { + if (tid < n) + data[tid] = partial_reduction; + __syncthreads(); + + + if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); } + if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); } + if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); } + + if (tid < 32) + { + data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); + data[tid] 
= partial_reduction = op(partial_reduction, data[tid + 8]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); + } + } + }; + + + template struct PredValWarpReductor; + template <> struct PredValWarpReductor<64> + { + template + static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) + { + if (tid < 32) + { + myData = sdata[tid]; + myVal = sval[tid]; + + T reg = sdata[tid + 32]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 32]; + } + + reg = sdata[tid + 16]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 16]; + } + + reg = sdata[tid + 8]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 8]; + } + + reg = sdata[tid + 4]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 4]; + } + + reg = sdata[tid + 2]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 2]; + } + + reg = sdata[tid + 1]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 1]; + } + } + } + }; + template <> struct PredValWarpReductor<32> + { + template + static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) + { + if (tid < 16) + { + myData = sdata[tid]; + myVal = sval[tid]; + + T reg = sdata[tid + 16]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 16]; + } + + reg = sdata[tid + 8]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 8]; + } + + reg = sdata[tid + 4]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 4]; + } + + reg = sdata[tid + 2]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 2]; + } + + reg = sdata[tid + 1]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 1]; + } + } + } + }; + + template <> struct PredValWarpReductor<16> + { + template + static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) + { + if (tid < 8) + { + myData = sdata[tid]; + myVal = sval[tid]; + + T reg = reg = sdata[tid + 8]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 8]; + } + + reg = sdata[tid + 4]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 4]; + } + + reg = sdata[tid + 2]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 2]; + } + + reg = sdata[tid + 1]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 1]; + } + } + } + }; + template <> struct PredValWarpReductor<8> + { + template + static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) + { + if (tid < 4) + { + myData = sdata[tid]; + myVal = sval[tid]; + + T reg = reg = sdata[tid + 4]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 4]; + } + + reg = sdata[tid + 2]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 2]; + } + + reg = sdata[tid + 1]; + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + 
sval[tid] = myVal = sval[tid + 1]; + } + } + } + }; + + template struct PredValReductionDispatcher; + template <> struct PredValReductionDispatcher + { + template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) + { + PredValWarpReductor::reduce(myData, myVal, sdata, sval, tid, pred); + } + }; + template <> struct PredValReductionDispatcher + { + template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) + { + myData = sdata[tid]; + myVal = sval[tid]; + + if (n >= 512 && tid < 256) + { + T reg = sdata[tid + 256]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 256]; + } + __syncthreads(); + } + if (n >= 256 && tid < 128) + { + T reg = sdata[tid + 128]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 128]; + } + __syncthreads(); + } + if (n >= 128 && tid < 64) + { + T reg = sdata[tid + 64]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 64]; + } + __syncthreads(); + } + + if (tid < 32) + { + if (n >= 64) + { + T reg = sdata[tid + 32]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 32]; + } + } + if (n >= 32) + { + T reg = sdata[tid + 16]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 16]; + } + } + if (n >= 16) + { + T reg = sdata[tid + 8]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 8]; + } + } + if (n >= 8) + { + T reg = sdata[tid + 4]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 4]; + } + } + if (n >= 4) + { + T reg = sdata[tid + 2]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 2]; + } + } + if (n >= 2) + { + T reg = sdata[tid + 1]; + + if (pred(reg, myData)) + { + sdata[tid] = myData = reg; + sval[tid] = myVal = sval[tid + 1]; + } + } + } + } + }; + + /////////////////////////////////////////////////////////////////////////////// + // Vector Distance + + template struct UnrollVecDiffCached + { + template + static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind) + { + if (ind < len) + { + T1 val1 = *vecCached++; + + T2 val2; + ForceGlob::Load(vecGlob, ind, val2); + + dist.reduceIter(val1, val2); + + UnrollVecDiffCached::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM); + } + } + + template + static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist) + { + T1 val1 = *vecCached++; + + T2 val2; + ForceGlob::Load(vecGlob, 0, val2); + vecGlob += THREAD_DIM; + + dist.reduceIter(val1, val2); + + UnrollVecDiffCached::calcWithoutCheck(vecCached, vecGlob, dist); + } + }; + template struct UnrollVecDiffCached + { + template + static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int) + { + } + + template + static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&) + { + } + }; + + template struct VecDiffCachedCalculator; + template struct VecDiffCachedCalculator + { + template + static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid) + { + UnrollVecDiffCached::calcCheck(vecCached, vecGlob, len, dist, tid); + } + }; + template struct VecDiffCachedCalculator + { + template + static __device__ __forceinline__ void 
calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid) + { + UnrollVecDiffCached::calcWithoutCheck(vecCached, vecGlob + tid, dist); + } + }; + } +}}} + +#endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp index 151c03a..f9c8d81 100644 --- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp @@ -55,7 +55,7 @@ namespace cv { #if __CUDA_ARCH__ >= 200 (void)cta_buffer; - return __ballot(predicat); + return __ballot(predicate); #else int tid = threadIdx.x; cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0; diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp index be3ea7d..58af91d 100644 --- a/modules/gpu/src/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp @@ -47,6 +47,7 @@ #include "internal_shared.hpp" #include "saturate_cast.hpp" #include "vec_traits.hpp" +#include "type_traits.hpp" namespace cv { namespace gpu { namespace device { @@ -57,55 +58,188 @@ namespace cv { namespace gpu { namespace device // Arithmetic Operations - using thrust::plus; - using thrust::minus; - using thrust::multiplies; - using thrust::divides; - using thrust::modulus; - using thrust::negate; + template struct plus : binary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a + b; + } + }; + template struct minus : binary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a - b; + } + }; + template struct multiplies : binary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a * b; + } + }; + template struct divides : binary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a / b; + } + }; + template struct modulus : binary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a % b; + } + }; + template struct negate : unary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a) const + { + return -a; + } + }; // Comparison Operations - using thrust::equal_to; - using thrust::not_equal_to; - using thrust::greater; - using thrust::less; - using thrust::greater_equal; - using thrust::less_equal; + template struct equal_to : binary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a == b; + } + }; + template struct not_equal_to : binary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a != b; + } + }; + template struct greater : binary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a > b; + } + }; + template struct less : binary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a < b; + } + }; + 
template struct greater_equal : binary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a >= b; + } + }; + template struct less_equal : binary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a <= b; + } + }; // Logical Operations - using thrust::logical_and; - using thrust::logical_or; - using thrust::logical_not; + template struct logical_and : binary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a && b; + } + }; + template struct logical_or : binary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a || b; + } + }; + template struct logical_not : unary_function + { + __device__ __forceinline__ bool operator ()(typename TypeTraits::ParameterType a) const + { + return !a; + } + }; // Bitwise Operations - using thrust::bit_and; - using thrust::bit_or; - using thrust::bit_xor; + template struct bit_and : binary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a & b; + } + }; + template struct bit_or : binary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a | b; + } + }; + template struct bit_xor : binary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType a, typename TypeTraits::ParameterType b) const + { + return a ^ b; + } + }; template struct bit_not : unary_function { - __forceinline__ __device__ T operator ()(const T& v) const {return ~v;} + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType v) const + { + return ~v; + } }; // Generalized Identity Operations - using thrust::identity; - using thrust::project1st; - using thrust::project2nd; + template struct identity : unary_function + { + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType x) const + { + return x; + } + }; + + template struct project1st : binary_function + { + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + { + return lhs; + } + }; + template struct project2nd : binary_function + { + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + { + return rhs; + } + }; // Min/Max Operations #define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \ template <> struct name : binary_function \ { \ - __forceinline__ __device__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \ + __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \ }; template struct maximum : binary_function { - __forceinline__ __device__ T operator()(const T& lhs, const T& rhs) const {return lhs < rhs ? rhs : lhs;} + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + { + return lhs < rhs ? 
rhs : lhs; + } }; OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, max) @@ -119,7 +253,10 @@ namespace cv { namespace gpu { namespace device template struct minimum : binary_function { - __forceinline__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;} + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + { + return lhs < rhs ? lhs : rhs; + } }; OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, min) @@ -138,14 +275,14 @@ namespace cv { namespace gpu { namespace device #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \ template struct func ## _func : unary_function \ { \ - __forceinline__ __device__ float operator ()(const T& v) const \ + __device__ __forceinline__ float operator ()(typename TypeTraits::ParameterType v) const \ { \ return func ## f(v); \ } \ }; \ template <> struct func ## _func : unary_function \ { \ - __forceinline__ __device__ double operator ()(double v) const \ + __device__ __forceinline__ double operator ()(double v) const \ { \ return func(v); \ } \ @@ -153,14 +290,14 @@ namespace cv { namespace gpu { namespace device #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \ template struct func ## _func : binary_function \ { \ - __forceinline__ __device__ float operator ()(const T& v1, const T& v2) const \ + __device__ __forceinline__ float operator ()(typename TypeTraits::ParameterType v1, typename TypeTraits::ParameterType v2) const \ { \ return func ## f(v1, v2); \ } \ }; \ template <> struct func ## _func : binary_function \ { \ - __forceinline__ __device__ double operator ()(double v1, double v2) const \ + __device__ __forceinline__ double operator ()(double v1, double v2) const \ { \ return func(v1, v2); \ } \ @@ -196,7 +333,7 @@ namespace cv { namespace gpu { namespace device template struct hypot_sqr_func : binary_function { - __forceinline__ __device__ T operator ()(T src1, T src2) const + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType src1, typename TypeTraits::ParameterType src2) const { return src1 * src1 + src2 * src2; } @@ -206,7 +343,7 @@ namespace cv { namespace gpu { namespace device template struct saturate_cast_func : unary_function { - __forceinline__ __device__ D operator ()(const T& v) const + __device__ __forceinline__ D operator ()(typename TypeTraits::ParameterType v) const { return saturate_cast(v); } @@ -216,11 +353,11 @@ namespace cv { namespace gpu { namespace device template struct thresh_binary_func : unary_function { - __forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} + __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} - __forceinline__ __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType src) const { - return src > thresh ? 
maxVal : 0; + return (src > thresh) * maxVal; } const T thresh; @@ -228,11 +365,11 @@ namespace cv { namespace gpu { namespace device }; template struct thresh_binary_inv_func : unary_function { - __forceinline__ __host__ __device__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} + __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} - __forceinline__ __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType src) const { - return src > thresh ? 0 : maxVal; + return (src <= thresh) * maxVal; } const T thresh; @@ -240,9 +377,9 @@ namespace cv { namespace gpu { namespace device }; template struct thresh_trunc_func : unary_function { - explicit __forceinline__ __host__ __device__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} + explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} - __forceinline__ __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType src) const { return minimum()(src, thresh); } @@ -251,22 +388,22 @@ namespace cv { namespace gpu { namespace device }; template struct thresh_to_zero_func : unary_function { - explicit __forceinline__ __host__ __device__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} + explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} - __forceinline__ __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType src) const { - return src > thresh ? src : 0; + return (src > thresh) * src; } const T thresh; }; template struct thresh_to_zero_inv_func : unary_function { - explicit __forceinline__ __host__ __device__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} + explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} - __forceinline__ __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType src) const { - return src > thresh ? 
0 : src; + return (src <= thresh) * src; } const T thresh; @@ -274,17 +411,43 @@ namespace cv { namespace gpu { namespace device // Function Object Adaptors - using thrust::unary_negate; - using thrust::not1; + template struct unary_negate : unary_function + { + explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {} + + __device__ __forceinline__ bool operator()(typename TypeTraits::ParameterType x) const + { + return !pred(x); + } + + const Predicate pred; + }; + template __host__ __device__ __forceinline__ unary_negate not1(const Predicate& pred) + { + return unary_negate(pred); + } - using thrust::binary_negate; - using thrust::not2; + template struct binary_negate : binary_function + { + explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {} + + __device__ __forceinline__ bool operator()(typename TypeTraits::ParameterType x, typename TypeTraits::ParameterType y) const + { + return !pred(x,y); + } + + const Predicate pred; + }; + template __host__ __device__ __forceinline__ binary_negate not2(const BinaryPredicate& pred) + { + return binary_negate(pred); + } template struct binder1st : unary_function { - __forceinline__ __host__ __device__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {} + __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {} - __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::second_argument_type& a) const + __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits::ParameterType a) const { return op(arg1, a); } @@ -292,15 +455,16 @@ namespace cv { namespace gpu { namespace device const Op op; const typename Op::first_argument_type arg1; }; - template static __forceinline__ __host__ __device__ binder1st bind1st(const Op& op, const T& x) + template __host__ __device__ __forceinline__ binder1st bind1st(const Op& op, const T& x) { return binder1st(op, typename Op::first_argument_type(x)); } + template struct binder2nd : unary_function { - __forceinline__ __host__ __device__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {} + __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {} - __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::first_argument_type& a) const + __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits::ParameterType a) const { return op(a, arg2); } @@ -308,7 +472,7 @@ namespace cv { namespace gpu { namespace device const Op op; const typename Op::second_argument_type arg2; }; - template static __forceinline__ __host__ __device__ binder2nd bind2nd(const Op& op, const T& x) + template __host__ __device__ __forceinline__ binder2nd bind2nd(const Op& op, const T& x) { return binder2nd(op, typename Op::second_argument_type(x)); } @@ -317,24 +481,28 @@ namespace cv { namespace gpu { namespace device template struct IsUnaryFunction { - struct Yes {}; + typedef char Yes; struct No {Yes a[2];}; - template static Yes check(unary_function*); + template static Yes check(unary_function); static No check(...); - enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) }; + static F makeF(); + + enum { value = (sizeof(check(makeF())) == sizeof(Yes)) }; }; template struct IsBinaryFunction { - struct Yes {}; + typedef char Yes; struct 
No {Yes a[2];}; - template static Yes check(binary_function*); + template static Yes check(binary_function); static No check(...); - enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) }; + static F makeF(); + + enum { value = (sizeof(check(makeF())) == sizeof(Yes)) }; }; namespace detail diff --git a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp index 55c9cb9..ca6159f 100644 --- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp @@ -47,29 +47,29 @@ namespace cv { namespace gpu { namespace device { - template static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } - template static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); } - template static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); } - template static __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); } - template static __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); } - template static __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); } - template static __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); } - template static __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } + template __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } + template __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); } + template __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); } + template __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); } + template __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); } + template __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); } + template __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); } + template __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } - template<> static __device__ __forceinline__ uchar saturate_cast(schar v) + template<> __device__ __forceinline__ uchar saturate_cast(schar v) { return (uchar)max((int)v, 0); } - template<> static __device__ __forceinline__ uchar saturate_cast(ushort v) + template<> __device__ __forceinline__ uchar saturate_cast(ushort v) { return (uchar)min((uint)v, (uint)UCHAR_MAX); } - template<> static __device__ __forceinline__ uchar saturate_cast(int v) + template<> __device__ __forceinline__ uchar saturate_cast(int v) { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0); } - template<> static __device__ __forceinline__ uchar saturate_cast(uint v) + template<> __device__ __forceinline__ uchar saturate_cast(uint v) { return (uchar)min(v, (uint)UCHAR_MAX); } - template<> static __device__ __forceinline__ uchar saturate_cast(short v) + template<> __device__ __forceinline__ uchar saturate_cast(short v) { return saturate_cast((uint)v); } - template<> static __device__ __forceinline__ uchar saturate_cast(float v) + template<> __device__ __forceinline__ uchar saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ __forceinline__ uchar saturate_cast(double v) + template<> __device__ __forceinline__ uchar saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -78,23 +78,23 @@ namespace cv { namespace gpu { namespace device #endif } - template<> static __device__ __forceinline__ schar saturate_cast(uchar v) + template<> __device__ __forceinline__ schar saturate_cast(uchar v) { return (schar)min((int)v, SCHAR_MAX); } - template<> static __device__ __forceinline__ schar saturate_cast(ushort v) + template<> __device__ __forceinline__ schar saturate_cast(ushort v) { return (schar)min((uint)v, (uint)SCHAR_MAX); } - template<> static __device__ __forceinline__ schar saturate_cast(int v) + template<> __device__ __forceinline__ schar saturate_cast(int v) { return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } - template<> static __device__ __forceinline__ schar saturate_cast(short v) + template<> __device__ __forceinline__ schar saturate_cast(short v) { return saturate_cast((int)v); } - template<> static __device__ __forceinline__ schar saturate_cast(uint v) + template<> __device__ __forceinline__ schar saturate_cast(uint v) { return (schar)min(v, (uint)SCHAR_MAX); } - template<> static __device__ __forceinline__ schar saturate_cast(float v) + template<> __device__ __forceinline__ schar saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ __forceinline__ schar saturate_cast(double v) + template<> __device__ __forceinline__ schar saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -103,17 +103,17 @@ namespace cv { namespace gpu { namespace device #endif } - template<> static __device__ __forceinline__ ushort saturate_cast(schar v) + template<> __device__ __forceinline__ ushort saturate_cast(schar v) { return (ushort)max((int)v, 0); } - template<> static __device__ __forceinline__ ushort saturate_cast(short v) + template<> __device__ __forceinline__ ushort saturate_cast(short v) { return (ushort)max((int)v, 0); } - template<> static __device__ __forceinline__ ushort saturate_cast(int v) + template<> __device__ __forceinline__ ushort saturate_cast(int v) { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } - template<> static __device__ __forceinline__ ushort saturate_cast(uint v) + template<> __device__ __forceinline__ ushort saturate_cast(uint v) { return (ushort)min(v, (uint)USHRT_MAX); } - template<> static __device__ __forceinline__ ushort saturate_cast(float v) + template<> __device__ __forceinline__ ushort saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ __forceinline__ ushort saturate_cast(double v) + template<> __device__ __forceinline__ ushort saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -122,18 +122,18 @@ namespace cv { namespace gpu { namespace device #endif } - template<> static __device__ __forceinline__ short saturate_cast(ushort v) + template<> __device__ __forceinline__ short saturate_cast(ushort v) { return (short)min((int)v, SHRT_MAX); } - template<> static __device__ __forceinline__ short saturate_cast(int v) + template<> __device__ __forceinline__ short saturate_cast(int v) { return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } - template<> static __device__ __forceinline__ short saturate_cast(uint v) + template<> __device__ __forceinline__ short saturate_cast(uint v) { return (short)min(v, (uint)SHRT_MAX); } - template<> static __device__ __forceinline__ short saturate_cast(float v) + template<> __device__ __forceinline__ short saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ __forceinline__ short saturate_cast(double v) + template<> __device__ __forceinline__ short saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -142,8 +142,8 @@ namespace cv { namespace gpu { namespace device #endif } - template<> static __device__ __forceinline__ int saturate_cast(float v) { return __float2int_rn(v); } - template<> static __device__ __forceinline__ int saturate_cast(double v) + template<> __device__ __forceinline__ int saturate_cast(float v) { return __float2int_rn(v); } + template<> __device__ __forceinline__ int saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 return __double2int_rn(v); @@ -152,8 +152,8 @@ namespace cv { namespace gpu { namespace device #endif } - template<> static __device__ __forceinline__ uint saturate_cast(float v){ return __float2uint_rn(v); } - template<> static __device__ __forceinline__ uint saturate_cast(double v) + template<> __device__ __forceinline__ uint saturate_cast(float v){ return __float2uint_rn(v); } + template<> __device__ __forceinline__ uint saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 return __double2uint_rn(v); diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp index 4f756e3..92d5065 100644 --- a/modules/gpu/src/opencv2/gpu/device/transform.hpp +++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -43,33 +43,31 @@ #ifndef __OPENCV_GPU_TRANSFORM_HPP__ #define __OPENCV_GPU_TRANSFORM_HPP__ -#include "detail/transform.hpp" +#include "detail/transform_detail.hpp" +#include "utility.hpp" namespace cv { namespace gpu { namespace device { template - static void transform(const DevMem2D_& src, const DevMem2D_& dst, const UnOp& op, cudaStream_t stream = 0) + void transform(const DevMem2D_& src, const DevMem2D_& dst, const UnOp& op, cudaStream_t stream = 0) { - 
detail::transform_caller(src, dst, op, detail::NoMask(), stream); + detail::transform_caller(src, dst, op, WithOutMask(), stream); } template - static void transform(const DevMem2D_& src, const DevMem2D_& dst, const PtrStep& mask, const UnOp& op, - cudaStream_t stream = 0) + void transform(const DevMem2D_& src, const DevMem2D_& dst, const PtrStep& mask, const UnOp& op, cudaStream_t stream = 0) { - detail::transform_caller(src, dst, op, detail::MaskReader(mask), stream); + detail::transform_caller(src, dst, op, SingleMask(mask), stream); } template - static void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, - const BinOp& op, cudaStream_t stream = 0) + void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, const BinOp& op, cudaStream_t stream = 0) { - detail::transform_caller(src1, src2, dst, op, detail::NoMask(), stream); + detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream); } template - static void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, - const PtrStep& mask, const BinOp& op, cudaStream_t stream = 0) + void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, const PtrStep& mask, const BinOp& op, cudaStream_t stream = 0) { - detail::transform_caller(src1, src2, dst, op, detail::MaskReader(mask), stream); + detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream); } }}} diff --git a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp new file mode 100644 index 0000000..24f02ef --- /dev/null +++ b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp @@ -0,0 +1,80 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__ +#define __OPENCV_GPU_TYPE_TRAITS_HPP__ + +#include "detail/type_traits_detail.hpp" + +namespace cv { namespace gpu { namespace device +{ + template struct IsSimpleParameter + { + enum {value = detail::IsIntegral::value || detail::IsFloat::value || detail::PointerTraits::type>::value}; + }; + + template struct TypeTraits + { + typedef typename detail::UnConst::type NonConstType; + typedef typename detail::UnVolatile::type NonVolatileType; + typedef typename detail::UnVolatile::type>::type UnqualifiedType; + typedef typename detail::PointerTraits::type PointeeType; + typedef typename detail::ReferenceTraits::type ReferredType; + + enum { isConst = detail::UnConst::value }; + enum { isVolatile = detail::UnVolatile::value }; + + enum { isReference = detail::ReferenceTraits::value }; + enum { isPointer = detail::PointerTraits::type>::value }; + + enum { isUnsignedInt = detail::IsUnsignedIntegral::value }; + enum { isSignedInt = detail::IsSignedIntergral::value }; + enum { isIntegral = detail::IsIntegral::value }; + enum { isFloat = detail::IsFloat::value }; + enum { isArith = isIntegral || isFloat }; + enum { isVec = detail::IsVec::value }; + + typedef typename detail::Select::value, T, typename detail::AddParameterType::type>::type ParameterType; + }; +}}} + +#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp index b0dca8a..3fd84a0 100644 --- a/modules/gpu/src/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp @@ -45,112 +45,275 @@ #include "internal_shared.hpp" #include "saturate_cast.hpp" - -#ifndef __CUDA_ARCH__ - #define __CUDA_ARCH__ 0 -#endif +#include "datamov_utils.hpp" +#include "functional.hpp" +#include "detail/utility_detail.hpp" #define OPENCV_GPU_LOG_WARP_SIZE (5) #define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE) #define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 
5 : 4) // 32 banks on fermi, 16 on tesla #define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS) -#if defined(_WIN64) || defined(__LP64__) - // 64-bit register modifier for inlined asm - #define OPENCV_GPU_ASM_PTR "l" -#else - // 32-bit register modifier for inlined asm - #define OPENCV_GPU_ASM_PTR "r" -#endif - namespace cv { namespace gpu { namespace device { - template void __host__ __device__ __forceinline__ swap(T& a, T& b) + /////////////////////////////////////////////////////////////////////////////// + // swap + + template void __device__ __forceinline__ swap(T& a, T& b) { const T temp = a; a = b; b = temp; } - // warp-synchronous 32 elements reduction - template __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, const Op& op) + /////////////////////////////////////////////////////////////////////////////// + // Mask Reader + + struct SingleMask { - data[tid] = partial_reduction; + explicit __host__ __device__ __forceinline__ SingleMask(const PtrStep& mask_) : mask(mask_) {} + + __device__ __forceinline__ bool operator()(int y, int x) const + { + return mask.ptr(y)[x] != 0; + } + + const PtrStep mask; + }; - if (tid < 16) + struct MaskCollection + { + explicit __host__ __device__ __forceinline__ MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {} + + __device__ __forceinline__ void next() { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + curMask = *maskCollection++; + } + __device__ __forceinline__ void setMask(int z) + { + curMask = maskCollection[z]; + } + + __device__ __forceinline__ bool operator()(int y, int x) const + { + uchar val; + return curMask.data == 0 || (ForceGlob::Load(curMask.ptr(y), x, val), (val != 0)); } - } - // warp-synchronous 16 elements reduction - template __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, const Op& op) + const PtrStep* maskCollection; + PtrStep curMask; + }; + + struct WithOutMask { - data[tid] = partial_reduction; + __device__ __forceinline__ void next() const + { + } + __device__ __forceinline__ void setMask(int) const + { + } - if (tid < 8) + __device__ __forceinline__ bool operator()(int, int) const { - data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 1 ]); + return true; } + }; + + /////////////////////////////////////////////////////////////////////////////// + // Reduction + + // reduction + template __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + { + StaticAssert= 8 && n <= 512>::check(); + detail::ReductionDispatcher::reduce(data, partial_reduction, tid, op); + } + + template + __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred) + { + StaticAssert= 8 && n <= 512>::check(); + detail::PredValReductionDispatcher::reduce(myData, myVal, sdata, sval, tid, pred); } - // warp-synchronous reduction - template __device__ __forceinline__ void 
warpReduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + /////////////////////////////////////////////////////////////////////////////// + // Vector Distance + + template struct L1Dist + { + typedef int value_type; + typedef int result_type; + + __device__ __forceinline__ L1Dist() : mySum(0) {} + + __device__ __forceinline__ void reduceIter(int val1, int val2) + { + mySum = __sad(val1, val2, mySum); + } + + template __device__ __forceinline__ void reduceAll(int* smem, int tid) + { + reduce(smem, mySum, tid, plus()); + } + + __device__ __forceinline__ operator int() const + { + return mySum; + } + + int mySum; + }; + template <> struct L1Dist + { + typedef float value_type; + typedef float result_type; + + __device__ __forceinline__ L1Dist() : mySum(0.0f) {} + + __device__ __forceinline__ void reduceIter(float val1, float val2) + { + mySum += ::fabs(val1 - val2); + } + + template __device__ __forceinline__ void reduceAll(float* smem, int tid) + { + reduce(smem, mySum, tid, plus()); + } + + __device__ __forceinline__ operator float() const + { + return mySum; + } + + float mySum; + }; + + struct L2Dist + { + typedef float value_type; + typedef float result_type; + + __device__ __forceinline__ L2Dist() : mySum(0.0f) {} + + __device__ __forceinline__ void reduceIter(float val1, float val2) + { + float reg = val1 - val2; + mySum += reg * reg; + } + + template __device__ __forceinline__ void reduceAll(float* smem, int tid) + { + reduce(smem, mySum, tid, plus()); + } + + __device__ __forceinline__ operator float() const + { + return sqrtf(mySum); + } + + float mySum; + }; + + struct HammingDist + { + typedef int value_type; + typedef int result_type; + + __device__ __forceinline__ HammingDist() : mySum(0) {} + + __device__ __forceinline__ void reduceIter(int val1, int val2) + { + mySum += __popc(val1 ^ val2); + } + + template __device__ __forceinline__ void reduceAll(int* smem, int tid) + { + reduce(smem, mySum, tid, plus()); + } + + __device__ __forceinline__ operator int() const + { + return mySum; + } + + int mySum; + }; + + // calc distance between two vectors in global memory + template + __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) { - if (tid < n) - data[tid] = partial_reduction; - - if (n > 16) - { - if (tid < n - 16) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 8) - { - if (tid < n - 8) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 4) - { - if (tid < n - 4) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 2) - { - if (tid < n - 2) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - if (tid < 2) - { - data[tid] = partial_reduction 
= op(partial_reduction, data[tid + 1]); - } - } + for (int i = tid; i < len; i += THREAD_DIM) + { + T1 val1; + ForceGlob::Load(vec1, i, val1); + + T2 val2; + ForceGlob::Load(vec2, i, val2); + + dist.reduceIter(val1, val2); + } + + dist.reduceAll(smem, tid); } + // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory + template + __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid) + { + detail::VecDiffCachedCalculator::calc(vecCached, vecGlob, len, dist, tid); + + dist.reduceAll(smem, tid); + } + + // calc distance between two vectors in global memory + template struct VecDiffGlobal + { + explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0) + { + vec1 = vec1_; + } + + template + __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const + { + calcVecDiffGlobal(vec1, vec2, len, dist, smem, tid); + } + + const T1* vec1; + }; + + // calc distance between two vectors, first vector is cached in register memory, second vector is in global memory + template struct VecDiffCachedRegister + { + template __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid) + { + if (glob_tid < len) + smem[glob_tid] = vec1[glob_tid]; + __syncthreads(); + + U* vec1ValsPtr = vec1Vals; + + #pragma unroll + for (int i = tid; i < MAX_LEN; i += THREAD_DIM) + *vec1ValsPtr++ = smem[i]; + + __syncthreads(); + } + + template + __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const + { + calcVecDiffCached(vec1Vals, vec2, len, dist, smem, tid); + } + + U vec1Vals[MAX_LEN / THREAD_DIM]; + }; + + + /////////////////////////////////////////////////////////////////////////////// + // Solve linear system + // solve 2x2 linear system Ax=b template __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2]) { diff --git a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp index 5c0051c..48aa62f 100644 --- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp +++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp @@ -55,7 +55,7 @@ namespace cv { namespace gpu { namespace device template struct SatCastHelper; template struct SatCastHelper<1, VecD> { - template static __device__ VecD cast(const VecS& v) + template static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_type D; return VecTraits::make(saturate_cast(v.x)); @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device }; template struct SatCastHelper<2, VecD> { - template static __device__ VecD cast(const VecS& v) + template static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_type D; return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y)); @@ -71,7 +71,7 @@ namespace cv { namespace gpu { namespace device }; template struct SatCastHelper<3, VecD> { - template static __device__ VecD cast(const VecS& v) + template static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_type D; return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z)); @@ -79,72 +79,72 @@ namespace cv { namespace gpu { namespace device }; template struct SatCastHelper<4, VecD> { - template static 
__device__ VecD cast(const VecS& v) + template static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_type D; return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } }; - template static __device__ VecD saturate_cast_caller(const VecS& v) + template static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v) { return SatCastHelper::cn, VecD>::cast(v); } } - template static __device__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp 
saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp 
saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);} #define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \ - static __device__ TypeVec::result_type, 1>::vec_type op(const type ## 1 & a) \ + __device__ __forceinline__ TypeVec::result_type, 1>::vec_type op(const type ## 1 & a) \ { \ func f; \ return VecTraits::result_type, 1>::vec_type>::make(f(a.x)); \ } \ - static __device__ TypeVec::result_type, 2>::vec_type op(const type ## 2 & a) \ + __device__ __forceinline__ TypeVec::result_type, 2>::vec_type op(const type ## 2 & a) \ { \ func f; \ return VecTraits::result_type, 2>::vec_type>::make(f(a.x), f(a.y)); \ } \ - static __device__ TypeVec::result_type, 3>::vec_type op(const type ## 3 & a) \ + __device__ __forceinline__ TypeVec::result_type, 3>::vec_type op(const type ## 3 & a) \ { \ func f; \ return VecTraits::result_type, 3>::vec_type>::make(f(a.x), f(a.y), f(a.z)); \ } \ - static __device__ TypeVec::result_type, 4>::vec_type op(const type ## 4 & a) \ + __device__ __forceinline__ TypeVec::result_type, 4>::vec_type op(const type ## 4 & a) \ { \ func f; \ return VecTraits::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \ @@ -195,70 +195,70 @@ namespace cv { namespace gpu { namespace device } #define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \ - static __device__ TypeVec::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \ + __device__ __forceinline__ TypeVec::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \ { \ func f; \ return VecTraits::result_type, 1>::vec_type>::make(f(a.x, b.x)); \ } \ template \ - static __device__ typename TypeVec::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \ + __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \ { \ func::argument_type> f; \ return VecTraits::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \ } \ template \ - static __device__ typename TypeVec::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \ + __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \ { \ func::argument_type> f; \ return VecTraits::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \ } \ - static __device__ TypeVec::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \ + __device__ __forceinline__ TypeVec::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \ { \ func f; \ return VecTraits::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \ } \ template \ - static __device__ 
@@ -195,70 +195,70 @@ namespace cv { namespace gpu { namespace device
     }
 #define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
-    static __device__ TypeVec::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
+    __device__ __forceinline__ TypeVec::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
     { \
         func f; \
         return VecTraits::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
+    __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
     { \
         func::argument_type> f; \
         return VecTraits::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
+    __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
     { \
         func::argument_type> f; \
         return VecTraits::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
     } \
-    static __device__ TypeVec::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
+    __device__ __forceinline__ TypeVec::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
     { \
         func f; \
         return VecTraits::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
+    __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
     { \
         func::argument_type> f; \
         return VecTraits::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
+    __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
     { \
         func::argument_type> f; \
         return VecTraits::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
     } \
-    static __device__ TypeVec::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
+    __device__ __forceinline__ TypeVec::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
     { \
         func f; \
         return VecTraits::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
+    __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
     { \
         func::argument_type> f; \
         return VecTraits::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
+    __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
     { \
         func::argument_type> f; \
         return VecTraits::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
     } \
-    static __device__ TypeVec::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
+    __device__ __forceinline__ TypeVec::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
     { \
         func f; \
         return VecTraits::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
+    __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
     { \
         func::argument_type> f; \
         return VecTraits::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
+    __device__ __forceinline__ typename TypeVec::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
     { \
         func::argument_type> f; \
         return VecTraits::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
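Together, OPENCV_GPU_IMPLEMENT_VEC_UNOP and OPENCV_GPU_IMPLEMENT_VEC_BINOP stamp out per-component operators and functions for the built-in CUDA vector types, so device code can combine them much like scalars. A rough sketch of the kind of client code this enables, assuming the header instantiates the macros for float with operator+ and operator* (the blend helper itself is made up for illustration):

    // Per-component arithmetic on float3 via the generated vector-vector
    // and vector-scalar overloads; each expression expands to a VecTraits<...>::make(...) call.
    __device__ __forceinline__ float3 blend(const float3& a, const float3& b, float alpha)
    {
        return a * alpha + b * (1.0f - alpha);
    }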
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index 169a10b..b3db078 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -3642,19 +3642,24 @@ INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplateBlackSource, testing::Combine(
                         testing::Values((int)CV_TM_CCOEFF_NORMED, (int)CV_TM_CCORR_NORMED)));
-struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple > >
+struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple > >
 {
     cv::gpu::DeviceInfo devInfo;
+    std::string imageName;
+    std::string patternName;
+
     cv::Mat image, pattern;
     virtual void SetUp()
     {
         devInfo = std::tr1::get<0>(GetParam());
+        imageName = std::tr1::get<1>(GetParam()).first;
+        patternName = std::tr1::get<1>(GetParam()).second;
-        image = readImage(std::tr1::get<0>(std::tr1::get<1>(GetParam())));
+        image = readImage(imageName);
         ASSERT_FALSE(image.empty());
-        pattern = readImage(std::tr1::get<1>(std::tr1::get<1>(GetParam())));
+        pattern = readImage(patternName);
         ASSERT_FALSE(pattern.empty());
     }
 };
@@ -3662,6 +3667,8 @@ struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple
 > matcher;
@@ -329,7 +329,7 @@ TEST(BruteForceMatcher)
         GPU_OFF;
     SUBTEST << "radiusMatch";
-    float max_distance = 3.8f;
+    float max_distance = 2.0f;
     CPU_ON;
     matcher.radiusMatch(query, train, matches, max_distance);
-- 
2.7.4
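For reference, the GPU side of this benchmark exercises radiusMatch roughly as follows; the descriptor matrices, the L2<float> instantiation and the upload step are illustrative, only radiusMatch and the tightened max_distance threshold come from the change above:

    // Sketch: GPU brute-force radius matching with the new 2.0f threshold.
    using namespace cv;
    using namespace cv::gpu;

    BruteForceMatcher_GPU< L2<float> > matcher;

    GpuMat d_query(query), d_train(train);             // query/train are CV_32F descriptor Mats

    std::vector< std::vector<DMatch> > matches;
    const float max_distance = 2.0f;
    matcher.radiusMatch(d_query, d_train, matches, max_distance); // keep neighbours closer than max_distance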