From: Vladislav Vinogradov Date: Mon, 26 Sep 2011 11:18:30 +0000 (+0000) Subject: implemented optimized version of gpu::bf_radius_match X-Git-Tag: accepted/2.0/20130307.220821~1809 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b119833ad1250daf11422039de9a4f427e2b543d;p=profile%2Fivi%2Fopencv.git implemented optimized version of gpu::bf_radius_match --- diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index c56dcda..be081e9 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -76,7 +76,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector #else /* !defined (HAVE_CUDA) */ -namespace cv { namespace gpu { namespace bfmatcher +namespace cv { namespace gpu { namespace bf_match { template void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, @@ -97,7 +97,10 @@ namespace cv { namespace gpu { namespace bfmatcher template void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream); +}}} +namespace cv { namespace gpu { namespace bf_knnmatch +{ template void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); @@ -107,7 +110,10 @@ namespace cv { namespace gpu { namespace bfmatcher template void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream); +}}} +namespace cv { namespace gpu { namespace bf_radius_match +{ template void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float 
maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream); @@ -170,7 +176,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, if (queryDescs.empty() || trainDescs.empty()) return; - using namespace cv::gpu::bfmatcher; + using namespace cv::gpu::bf_match; typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, @@ -309,7 +315,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes if (queryDescs.empty() || trainCollection.empty()) return; - using namespace cv::gpu::bfmatcher; + using namespace cv::gpu::bf_match; typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, @@ -418,7 +424,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con if (queryDescs.empty() || trainDescs.empty()) return; - using namespace cv::gpu::bfmatcher; + using namespace cv::gpu::bf_knnmatch; typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, @@ -596,7 +602,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, if (queryDescs.empty() || trainDescs.empty()) return; - using namespace cv::gpu::bfmatcher; + using namespace cv::gpu::bf_radius_match; typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, @@ -618,7 +624,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, } }; - CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)); 
+ CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS)); const int nQuery = queryDescs.rows; const int nTrain = trainDescs.rows; diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index 079251e..d67c92a 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -47,7 +47,7 @@ using namespace cv::gpu; using namespace cv::gpu::device; -namespace cv { namespace gpu { namespace bfmatcher +namespace cv { namespace gpu { namespace bf_knnmatch { template __device__ void distanceCalcLoop(const PtrStep_& query, const DevMem2D_& train, const Mask& m, int queryIdx, diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index 83f48b4..d2bb120 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -47,7 +47,7 @@ using namespace cv::gpu; using namespace cv::gpu::device; -namespace cv { namespace gpu { namespace bfmatcher +namespace cv { namespace gpu { namespace bf_match { template __device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx) diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 5dc8fab..bd66451 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -47,81 +47,191 @@ using namespace cv::gpu; using namespace cv::gpu::device; -namespace cv { namespace gpu { namespace bfmatcher +namespace cv { namespace gpu { namespace bf_radius_match { - template - __global__ void radiusMatch(const PtrStep_ query, const DevMem2D_ train, float maxDistance, const Mask mask, - DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance) + __device__ __forceinline__ void store(const int* sidx, const float* sdist, const unsigned int scount, int* trainIdx, float* distance, int& sglob_ind, const int tid) { - #if __CUDA_ARCH__ >= 110 + if (tid < scount) + { + trainIdx[sglob_ind + tid] = sidx[tid]; + 
distance[sglob_ind + tid] = sdist[tid]; + } - __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + if (tid == 0) + sglob_ind += scount; + } - typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y; - - const int queryIdx = blockIdx.x; - const T* queryDescs = query.ptr(queryIdx); + template + __global__ void radiusMatch(const PtrStep_ query, const DevMem2D_ train, const float maxDistance, const Mask mask, + DevMem2Di trainIdx_, PtrStepf distance, unsigned int* nMatches) + { + #if __CUDA_ARCH__ >= 120 + + typedef typename Dist::result_type result_type; + typedef typename Dist::value_type value_type; + + __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y]; + __shared__ int sidx[BLOCK_STACK]; + __shared__ float sdist[BLOCK_STACK]; + __shared__ unsigned int scount; + __shared__ int sglob_ind; - const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; + const int queryIdx = blockIdx.x; + const int tid = threadIdx.y * BLOCK_DIM_X + threadIdx.x; - if (trainIdx < train.rows) + if (tid == 0) { - const T* trainDescs = train.ptr(trainIdx); + scount = 0; + sglob_ind = 0; + } + __syncthreads(); + + int* trainIdx_row = trainIdx_.ptr(queryIdx); + float* distance_row = distance.ptr(queryIdx); + + const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, tid, threadIdx.x); + + typename Dist::result_type* sdiffRow = smem + BLOCK_DIM_X * threadIdx.y; + for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += BLOCK_DIM_Y) + { if (mask(queryIdx, trainIdx)) { Dist dist; - calcVecDiffGlobal(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x); + const T* trainRow = train.ptr(trainIdx); + + vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x); - if (threadIdx.x == 0) + const typename Dist::result_type val = dist; + + if (threadIdx.x == 0 && val < maxDistance) { - if (dist < maxDistance) - { - unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (i < 
trainIdx_.cols) - { - distance.ptr(queryIdx)[i] = dist; - trainIdx_.ptr(queryIdx)[i] = trainIdx; - } - } + unsigned int i = atomicInc(&scount, (unsigned int) -1); + sidx[i] = trainIdx; + sdist[i] = val; } } + __syncthreads(); + + if (scount > BLOCK_STACK - BLOCK_DIM_Y) + { + store(sidx, sdist, scount, trainIdx_row, distance_row, sglob_ind, tid); + if (tid == 0) + scount = 0; + } + __syncthreads(); } + store(sidx, sdist, scount, trainIdx_row, distance_row, sglob_ind, tid); + + if (tid == 0) + nMatches[queryIdx] = sglob_ind; + #endif } - + /////////////////////////////////////////////////////////////////////////////// // Radius Match kernel caller - template - void radiusMatch_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2D_& nMatches, const DevMem2Df& distance, + template + void radiusMatchSimple_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, unsigned int* nMatches, cudaStream_t stream) { + StaticAssert= BLOCK_DIM_Y>::check(); + StaticAssert::check(); + + const dim3 grid(query.rows, 1, 1); const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); - const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1); - radiusMatch<<>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance); + radiusMatch, Dist, T> + <<>>(query, train, maxDistance, mask, trainIdx, distance, nMatches); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - + + template + void radiusMatchCached_caller(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, unsigned int* nMatches, + cudaStream_t stream) + { + StaticAssert= BLOCK_DIM_Y>::check(); + StaticAssert::check(); + StaticAssert= MAX_LEN>::check(); + StaticAssert::check(); + + const dim3 grid(query.rows, 1, 1); + const 
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + + radiusMatch, Dist, T> + <<>>(query, train, maxDistance, mask, trainIdx, distance, nMatches); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + /////////////////////////////////////////////////////////////////////////////// // Radius Match Dispatcher - + template void radiusMatchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, + const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream) { - radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask, - static_cast(trainIdx), static_cast< const DevMem2D_ >(nMatches), static_cast(distance), - stream); - } + if (query.cols < 64) + { + radiusMatchCached_caller<16, 16, 64, 64, false, Dist>( + query, train, maxDistance, mask, + static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + stream); + } + else if (query.cols == 64) + { + radiusMatchCached_caller<16, 16, 64, 64, true, Dist>( + query, train, maxDistance, mask, + static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + stream); + } + else if (query.cols < 128) + { + radiusMatchCached_caller<16, 16, 64, 128, false, Dist>( + query, train, maxDistance, mask, + static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + stream); + } + else if (query.cols == 128) + { + radiusMatchCached_caller<16, 16, 64, 128, true, Dist>( + query, train, maxDistance, mask, + static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + stream); + } + else if (query.cols < 256) + { + radiusMatchCached_caller<16, 16, 64, 256, false, Dist>( + query, train, maxDistance, mask, + static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + stream); + } + else if (query.cols == 256) + { + radiusMatchCached_caller<16, 16, 64, 256, 
true, Dist>( + query, train, maxDistance, mask, + static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + stream); + } + else + { + radiusMatchSimple_caller<16, 16, 64, Dist>( + query, train, maxDistance, mask, + static_cast(trainIdx), static_cast(distance), (unsigned int*)nMatches.data, + stream); + } + } /////////////////////////////////////////////////////////////////////////////// // Radius Match caller @@ -133,13 +243,13 @@ namespace cv { namespace gpu { namespace bfmatcher if (mask.data) { radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, + trainIdx, distance, nMatches, stream); } else { radiusMatchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, + trainIdx, distance, nMatches, stream); } } @@ -158,13 +268,13 @@ namespace cv { namespace gpu { namespace bfmatcher if (mask.data) { radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, + trainIdx, distance, nMatches, stream); } else { radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, + trainIdx, distance, nMatches, stream); } } @@ -183,13 +293,13 @@ namespace cv { namespace gpu { namespace bfmatcher if (mask.data) { radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, nMatches, distance, + trainIdx, distance, nMatches, stream); } else { radiusMatchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, nMatches, distance, + trainIdx, distance, nMatches, stream); } }