From 12ce6b5c1557b90ffd077b6529649fe218ce8e60 Mon Sep 17 00:00:00 2001 From: Namgoo Lee Date: Wed, 21 Feb 2018 02:55:57 +0900 Subject: [PATCH] Merge pull request #10906 from nglee:dev_cudaFastMultiStreamSafety cuda_fast : multi stream safety (#10906) * CUDA_Features2D/FAST Asynchronous test * cuda_fast : multi stream safety * Use parallel_for instead of OpenMP --- modules/cudafeatures2d/src/cuda/fast.cu | 38 ++++++-------- modules/cudafeatures2d/src/fast.cpp | 14 +++-- modules/cudafeatures2d/test/test_features2d.cpp | 68 +++++++++++++++++++++---- 3 files changed, 84 insertions(+), 36 deletions(-) diff --git a/modules/cudafeatures2d/src/cuda/fast.cu b/modules/cudafeatures2d/src/cuda/fast.cu index 72235d4..5da9e5e 100644 --- a/modules/cudafeatures2d/src/cuda/fast.cu +++ b/modules/cudafeatures2d/src/cuda/fast.cu @@ -49,8 +49,6 @@ namespace cv { namespace cuda { namespace device { namespace fast { - __device__ unsigned int g_counter = 0; - /////////////////////////////////////////////////////////////////////////// // calcKeypoints @@ -218,7 +216,7 @@ namespace cv { namespace cuda { namespace device } template - __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold) + __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold, unsigned int* d_counter) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) @@ -269,7 +267,7 @@ namespace cv { namespace cuda { namespace device { if (calcScore) score(i, j) = cornerScore(C, v, threshold); - const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1)); + const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1)); if (ind < maxKeypoints) kpLoc[ind] = make_short2(j, i); @@ -279,38 +277,35 @@ namespace cv { namespace cuda { namespace device #endif } - int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream) + int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - dim3 block(32, 8); dim3 grid; grid.x = divUp(img.cols - 6, block.x); grid.y = divUp(img.rows - 6, block.y); - cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) ); + cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) ); if (score.data) { if (mask.data) - calcKeypoints<<>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold); + calcKeypoints<<>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter); else - calcKeypoints<<>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold); + calcKeypoints<<>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter); } else { if (mask.data) - calcKeypoints<<>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold); + calcKeypoints<<>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter); else - calcKeypoints<<>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold); + calcKeypoints<<>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter); } cudaSafeCall( cudaGetLastError() ); unsigned int count; - cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) ); cudaSafeCall( cudaStreamSynchronize(stream) ); @@ -320,7 +315,7 @@ namespace cv { namespace cuda { namespace device /////////////////////////////////////////////////////////////////////////// // nonmaxSuppression - __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal) + __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal, unsigned int* d_counter) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) @@ -346,7 +341,7 @@ namespace cv { namespace cuda { namespace device if (ismax) { - const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1)); + const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1)); locFinal[ind] = loc; responseFinal[ind] = static_cast(score); @@ -356,23 +351,20 @@ namespace cv { namespace cuda { namespace device #endif } - int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream) + int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - dim3 block(256); dim3 grid; grid.x = divUp(count, block.x); - cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) ); + cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) ); - nonmaxSuppression<<>>(kpLoc, count, score, loc, response); + nonmaxSuppression<<>>(kpLoc, count, score, loc, response, d_counter); cudaSafeCall( cudaGetLastError() ); unsigned int new_count; - cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaMemcpyAsync(&new_count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) ); cudaSafeCall( cudaStreamSynchronize(stream) ); diff --git a/modules/cudafeatures2d/src/fast.cpp b/modules/cudafeatures2d/src/fast.cpp index ce44b3a..e48ef18 100644 --- a/modules/cudafeatures2d/src/fast.cpp +++ b/modules/cudafeatures2d/src/fast.cpp @@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device { namespace fast { - int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream); - int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream); + int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream); + int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream); } }}} @@ -88,6 +88,8 @@ namespace int threshold_; bool nonmaxSuppression_; int max_npoints_; + + unsigned int* d_counter; }; FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) : @@ -114,6 +116,8 @@ namespace { using namespace cv::cuda::device::fast; + cudaSafeCall( cudaMalloc(&d_counter, sizeof(unsigned int)) ); + const GpuMat img = _image.getGpuMat(); const GpuMat mask = _mask.getGpuMat(); @@ -131,7 +135,7 @@ namespace score.setTo(Scalar::all(0), stream); } - int count = calcKeypoints_gpu(img, mask, kpLoc.ptr(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream)); + int count = calcKeypoints_gpu(img, mask, kpLoc.ptr(), max_npoints_, score, threshold_, d_counter, StreamAccessor::getStream(stream)); count = std::min(count, max_npoints_); if (count == 0) @@ -145,7 +149,7 @@ namespace if (nonmaxSuppression_) { - count = nonmaxSuppression_gpu(kpLoc.ptr(), count, score, keypoints.ptr(LOCATION_ROW), keypoints.ptr(RESPONSE_ROW), StreamAccessor::getStream(stream)); + count = nonmaxSuppression_gpu(kpLoc.ptr(), count, score, keypoints.ptr(LOCATION_ROW), keypoints.ptr(RESPONSE_ROW), d_counter, StreamAccessor::getStream(stream)); if (count == 0) { keypoints.release(); @@ -161,6 +165,8 @@ namespace kpLoc.colRange(0, count).copyTo(locRow, stream); keypoints.row(1).setTo(Scalar::all(0), stream); } + + cudaSafeCall( cudaFree(d_counter) ); } void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector& keypoints) diff --git a/modules/cudafeatures2d/test/test_features2d.cpp b/modules/cudafeatures2d/test/test_features2d.cpp index 8d0c1d3..6ab23be 100644 --- a/modules/cudafeatures2d/test/test_features2d.cpp +++ b/modules/cudafeatures2d/test/test_features2d.cpp @@ -44,6 +44,8 @@ #ifdef HAVE_CUDA +#include + namespace opencv_test { namespace { ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -80,15 +82,7 @@ CUDA_TEST_P(FAST, Accuracy) if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS)) { - try - { - std::vector keypoints; - fast->detect(loadMat(image), keypoints); - } - catch (const cv::Exception& e) - { - ASSERT_EQ(cv::Error::StsNotImplemented, e.code); - } + throw SkipTestException("CUDA device doesn't support global atomics"); } else { @@ -102,6 +96,62 @@ CUDA_TEST_P(FAST, Accuracy) } } +class FastAsyncParallelLoopBody : public cv::ParallelLoopBody +{ +public: + FastAsyncParallelLoopBody(cv::cuda::HostMem& src, cv::cuda::GpuMat* d_kpts, cv::Ptr* d_fast) + : src_(src), kpts_(d_kpts), fast_(d_fast) {} + ~FastAsyncParallelLoopBody() {}; + void operator()(const cv::Range& r) const + { + for (int i = r.start; i < r.end; i++) { + cv::cuda::Stream stream; + cv::cuda::GpuMat d_src_(src_.rows, src_.cols, CV_8UC1); + d_src_.upload(src_); + fast_[i]->detectAsync(d_src_, kpts_[i], noArray(), stream); + } + } +protected: + cv::cuda::HostMem src_; + cv::cuda::GpuMat* kpts_; + cv::Ptr* fast_; +}; + +CUDA_TEST_P(FAST, Async) +{ + if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS)) + { + throw SkipTestException("CUDA device doesn't support global atomics"); + } + else + { + cv::Mat image_ = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(image_.empty()); + + cv::cuda::HostMem image(image_); + + cv::cuda::GpuMat d_keypoints[2]; + cv::Ptr d_fast[2]; + + d_fast[0] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression); + d_fast[1] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression); + + cv::parallel_for_(cv::Range(0, 2), FastAsyncParallelLoopBody(image, d_keypoints, d_fast)); + + cudaDeviceSynchronize(); + + std::vector keypoints[2]; + d_fast[0]->convert(d_keypoints[0], keypoints[0]); + d_fast[1]->convert(d_keypoints[1], keypoints[1]); + + std::vector keypoints_gold; + cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression); + + ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[0]); + ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[1]); + } +} + INSTANTIATE_TEST_CASE_P(CUDA_Features2D, FAST, testing::Combine( ALL_DEVICES, testing::Values(FAST_Threshold(25), FAST_Threshold(50)), -- 2.7.4