{
namespace fast
{
- __device__ unsigned int g_counter = 0;
-
///////////////////////////////////////////////////////////////////////////
// calcKeypoints
}
template <bool calcScore, class Mask>
- __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
+ __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold, unsigned int* d_counter)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
{
if (calcScore) score(i, j) = cornerScore(C, v, threshold);
- const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
+ const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
if (ind < maxKeypoints)
kpLoc[ind] = make_short2(j, i);
#endif
}
- int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream)
+ int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream)
{
- void* counter_ptr;
- cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
-
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(img.cols - 6, block.x);
grid.y = divUp(img.rows - 6, block.y);
- cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
+ cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
if (score.data)
{
if (mask.data)
- calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
+ calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
else
- calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
+ calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
}
else
{
if (mask.data)
- calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
+ calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
else
- calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
+ calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
}
cudaSafeCall( cudaGetLastError() );
unsigned int count;
- cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
+ cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
///////////////////////////////////////////////////////////////////////////
// nonmaxSuppression
- __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal)
+ __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal, unsigned int* d_counter)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
if (ismax)
{
- const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
+ const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
locFinal[ind] = loc;
responseFinal[ind] = static_cast<float>(score);
#endif
}
- int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream)
+ int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream)
{
- void* counter_ptr;
- cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
-
dim3 block(256);
dim3 grid;
grid.x = divUp(count, block.x);
- cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
+ cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
- nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response);
+ nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response, d_counter);
cudaSafeCall( cudaGetLastError() );
unsigned int new_count;
- cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
+ cudaSafeCall( cudaMemcpyAsync(&new_count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
{
namespace fast
{
- int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream);
- int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream);
+ int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream);
+ int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream);
}
}}}
int threshold_;
bool nonmaxSuppression_;
int max_npoints_;
+
+ unsigned int* d_counter;
};
FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :
{
using namespace cv::cuda::device::fast;
+ cudaSafeCall( cudaMalloc(&d_counter, sizeof(unsigned int)) );
+
const GpuMat img = _image.getGpuMat();
const GpuMat mask = _mask.getGpuMat();
score.setTo(Scalar::all(0), stream);
}
- int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream));
+ int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, d_counter, StreamAccessor::getStream(stream));
count = std::min(count, max_npoints_);
if (count == 0)
if (nonmaxSuppression_)
{
- count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream));
+ count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), d_counter, StreamAccessor::getStream(stream));
if (count == 0)
{
keypoints.release();
kpLoc.colRange(0, count).copyTo(locRow, stream);
keypoints.row(1).setTo(Scalar::all(0), stream);
}
+
+ cudaSafeCall( cudaFree(d_counter) );
}
void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
#ifdef HAVE_CUDA
+#include <cuda_runtime_api.h>
+
namespace opencv_test { namespace {
/////////////////////////////////////////////////////////////////////////////////////////////////
if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
{
- try
- {
- std::vector<cv::KeyPoint> keypoints;
- fast->detect(loadMat(image), keypoints);
- }
- catch (const cv::Exception& e)
- {
- ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
- }
+ throw SkipTestException("CUDA device doesn't support global atomics");
}
else
{
}
}
+// Test helper: each worker of parallel_for_ uploads the shared source image
+// and launches FastFeatureDetector::detectAsync on its own detector instance
+// and its own freshly created CUDA stream. Used to exercise concurrent
+// detectAsync calls now that each detector carries its own d_counter
+// (NOTE(review): the per-instance counter is the premise of this test;
+// confirmed only partially visible in this patch).
+class FastAsyncParallelLoopBody : public cv::ParallelLoopBody
+{
+public:
+ // src: host image shared (ref-counted HostMem header) by all workers.
+ // d_kpts / d_fast: per-worker output GpuMats and detector instances;
+ // both arrays must hold at least Range.end elements.
+ FastAsyncParallelLoopBody(cv::cuda::HostMem& src, cv::cuda::GpuMat* d_kpts, cv::Ptr<cv::cuda::FastFeatureDetector>* d_fast)
+ : src_(src), kpts_(d_kpts), fast_(d_fast) {}
+ ~FastAsyncParallelLoopBody() {};
+ void operator()(const cv::Range& r) const
+ {
+ for (int i = r.start; i < r.end; i++) {
+ // One independent stream per worker iteration.
+ cv::cuda::Stream stream;
+ // Per-worker device copy of the source; upload() here is the
+ // blocking overload (no stream argument), so the data is resident
+ // before detectAsync is enqueued on `stream`.
+ cv::cuda::GpuMat d_src_(src_.rows, src_.cols, CV_8UC1);
+ d_src_.upload(src_);
+ // Keypoints land in the caller-owned kpts_[i]; completion is only
+ // guaranteed after the caller synchronizes.
+ fast_[i]->detectAsync(d_src_, kpts_[i], noArray(), stream);
+ }
+ }
+protected:
+ cv::cuda::HostMem src_;
+ cv::cuda::GpuMat* kpts_;
+ cv::Ptr<cv::cuda::FastFeatureDetector>* fast_;
+};
+
+// Regression test: two FastFeatureDetector instances running detectAsync
+// concurrently (separate host threads via parallel_for_, separate streams)
+// must both produce the same keypoints as the CPU cv::FAST reference —
+// i.e. the detectors must not clobber each other's device-side counter.
+CUDA_TEST_P(FAST, Async)
+{
+ if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
+ {
+ // The FAST kernels use atomicInc on global memory, which needs
+ // global-atomics support (SM 1.1+).
+ throw SkipTestException("CUDA device doesn't support global atomics");
+ }
+ else
+ {
+ cv::Mat image_ = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
+ ASSERT_FALSE(image_.empty());
+
+ // HostMem copy shared by both workers (ref-counted header).
+ cv::cuda::HostMem image(image_);
+
+ // Per-worker outputs and detector instances.
+ cv::cuda::GpuMat d_keypoints[2];
+ cv::Ptr<cv::cuda::FastFeatureDetector> d_fast[2];
+
+ // Identical configuration so both runs should match the same gold set.
+ d_fast[0] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
+ d_fast[1] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
+
+ // Launch the two detectAsync calls from parallel host threads.
+ cv::parallel_for_(cv::Range(0, 2), FastAsyncParallelLoopBody(image, d_keypoints, d_fast));
+
+ // Device-wide barrier: all streams/kernels must finish before convert()
+ // reads the keypoint matrices.
+ cudaDeviceSynchronize();
+
+ std::vector<cv::KeyPoint> keypoints[2];
+ d_fast[0]->convert(d_keypoints[0], keypoints[0]);
+ d_fast[1]->convert(d_keypoints[1], keypoints[1]);
+
+ // CPU reference result. NOTE(review): passes the HostMem straight to
+ // cv::FAST — relies on InputArray accepting HostMem; confirm.
+ std::vector<cv::KeyPoint> keypoints_gold;
+ cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression);
+
+ ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[0]);
+ ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[1]);
+ }
+}
+
INSTANTIATE_TEST_CASE_P(CUDA_Features2D, FAST, testing::Combine(
ALL_DEVICES,
testing::Values(FAST_Threshold(25), FAST_Threshold(50)),