From ed86bd34b190b2d6dba3f8bccd6070f0e5062b93 Mon Sep 17 00:00:00 2001 From: Namgoo Lee Date: Thu, 10 May 2018 05:44:34 +0900 Subject: [PATCH] Merge pull request #11483 from nglee:dev_cudaCannyStreamIssue cuda_canny : multi stream safety (#11483) * CUDA_ImgProc/Canny Asynchronous test * cuda_canny : multi stream safety (1/3) - Convert global variable canny::counter to class local variable * cuda_canny : multi stream safety (2/3) - Use texture objects rather than texture reference for cc >= 3.0, since texture reference must be declared as a static global variable which results in race condition when ran concurrently * cuda_canny : multi stream safety (3/3) - Refrain from using global variable in row_filter and column_filter (converts column_filter::c_kernel and row_filter::c_kernel to local variables) * Fixes #11193 --- modules/cudafilters/src/cuda/column_filter.hpp | 19 +- modules/cudafilters/src/cuda/row_filter.hpp | 19 +- modules/cudaimgproc/src/canny.cpp | 17 +- modules/cudaimgproc/src/cuda/canny.cu | 230 +++++++++++++++++++++---- modules/cudaimgproc/test/test_canny.cpp | 59 ++++++- 5 files changed, 275 insertions(+), 69 deletions(-) diff --git a/modules/cudafilters/src/cuda/column_filter.hpp b/modules/cudafilters/src/cuda/column_filter.hpp index 7dc339c..e93fc83 100644 --- a/modules/cudafilters/src/cuda/column_filter.hpp +++ b/modules/cudafilters/src/cuda/column_filter.hpp @@ -52,10 +52,8 @@ namespace column_filter { #define MAX_KERNEL_SIZE 32 - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - template - __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) + __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const float* kernel, const int anchor, const B brd) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) const int BLOCK_DIM_X = 16; @@ -135,7 +133,7 @@ namespace column_filter #pragma unroll for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; + sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * kernel[k]; dst(y, x) = saturate_cast(sum); } @@ -143,7 +141,7 @@ namespace column_filter } template class B> - void caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) + void caller(PtrStepSz src, PtrStepSz dst, const float* kernel, int anchor, int cc, cudaStream_t stream) { int BLOCK_DIM_X; int BLOCK_DIM_Y; @@ -167,7 +165,7 @@ namespace column_filter B brd(src.rows); - linearColumnFilter<<>>(src, dst, anchor, brd); + linearColumnFilter<<>>(src, dst, kernel, anchor, brd); cudaSafeCall( cudaGetLastError() ); @@ -181,7 +179,7 @@ namespace filter template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) { - typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); + typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, const float* kernel, int anchor, int cc, cudaStream_t stream); static const caller_t callers[5][33] = { @@ -362,11 +360,6 @@ namespace filter } }; - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - - callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); + callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, kernel, anchor, cc, stream); } } diff --git a/modules/cudafilters/src/cuda/row_filter.hpp b/modules/cudafilters/src/cuda/row_filter.hpp index 80eab59..4a4be36 100644 --- a/modules/cudafilters/src/cuda/row_filter.hpp +++ b/modules/cudafilters/src/cuda/row_filter.hpp @@ -52,10 +52,8 @@ namespace row_filter { #define MAX_KERNEL_SIZE 32 - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - template - __global__ void linearRowFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) + __global__ void linearRowFilter(const PtrStepSz src, PtrStep dst, const float* kernel, const int anchor, const B brd) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) const int BLOCK_DIM_X = 32; @@ -135,7 +133,7 @@ namespace row_filter #pragma unroll for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; + sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * kernel[k]; dst(y, x) = saturate_cast(sum); } @@ -143,7 +141,7 @@ namespace row_filter } template class B> - void caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) + void caller(PtrStepSz src, PtrStepSz dst, const float* kernel, int anchor, int cc, cudaStream_t stream) { int BLOCK_DIM_X; int BLOCK_DIM_Y; @@ -167,7 +165,7 @@ namespace row_filter B brd(src.cols); - linearRowFilter<<>>(src, dst, anchor, brd); + linearRowFilter<<>>(src, dst, kernel, anchor, brd); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -180,7 +178,7 @@ namespace filter template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) { - typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); + typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, const float* kernel, int anchor, int cc, cudaStream_t stream); static const caller_t callers[5][33] = { @@ -361,11 +359,6 @@ namespace filter } }; - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - - callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); + callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, kernel, anchor, cc, stream); } } diff --git a/modules/cudaimgproc/src/canny.cpp b/modules/cudaimgproc/src/canny.cpp index 75e53cf..8c3fd4a 100644 --- a/modules/cudaimgproc/src/canny.cpp +++ b/modules/cudaimgproc/src/canny.cpp @@ -58,9 +58,9 @@ namespace canny void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream); - void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream); + void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream); - void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream); + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream); void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream); } @@ -127,6 +127,8 @@ namespace Ptr filterDX_, filterDY_; #endif int old_apperture_size_; + + int *d_counter; }; void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream) @@ -218,12 +220,17 @@ namespace void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream) { - map_.setTo(Scalar::all(0)); + map_.setTo(Scalar::all(0), stream); + canny::calcMap(dx_, dy_, mag_, map_, static_cast(low_thresh_), static_cast(high_thresh_), StreamAccessor::getStream(stream)); - canny::edgesHysteresisLocal(map_, st1_.ptr(), StreamAccessor::getStream(stream)); + cudaSafeCall( cudaMalloc(&d_counter, sizeof(int)) ); + + canny::edgesHysteresisLocal(map_, st1_.ptr(), d_counter, StreamAccessor::getStream(stream)); + + canny::edgesHysteresisGlobal(map_, st1_.ptr(), st2_.ptr(), d_counter, StreamAccessor::getStream(stream)); - canny::edgesHysteresisGlobal(map_, st1_.ptr(), st2_.ptr(), StreamAccessor::getStream(stream)); + cudaSafeCall( cudaFree(d_counter) ); canny::getEdges(map_, edges, StreamAccessor::getStream(stream)); } diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu index e0ba515..4418b8e 100644 --- a/modules/cudaimgproc/src/cuda/canny.cu +++ b/modules/cudaimgproc/src/cuda/canny.cu @@ -47,6 +47,7 @@ #include "opencv2/core/cuda/transform.hpp" #include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/utility.hpp" +#include "opencv2/core/cuda.hpp" using namespace cv::cuda; using namespace cv::cuda::device; @@ -102,6 +103,20 @@ namespace canny } }; + struct SrcTexObject + { + int xoff; + int yoff; + cudaTextureObject_t tex_src_object; + __host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { } + + __device__ __forceinline__ int operator ()(int y, int x) const + { + return tex2D(tex_src_object, x + xoff, y + yoff); + } + + }; + template __global__ void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) { @@ -120,29 +135,94 @@ namespace canny mag(y, x) = norm(dxVal, dyVal); } + template __global__ + void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (y >= mag.rows || x >= mag.cols) + return; + + int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); + int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); + + dx(y, x) = dxVal; + dy(y, x) = dyVal; + + mag(y, x) = norm(dxVal, dyVal); + } + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) { const dim3 block(16, 16); const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); - bindTexture(&tex_src, srcWhole); - SrcTex src(xoff, yoff); + bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30); - if (L2Grad) + if (cc30) { - L2 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = srcWhole.ptr(); + resDesc.res.pitch2D.height = srcWhole.rows; + resDesc.res.pitch2D.width = srcWhole.cols; + resDesc.res.pitch2D.pitchInBytes = srcWhole.step; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc(); + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = cudaAddressModeClamp; + texDesc.addressMode[1] = cudaAddressModeClamp; + texDesc.addressMode[2] = cudaAddressModeClamp; + + cudaTextureObject_t tex = 0; + cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); + + SrcTexObject src(xoff, yoff, tex); + + if (L2Grad) + { + L2 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + } + else + { + L1 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + } + + cudaSafeCall( cudaGetLastError() ); + + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); + else + cudaSafeCall( cudaStreamSynchronize(stream) ); + + cudaSafeCall( cudaDestroyTextureObject(tex) ); } else { - L1 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); - } + bindTexture(&tex_src, srcWhole); + SrcTex src(xoff, yoff); - cudaSafeCall( cudaGetLastError() ); + if (L2Grad) + { + L2 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + } + else + { + L1 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + } - if (stream == NULL) - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaGetLastError() ); + + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); + } } void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) @@ -165,7 +245,6 @@ namespace canny namespace canny { texture tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); - __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) { const int CANNY_SHIFT = 15; @@ -218,18 +297,103 @@ namespace canny map(y, x) = edge_type; } + __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag) + { + const int CANNY_SHIFT = 15; + const int TG22 = (int)(0.4142135623730950488016887242097*(1<= dx.cols - 1 || y == 0 || y >= dx.rows - 1) + return; + + int dxVal = dx(y, x); + int dyVal = dy(y, x); + + const int s = (dxVal ^ dyVal) < 0 ? -1 : 1; + const float m = tex2D(tex_mag, x, y); + + dxVal = ::abs(dxVal); + dyVal = ::abs(dyVal); + + // 0 - the pixel can not belong to an edge + // 1 - the pixel might belong to an edge + // 2 - the pixel does belong to an edge + int edge_type = 0; + + if (m > low_thresh) + { + const int tg22x = dxVal * TG22; + const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); + + dyVal <<= CANNY_SHIFT; + + if (dyVal < tg22x) + { + if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) + edge_type = 1 + (int)(m > high_thresh); + } + else if(dyVal > tg67x) + { + if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } + else + { + if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } + } + + map(y, x) = edge_type; + } + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream) { const dim3 block(16, 16); const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); - bindTexture(&tex_mag, mag); + if (deviceSupports(FEATURE_SET_COMPUTE_30)) + { + // Use the texture object + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = mag.ptr(); + resDesc.res.pitch2D.height = mag.rows; + resDesc.res.pitch2D.width = mag.cols; + resDesc.res.pitch2D.pitchInBytes = mag.step; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc(); + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = cudaAddressModeClamp; + texDesc.addressMode[1] = cudaAddressModeClamp; + texDesc.addressMode[2] = cudaAddressModeClamp; + + cudaTextureObject_t tex=0; + cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); + calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh, tex); + cudaSafeCall( cudaGetLastError() ); - calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); + else + cudaSafeCall( cudaStreamSynchronize(stream) ); - if (stream == NULL) - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaDestroyTextureObject(tex) ); + } + else + { + // Use the texture reference + bindTexture(&tex_mag, mag); + calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); + + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); + } } } @@ -237,14 +401,12 @@ namespace canny namespace canny { - __device__ int counter = 0; - __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols) { return (y >= 0) && (y < rows) && (x >= 0) && (x < cols); } - __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st) + __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st, int* d_counter) { __shared__ volatile int smem[18][18]; @@ -325,22 +487,19 @@ namespace canny if (n > 0) { - const int ind = ::atomicAdd(&counter, 1); + const int ind = ::atomicAdd(d_counter, 1); st[ind] = make_short2(x, y); } } - void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream) + void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); + cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) ); const dim3 block(16, 16); const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - edgesHysteresisLocalKernel<<>>(map, st1); + edgesHysteresisLocalKernel<<>>(map, st1, d_counter); cudaSafeCall( cudaGetLastError() ); if (stream == NULL) @@ -355,7 +514,7 @@ namespace canny __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count) + __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, const int count) { const int stack_size = 512; @@ -429,7 +588,7 @@ namespace canny { if (threadIdx.x == 0) { - s_ind = ::atomicAdd(&counter, s_counter); + s_ind = ::atomicAdd(d_counter, s_counter); if (s_ind + s_counter > map.cols * map.rows) s_counter = 0; @@ -444,29 +603,26 @@ namespace canny } } - void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream) + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) ); - int count; - cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) ); cudaSafeCall( cudaStreamSynchronize(stream) ); while (count > 0) { - cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); + cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) ); const dim3 block(128); const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobalKernel<<>>(map, st1, st2, count); + edgesHysteresisGlobalKernel<<>>(map, st1, st2, d_counter, count); cudaSafeCall( cudaGetLastError() ); if (stream == NULL) cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) ); cudaSafeCall( cudaStreamSynchronize(stream) ); count = min(count, map.cols * map.rows); diff --git a/modules/cudaimgproc/test/test_canny.cpp b/modules/cudaimgproc/test/test_canny.cpp index 2b1a8d5..a782a87 100644 --- a/modules/cudaimgproc/test/test_canny.cpp +++ b/modules/cudaimgproc/test/test_canny.cpp @@ -92,9 +92,66 @@ CUDA_TEST_P(Canny, Accuracy) EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2); } +class CannyAsyncParallelLoopBody : public cv::ParallelLoopBody +{ +public: + CannyAsyncParallelLoopBody(const cv::cuda::GpuMat& d_img_, cv::cuda::GpuMat* edges_, double low_thresh_, double high_thresh_, int apperture_size_, bool useL2gradient_) + : d_img(d_img_), edges(edges_), low_thresh(low_thresh_), high_thresh(high_thresh_), apperture_size(apperture_size_), useL2gradient(useL2gradient_) {} + ~CannyAsyncParallelLoopBody() {}; + void operator()(const cv::Range& r) const + { + for (int i = r.start; i < r.end; i++) { + cv::cuda::Stream stream; + cv::Ptr canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient); + canny->detect(d_img, edges[i], stream); + stream.waitForCompletion(); + } + } +protected: + const cv::cuda::GpuMat& d_img; + cv::cuda::GpuMat* edges; + double low_thresh; + double high_thresh; + int apperture_size; + bool useL2gradient; +}; + +#define NUM_STREAMS 64 + +CUDA_TEST_P(Canny, Async) +{ + if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_30)) + { + throw SkipTestException("CUDA device doesn't support texture objects"); + } + else + { + const cv::Mat img = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(img.empty()); + + const cv::cuda::GpuMat d_img_roi = loadMat(img, useRoi); + + double low_thresh = 50.0; + double high_thresh = 100.0; + + // Synchronous call + cv::Ptr canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient); + cv::cuda::GpuMat edges_gold; + canny->detect(d_img_roi, edges_gold); + + // Asynchronous call + cv::cuda::GpuMat edges[NUM_STREAMS]; + cv::parallel_for_(cv::Range(0, NUM_STREAMS), CannyAsyncParallelLoopBody(d_img_roi, edges, low_thresh, high_thresh, apperture_size, useL2gradient)); + + // Compare the results of synchronous call and asynchronous call + for (int i = 0; i < NUM_STREAMS; i++) + EXPECT_MAT_NEAR(edges_gold, edges[i], 0.0); + } + } + INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Canny, testing::Combine( ALL_DEVICES, - testing::Values(AppertureSize(3), AppertureSize(5)), + testing::Values(AppertureSize(3), AppertureSize(5), AppertureSize(7)), testing::Values(L2gradient(false), L2gradient(true)), WHOLE_SUBMAT)); -- 2.7.4