From: Vladislav Vinogradov
Date: Mon, 12 Nov 2012 09:02:17 +0000 (+0400)
Subject: Canny
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~4052^2~56^2~18
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=28716d7f306cfc89d7e5507259e98a05bd9b7b8b;p=platform%2Fupstream%2Fopencv.git

Canny
---

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 2cbd450..4396a0a 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -792,31 +792,23 @@ private:
     GpuMat lab, l, ab;
 };
 
-
-struct CV_EXPORTS CannyBuf;
-
-CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-
 struct CV_EXPORTS CannyBuf
 {
-    CannyBuf() {}
-    explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
-    CannyBuf(const GpuMat& dx_, const GpuMat& dy_);
-
     void create(const Size& image_size, int apperture_size = 3);
-
     void release();
 
     GpuMat dx, dy;
-    GpuMat dx_buf, dy_buf;
-    GpuMat edgeBuf;
-    GpuMat trackBuf1, trackBuf2;
+    GpuMat mag;
+    GpuMat map;
+    GpuMat st1, st2;
     Ptr<FilterEngine_GPU> filterDX, filterDY;
 };
 
+CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+
 class CV_EXPORTS ImagePyramid
 {
 public:
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
index 3dc0486..b08a61c 100644
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -43,459 +43,463 @@
 #if !defined CUDA_DISABLER
 
 #include
-#include
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/emulation.hpp"
+#include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/utility.hpp"
 
-namespace cv { namespace gpu { namespace device
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
 {
-    namespace canny
+    struct L1 : binary_function<int, int, float>
     {
-        __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        __device__ __forceinline__ float operator ()(int x, int y) const
         {
-            __shared__ int smem[16][18];
-
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
-            const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (i < rows)
-            {
-                smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
-                if (threadIdx.x == 0)
-                {
-                    smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
-                    smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
-                }
-                __syncthreads();
-
-                if (j < cols)
-                {
-                    dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
-                    dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
-                }
-            }
+            return ::abs(x) + ::abs(y);
         }
 
-        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        __device__ __forceinline__ L1() {}
+        __device__ __forceinline__ L1(const L1&) {}
+    };
+    struct L2 : binary_function<int, int, float>
+    {
+        __device__ __forceinline__ float operator ()(int x, int y) const
         {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            return ::sqrtf(x * x + y * y);
+        }
 
-            calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
-            cudaSafeCall( cudaGetLastError() );
+        __device__ __forceinline__ L2() {}
+        __device__ __forceinline__ L2(const L2&) {}
+    };
+}
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits<L1> : DefaultTransformFunctorTraits<L1>
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits<L2> : DefaultTransformFunctorTraits<L2>
+    {
+        enum { smart_shift = 4 };
+    };
+}}}
 
-        struct L1
-        {
-            static __device__ __forceinline__ float calc(int x, int y)
-            {
-                return ::abs(x) + ::abs(y);
-            }
-        };
-        struct L2
-        {
-            static __device__ __forceinline__ float calc(int x, int y)
-            {
-                return ::sqrtf(x * x + y * y);
-            }
-        };
+namespace
+{
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
+    struct SrcTex
+    {
+        const int xoff;
+        const int yoff;
+        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
 
-        template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
-            PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+        __device__ __forceinline__ int operator ()(int y, int x) const
         {
-            __shared__ int sdx[18][16];
-            __shared__ int sdy[18][16];
+            return tex2D(tex_src, x + xoff, y + yoff);
+        }
+    };
 
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
-            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+    template <class Norm> __global__
+    void calcMagnitude(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-            if (j < cols)
-            {
-                sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
-                sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
-                if (threadIdx.y == 0)
-                {
-                    sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
-                    sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
+        if (y >= mag.rows || x >= mag.cols)
+            return;
 
-                    sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
-                    sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
-                }
-                __syncthreads();
+        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
+        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
 
-                if (i < rows)
-                {
-                    int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
-                    int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
+        dx(y, x) = dxVal;
+        dy(y, x) = dyVal;
 
-                    dx.ptr(i)[j] = x;
-                    dy.ptr(i)[j] = y;
+        mag(y, x) = norm(dxVal, dyVal);
+    }
+}
 
-                    mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
-                }
-            }
-        }
+namespace canny
+{
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    {
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
 
-        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-        {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+        bindTexture(&tex_src, srcWhole);
 
+        SrcTex src(xoff, yoff);
 
-            if (L2Grad)
-                calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-            else
-                calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+        if (L2Grad)
+        {
+            L2 norm;
+            ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
+        }
+        else
+        {
+            L1 norm;
+            ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
+        }
 
-            cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall(cudaThreadSynchronize());
-        }
+        cudaSafeCall(cudaThreadSynchronize());
+    }
 
-        template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    {
+        if (L2Grad)
         {
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
-            const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (i < rows && j < cols)
-                mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
+            L2 norm;
+            transform(dx, dy, mag, norm, WithOutMask(), 0);
         }
-
-        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+        else
         {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            L1 norm;
+            transform(dx, dy, mag, norm, WithOutMask(), 0);
+        }
+    }
+}
 
-            if (L2Grad)
-                calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
-            else
-                calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
+//////////////////////////////////////////////////////////////////////////////////////////
 
-            cudaSafeCall( cudaGetLastError() );
+namespace
+{
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
+    __global__ void calcMap(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
+    {
+        const int CANNY_SHIFT = 15;
+        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= dx.cols || y >= dx.rows)
+            return;
 
-        __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
-        {
-            __shared__ float smem[18][18];
+        int dxVal = dx(y, x);
+        int dyVal = dy(y, x);
 
-            const int j = blockIdx.x * 16 + threadIdx.x;
-            const int i = blockIdx.y * 16 + threadIdx.y;
+        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
+        const float m = tex2D(tex_mag, x, y);
 
-            const int tid = threadIdx.y * 16 + threadIdx.x;
-            const int lx = tid % 18;
-            const int ly = tid / 18;
+        dxVal = ::abs(dxVal);
+        dyVal = ::abs(dyVal);
 
-            if (ly < 14)
-                smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
 
-            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
-                smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+        if (m > low_thresh)
+        {
+            const int tg22x = dxVal * TG22;
+            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
 
-            __syncthreads();
+            dyVal <<= CANNY_SHIFT;
 
-            if (i < rows && j < cols)
+            if (dyVal < tg22x)
+            {
+                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else if(dyVal > tg67x)
             {
-                int x = dx.ptr(i)[j];
-                int y = dy.ptr(i)[j];
-                const int s = (x ^ y) < 0 ? -1 : 1;
-                const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
+                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else
+            {
+                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+        }
 
-                x = ::abs(x);
-                y = ::abs(y);
+        map(y, x) = edge_type;
+    }
+}
 
-                // 0 - the pixel can not belong to an edge
-                // 1 - the pixel might belong to an edge
-                // 2 - the pixel does belong to an edge
-                int edge_type = 0;
+namespace canny
+{
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
+    {
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
 
-                if (m > low_thresh)
-                {
-                    const int tg22x = x * TG22;
-                    const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
-
-                    y <<= CANNY_SHIFT;
-
-                    if (y < tg22x)
-                    {
-                        if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
-                            edge_type = 1 + (int)(m > high_thresh);
-                    }
-                    else if( y > tg67x )
-                    {
-                        if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
-                            edge_type = 1 + (int)(m > high_thresh);
-                    }
-                    else
-                    {
-                        if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
-                            edge_type = 1 + (int)(m > high_thresh);
-                    }
-                }
+        bindTexture(&tex_mag, mag);
 
-                map.ptr(i + 1)[j + 1] = edge_type;
-            }
-        }
+        ::calcMap<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
+        cudaSafeCall( cudaGetLastError() );
 
-        #undef CANNY_SHIFT
-        #undef TG22
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
 
-        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+    __device__ int counter = 0;
+
+    __global__ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st)
+    {
+        __shared__ volatile int smem[18][18];
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
+        if (threadIdx.y == 0)
+            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
+        if (threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
+        if (threadIdx.x == 0)
+            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1)
+            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
+        if (threadIdx.x == 0 && threadIdx.y == 0)
+            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
+            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
+        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
+
+        __syncthreads();
+
+        if (x >= map.cols || y >= map.rows)
+            return;
+
+        int n;
+
+        #pragma unroll
+        for (int k = 0; k < 16; ++k)
         {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            n = 0;
 
-            calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
-            cudaSafeCall( cudaGetLastError() );
+            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
+            {
+                n += smem[threadIdx.y ][threadIdx.x ] == 2;
+                n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
+                n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
+                n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
+                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
 
-        //////////////////////////////////////////////////////////////////////////////////////////
+                n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
+                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
+                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+            }
 
-        __device__ unsigned int counter = 0;
+            if (n > 0)
+                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+        }
 
-        __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
-        {
-        #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)
+        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
 
-            __shared__ int smem[18][18];
+        map(y, x) = e;
 
-            const int j = blockIdx.x * 16 + threadIdx.x;
-            const int i = blockIdx.y * 16 + threadIdx.y;
+        n = 0;
 
-            const int tid = threadIdx.y * 16 + threadIdx.x;
-            const int lx = tid % 18;
-            const int ly = tid / 18;
+        if (e == 2)
+        {
+            n += smem[threadIdx.y ][threadIdx.x ] == 1;
+            n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
+            n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
 
-            if (ly < 14)
-                smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+            n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
+            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
 
-            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
-                smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+            n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
+            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
+            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
+        }
 
-            __syncthreads();
+        if (n > 0)
+        {
+            const int ind = ::atomicAdd(&counter, 1);
+            st[ind] = make_ushort2(x, y);
+        }
+    }
+}
 
-            if (i < rows && j < cols)
-            {
-                int n;
+namespace canny
+{
+    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
+    {
+        void* counter_ptr;
+        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
 
-                #pragma unroll
-                for (int k = 0; k < 16; ++k)
-                {
-                    n = 0;
+        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
 
-                    if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
-                    {
-                        n += smem[threadIdx.y ][threadIdx.x ] == 2;
-                        n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
-                        n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
-                        n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
-                        n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+        ::edgesHysteresisLocal<<<grid, block>>>(map, st1);
+        cudaSafeCall( cudaGetLastError() );
 
-                        n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
-                        n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
-                        n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
-                    }
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
 
-                    if (n > 0)
-                        smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
-                }
+//////////////////////////////////////////////////////////////////////////////////////////
 
-                const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
+namespace
+{
+    __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
+    __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
 
-                map.ptr(i + 1)[j + 1] = e;
+    __global__ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
+    {
+        const int stack_size = 512;
 
-                n = 0;
+        __shared__ int s_counter;
+        __shared__ int s_ind;
+        __shared__ ushort2 s_st[stack_size];
 
-                if (e == 2)
-                {
-                    n += smem[threadIdx.y ][threadIdx.x ] == 1;
-                    n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
-                    n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
+        if (threadIdx.x == 0)
+            s_counter = 0;
 
-                    n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
-                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+        __syncthreads();
 
-                    n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
-                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
-                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
-                }
+        int ind = blockIdx.y * gridDim.x + blockIdx.x;
 
-                if (n > 0)
-                {
-                    const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
-                    st[ind] = make_ushort2(j + 1, i + 1);
-                }
-            }
+        if (ind >= count)
+            return;
 
-        #endif
-        }
+        ushort2 pos = st1[ind];
 
-        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
+        if (threadIdx.x < 8)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+            pos.x += c_dx[threadIdx.x];
+            pos.y += c_dy[threadIdx.x];
 
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1)
+            {
+                map(pos.y, pos.x) = 2;
 
-            edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
-            cudaSafeCall( cudaGetLastError() );
+                ind = Emulation::smem::atomicAdd(&s_counter, 1);
 
-            cudaSafeCall( cudaDeviceSynchronize() );
+                s_st[ind] = pos;
+            }
         }
 
-        __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
-        __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
+        __syncthreads();
 
-        __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
+        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
         {
-        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120
+            const int subTaskIdx = threadIdx.x >> 3;
+            const int portion = ::min(s_counter, blockDim.x >> 3);
 
-            const int stack_size = 512;
+            if (subTaskIdx < portion)
+                pos = s_st[s_counter - 1 - subTaskIdx];
 
-            __shared__ unsigned int s_counter;
-            __shared__ unsigned int s_ind;
-            __shared__ ushort2 s_st[stack_size];
+            __syncthreads();
 
             if (threadIdx.x == 0)
-                s_counter = 0;
-            __syncthreads();
+                s_counter -= portion;
 
-            int ind = blockIdx.y * gridDim.x + blockIdx.x;
+            __syncthreads();
 
-            if (ind < count)
+            if (subTaskIdx < portion)
             {
-                ushort2 pos = st1[ind];
+                pos.x += c_dx[threadIdx.x & 7];
+                pos.y += c_dy[threadIdx.x & 7];
 
-                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1)
                 {
-                    if (threadIdx.x < 8)
-                    {
-                        pos.x += c_dx[threadIdx.x];
-                        pos.y += c_dy[threadIdx.x];
-
-                        if (map.ptr(pos.y)[pos.x] == 1)
-                        {
-                            map.ptr(pos.y)[pos.x] = 2;
-
-                            ind = atomicInc(&s_counter, (unsigned int)(-1));
-
-                            s_st[ind] = pos;
-                        }
-                    }
-                    __syncthreads();
-
-                    while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
-                    {
-                        const int subTaskIdx = threadIdx.x >> 3;
-                        const int portion = ::min(s_counter, blockDim.x >> 3);
-
-                        pos.x = pos.y = 0;
-
-                        if (subTaskIdx < portion)
-                            pos = s_st[s_counter - 1 - subTaskIdx];
-                        __syncthreads();
-
-                        if (threadIdx.x == 0)
-                            s_counter -= portion;
-                        __syncthreads();
-
-                        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
-                        {
-                            pos.x += c_dx[threadIdx.x & 7];
-                            pos.y += c_dy[threadIdx.x & 7];
-
-                            if (map.ptr(pos.y)[pos.x] == 1)
-                            {
-                                map.ptr(pos.y)[pos.x] = 2;
-
-                                ind = atomicInc(&s_counter, (unsigned int)(-1));
-
-                                s_st[ind] = pos;
-                            }
-                        }
-                        __syncthreads();
-                    }
-
-                    if (s_counter > 0)
-                    {
-                        if (threadIdx.x == 0)
-                        {
-                            ind = atomicAdd(&counter, s_counter);
-                            s_ind = ind - s_counter;
-                        }
-                        __syncthreads();
-
-                        ind = s_ind;
-
-                        for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
-                        {
-                            st2[ind + i] = s_st[i];
-                        }
-                    }
+                    map(pos.y, pos.x) = 2;
+
+                    ind = Emulation::smem::atomicAdd(&s_counter, 1);
+
+                    s_st[ind] = pos;
                 }
             }
 
-        #endif
+            __syncthreads();
         }
 
-        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
+        if (s_counter > 0)
         {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
-            unsigned int count;
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
-
-            while (count > 0)
+            if (threadIdx.x == 0)
             {
-                cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
-
-                dim3 block(128, 1, 1);
-                dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
-                edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
-                cudaSafeCall( cudaGetLastError() );
+                ind = ::atomicAdd(&counter, s_counter);
+                s_ind = ind - s_counter;
+            }
 
-                cudaSafeCall( cudaDeviceSynchronize() );
+            __syncthreads();
 
-                cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+            ind = s_ind;
 
-                std::swap(st1, st2);
-            }
+            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+                st2[ind + i] = s_st[i];
         }
+    }
+}
 
-        __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
-        {
-            const int j = blockIdx.x * 16 + threadIdx.x;
-            const int i = blockIdx.y * 16 + threadIdx.y;
+namespace canny
+{
+    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
+    {
+        void* counter_ptr;
+        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, ::counter) );
 
-            if (i < rows && j < cols)
-                dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
+        int count;
+        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
 
-        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
+        while (count > 0)
         {
-            dim3 block(16, 16, 1);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+
+            const dim3 block(128);
+            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
 
-            getEdges<<<grid, block>>>(map, dst, rows, cols);
+            ::edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, count);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );
+
+            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+
+            std::swap(st1, st2);
         }
 
-    } // namespace canny
-}}} // namespace cv { namespace gpu { namespace device
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+    struct GetEdges : unary_function<int, uchar>
+    {
+        __device__ __forceinline__ uchar operator ()(int e) const
+        {
+            return (uchar)(-(e >> 1));
+        }
+
+        __device__ __forceinline__ GetEdges() {}
+        __device__ __forceinline__ GetEdges(const GetEdges&) {}
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits<GetEdges> : DefaultTransformFunctorTraits<GetEdges>
+    {
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace canny
+{
+    void getEdges(PtrStepSzi map, PtrStepSzb dst)
+    {
+        transform(map, dst, GetEdges(), WithOutMask(), 0);
+    }
+}
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 0bf9c81..b733faf 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -91,7 +91,6 @@ void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_n
 void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_nogpu(); }
-cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); }
 void cv::gpu::CannyBuf::release() { throw_nogpu(); }
 
@@ -1466,92 +1465,76 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 //////////////////////////////////////////////////////////////////////////////
 // Canny
 
-cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_) : dx(dx_), dy(dy_)
-{
-    CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
-
-    create(dx_.size(), -1);
-}
-
 void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 {
-    ensureSizeIsEnough(image_size, CV_32SC1, dx);
-    ensureSizeIsEnough(image_size, CV_32SC1, dy);
-
-    if (apperture_size == 3)
+    if (apperture_size > 0)
     {
-        ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
-        ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
-    }
-    else if(apperture_size > 0)
-    {
-        if (!filterDX)
+        ensureSizeIsEnough(image_size, CV_32SC1, dx);
+        ensureSizeIsEnough(image_size, CV_32SC1, dy);
+
+        if (apperture_size != 3)
+        {
             filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-        if (!filterDY)
             filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+        }
     }
 
-    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);
+    ensureSizeIsEnough(image_size, CV_32FC1, mag);
+    ensureSizeIsEnough(image_size, CV_32SC1, map);
 
-    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
-    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
 }
 
 void cv::gpu::CannyBuf::release()
 {
     dx.release();
    dy.release();
-    dx_buf.release();
-    dy_buf.release();
-    edgeBuf.release();
-    trackBuf1.release();
-    trackBuf2.release();
+    mag.release();
+    map.release();
+    st1.release();
+    st2.release();
 }
 
-namespace cv { namespace gpu { namespace device
+namespace canny
 {
-    namespace canny
-    {
-        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
-
-        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
-        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
 
-        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
 
-        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
+    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
 
-        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
+    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
 
-        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
-    }
-}}}
+    void getEdges(PtrStepSzi map, PtrStepSzb dst);
+}
 
 namespace
 {
-    void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
+    void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
     {
-        using namespace ::cv::gpu::device::canny;
+        using namespace canny;
 
-        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
+        calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
 
-        edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);
+        edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
 
-        edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);
+        edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
 
-        getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
+        getEdges(buf.map, dst);
     }
 }
 
 void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
 {
-    CannyBuf buf(src.size(), apperture_size);
+    CannyBuf buf;
     Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
 }
 
 void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
 {
-    using namespace ::cv::gpu::device::canny;
+    using namespace canny;
 
     CV_Assert(src.type() == CV_8UC1);
 
@@ -1562,37 +1545,37 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
         std::swap( low_thresh, high_thresh);
 
     dst.create(src.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
-
     buf.create(src.size(), apperture_size);
-    buf.edgeBuf.setTo(Scalar::all(0));
 
     if (apperture_size == 3)
     {
-        calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
+        Size wholeSize;
+        Point ofs;
+        src.locateROI(wholeSize, ofs);
+        GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
 
-        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+        calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient);
    }
     else
     {
         buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
         buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
 
-        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+        calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
     }
 
-    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+    CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
 
 void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
 {
-    CannyBuf buf(dx, dy);
+    CannyBuf buf;
     Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
 }
 
 void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
 {
-    using namespace ::cv::gpu::device::canny;
+    using namespace canny;
 
     CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
     CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
 
@@ -1601,17 +1584,11 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& d
         std::swap( low_thresh, high_thresh);
 
     dst.create(dx.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
-
-    buf.dx = dx; buf.dy = dy;
 
     buf.create(dx.size(), -1);
-    buf.edgeBuf.setTo(Scalar::all(0));
 
-    calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
+    calcMagnitude(dx, dy, buf.mag, L2gradient);
 
-    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+    CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
 
 #endif /* !defined (HAVE_CUDA) */
-
-
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index e77cad6..71d4a8e 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -313,7 +313,7 @@ TEST_P(Canny, Accuracy)
     cv::Mat edges_gold;
     cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient);
 
-    EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2);
+    EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
 }
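---

Usage sketch (not part of the patch): the refactored entry point keeps the public signature cv::gpu::Canny(image, buf, edges, low_thresh, high_thresh, apperture_size, L2gradient), with CannyBuf now holding the dx/dy/mag/map/st1/st2 intermediates so repeated calls on same-sized images reuse device memory. The snippet below only illustrates how the declarations above are called; the file names are hypothetical and the upload/download steps are ordinary OpenCV 2.4 GpuMat usage, not something introduced by this commit.

    #include <opencv2/core/core.hpp>
    #include <opencv2/highgui/highgui.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // Hypothetical 8-bit grayscale input (CV_8UC1), as required by the CV_Assert in gpu::Canny.
        cv::Mat img = cv::imread("input.png", 0);

        cv::gpu::GpuMat d_img(img);   // upload to the device
        cv::gpu::GpuMat d_edges;

        cv::gpu::CannyBuf buf;        // dx, dy, mag, map, st1, st2 are (re)allocated by buf.create() inside Canny
        cv::gpu::Canny(d_img, buf, d_edges, 50.0, 100.0, 3, false);

        cv::Mat edges;
        d_edges.download(edges);      // CV_8U edge map: 0 for non-edges, 255 for edges
        cv::imwrite("edges.png", edges);
        return 0;
    }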