#if !defined CUDA_DISABLER
#include <utility>
-#include <algorithm>
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/emulation.hpp"
+#include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/utility.hpp"
-namespace cv { namespace gpu { namespace device
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
{
- namespace canny
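+    // Gradient magnitude norms: L1 is |dx| + |dy|, L2 is sqrtf(dx*dx + dy*dy).
+    // They are plain binary functors so the device transform framework can
+    // apply them elementwise.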
+ struct L1 : binary_function<int, int, float>
{
- __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+ __device__ __forceinline__ float operator ()(int x, int y) const
{
- __shared__ int smem[16][18];
-
- const int j = blockIdx.x * blockDim.x + threadIdx.x;
- const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
- if (i < rows)
- {
- smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
- if (threadIdx.x == 0)
- {
- smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
- smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
- }
- __syncthreads();
-
- if (j < cols)
- {
- dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
- dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
- }
- }
+ return ::abs(x) + ::abs(y);
}
- void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+ __device__ __forceinline__ L1() {}
+ __device__ __forceinline__ L1(const L1&) {}
+ };
+ struct L2 : binary_function<int, int, float>
+ {
+ __device__ __forceinline__ float operator ()(int x, int y) const
{
- dim3 block(16, 16, 1);
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+ return ::sqrtf(x * x + y * y);
+ }
- calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
- cudaSafeCall( cudaGetLastError() );
+ __device__ __forceinline__ L2() {}
+ __device__ __forceinline__ L2(const L2&) {}
+ };
+}
- cudaSafeCall( cudaDeviceSynchronize() );
- }
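+// Tuning for the device transform framework: smart_shift = 4 enables the
+// vectorized kernel path, which processes four elements per thread.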
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits<L1> : DefaultTransformFunctorTraits<L1>
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits<L2> : DefaultTransformFunctorTraits<L2>
+ {
+ enum { smart_shift = 4 };
+ };
+}}}
- struct L1
- {
- static __device__ __forceinline__ float calc(int x, int y)
- {
- return ::abs(x) + ::abs(y);
- }
- };
- struct L2
- {
- static __device__ __forceinline__ float calc(int x, int y)
- {
- return ::sqrtf(x * x + y * y);
- }
- };
+namespace
+{
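+    // The source image is read through a 2D texture with point filtering and
+    // clamped addressing, so Sobel taps that fall outside the image are
+    // clamped to the nearest border pixel; no explicit range checks needed.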
+ texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
+ struct SrcTex
+ {
+ const int xoff;
+ const int yoff;
+ __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
- template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
- PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+ __device__ __forceinline__ int operator ()(int y, int x) const
{
- __shared__ int sdx[18][16];
- __shared__ int sdy[18][16];
+ return tex2D(tex_src, x + xoff, y + yoff);
+ }
+ };
- const int j = blockIdx.x * blockDim.x + threadIdx.x;
- const int i = blockIdx.y * blockDim.y + threadIdx.y;
+ template <class Norm> __global__
+ void calcMagnitude(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+ {
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;
- if (j < cols)
- {
- sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
- sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
- if (threadIdx.y == 0)
- {
- sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
- sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
+ if (y >= mag.rows || x >= mag.cols)
+ return;
- sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
- sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
- }
- __syncthreads();
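+        // 3x3 Sobel taps; out-of-range reads are handled by the texture's
+        // clamped addressing.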
+ int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
+ int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
- if (i < rows)
- {
- int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
- int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
+ dx(y, x) = dxVal;
+ dy(y, x) = dyVal;
- dx.ptr(i)[j] = x;
- dy.ptr(i)[j] = y;
+ mag(y, x) = norm(dxVal, dyVal);
+ }
+}
- mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
- }
- }
- }
+namespace canny
+{
+ void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+ {
+ const dim3 block(16, 16);
+ const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
- void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
- {
- dim3 block(16, 16, 1);
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
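+        // Bind the full parent image; SrcTex applies the ROI offset on each fetch.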
+ bindTexture(&tex_src, srcWhole);
+ SrcTex src(xoff, yoff);
- if (L2Grad)
- calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
- else
- calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+ if (L2Grad)
+ {
+ L2 norm;
+ ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
+ }
+ else
+ {
+ L1 norm;
+ ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
+ }
- cudaSafeCall( cudaGetLastError() );
+ cudaSafeCall( cudaGetLastError() );
- cudaSafeCall(cudaThreadSynchronize());
- }
+        cudaSafeCall( cudaDeviceSynchronize() );
+ }
- template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+ void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+ {
+ if (L2Grad)
{
- const int j = blockIdx.x * blockDim.x + threadIdx.x;
- const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
- if (i < rows && j < cols)
- mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
+ L2 norm;
+ transform(dx, dy, mag, norm, WithOutMask(), 0);
}
-
- void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+ else
{
- dim3 block(16, 16, 1);
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+ L1 norm;
+ transform(dx, dy, mag, norm, WithOutMask(), 0);
+ }
+ }
+}
- if (L2Grad)
- calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
- else
- calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
+//////////////////////////////////////////////////////////////////////////////////////////
- cudaSafeCall( cudaGetLastError() );
+namespace
+{
+ texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
- cudaSafeCall( cudaDeviceSynchronize() );
- }
+ __global__ void calcMap(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
+ {
+ const int CANNY_SHIFT = 15;
+ const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
- //////////////////////////////////////////////////////////////////////////////////////////
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;
- #define CANNY_SHIFT 15
- #define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+ if (x >= dx.cols || y >= dx.rows)
+ return;
- __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
- {
- __shared__ float smem[18][18];
+ int dxVal = dx(y, x);
+ int dyVal = dy(y, x);
- const int j = blockIdx.x * 16 + threadIdx.x;
- const int i = blockIdx.y * 16 + threadIdx.y;
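+        // s picks the diagonal for non-maximum suppression: +1 when dx and dy
+        // have the same sign, -1 otherwise.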
+ const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
+ const float m = tex2D(tex_mag, x, y);
- const int tid = threadIdx.y * 16 + threadIdx.x;
- const int lx = tid % 18;
- const int ly = tid / 18;
+ dxVal = ::abs(dxVal);
+ dyVal = ::abs(dyVal);
- if (ly < 14)
- smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+        // 0 - the pixel cannot belong to an edge
+ // 1 - the pixel might belong to an edge
+ // 2 - the pixel does belong to an edge
+ int edge_type = 0;
- if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
- smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+ if (m > low_thresh)
+ {
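+            // Compare |dy| / |dx| against tan(22.5 deg) and tan(67.5 deg) in
+            // Q15 fixed point (tan 67.5 = tan 22.5 + 2) to pick one of the
+            // three suppression directions.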
+ const int tg22x = dxVal * TG22;
+ const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
- __syncthreads();
+ dyVal <<= CANNY_SHIFT;
- if (i < rows && j < cols)
+ if (dyVal < tg22x)
+ {
+ if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
+ edge_type = 1 + (int)(m > high_thresh);
+ }
+ else if(dyVal > tg67x)
{
- int x = dx.ptr(i)[j];
- int y = dy.ptr(i)[j];
- const int s = (x ^ y) < 0 ? -1 : 1;
- const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
+ if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
+ edge_type = 1 + (int)(m > high_thresh);
+ }
+ else
+ {
+ if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
+ edge_type = 1 + (int)(m > high_thresh);
+ }
+ }
- x = ::abs(x);
- y = ::abs(y);
+ map(y, x) = edge_type;
+ }
+}
- // 0 - the pixel can not belong to an edge
- // 1 - the pixel might belong to an edge
- // 2 - the pixel does belong to an edge
- int edge_type = 0;
+namespace canny
+{
+ void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
+ {
+ const dim3 block(16, 16);
+ const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
- if (m > low_thresh)
- {
- const int tg22x = x * TG22;
- const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
-
- y <<= CANNY_SHIFT;
-
- if (y < tg22x)
- {
- if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
- edge_type = 1 + (int)(m > high_thresh);
- }
- else if( y > tg67x )
- {
- if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
- edge_type = 1 + (int)(m > high_thresh);
- }
- else
- {
- if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
- edge_type = 1 + (int)(m > high_thresh);
- }
- }
+ bindTexture(&tex_mag, mag);
- map.ptr(i + 1)[j + 1] = edge_type;
- }
- }
+ ::calcMap<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
+ cudaSafeCall( cudaGetLastError() );
- #undef CANNY_SHIFT
- #undef TG22
+ cudaSafeCall( cudaDeviceSynchronize() );
+ }
+}
- void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+//////////////////////////////////////////////////////////////////////////////////////////
+
+namespace
+{
+ __device__ int counter = 0;
+
+ __global__ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st)
+ {
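+        // 16x16 tile plus a one-pixel apron, so each thread can read its full
+        // 8-neighborhood from shared memory.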
+ __shared__ volatile int smem[18][18];
+
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
+        // Load the apron; out-of-image neighbors are treated as non-edges (0).
+        if (threadIdx.y == 0)
+            smem[0][threadIdx.x + 1] = y > 0 && x < map.cols ? map(y - 1, x) : 0;
+        if (threadIdx.y == blockDim.y - 1)
+            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows && x < map.cols ? map(y + 1, x) : 0;
+        if (threadIdx.x == 0)
+            smem[threadIdx.y + 1][0] = x > 0 && y < map.rows ? map(y, x - 1) : 0;
+        if (threadIdx.x == blockDim.x - 1)
+            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols && y < map.rows ? map(y, x + 1) : 0;
+ if (threadIdx.x == 0 && threadIdx.y == 0)
+ smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
+ if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
+ smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
+ if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
+ smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
+ if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
+ smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
+
+ __syncthreads();
+
+ if (x >= map.cols || y >= map.rows)
+ return;
+
+ int n;
+
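+        // Iteratively promote 'maybe' (1) pixels that touch a 'definite' (2)
+        // pixel; 16 sweeps let a label propagate across the whole tile.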
+ #pragma unroll
+ for (int k = 0; k < 16; ++k)
{
- dim3 block(16, 16, 1);
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+ n = 0;
- calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
- cudaSafeCall( cudaGetLastError() );
+ if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
+ {
+ n += smem[threadIdx.y ][threadIdx.x ] == 2;
+ n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
+ n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
- cudaSafeCall( cudaDeviceSynchronize() );
- }
+ n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
+ n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
- //////////////////////////////////////////////////////////////////////////////////////////
+ n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
+ n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
+ n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+ }
- __device__ unsigned int counter = 0;
+ if (n > 0)
+ smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+ }
- __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
- {
- #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)
+ const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
- __shared__ int smem[18][18];
+ map(y, x) = e;
- const int j = blockIdx.x * 16 + threadIdx.x;
- const int i = blockIdx.y * 16 + threadIdx.y;
+ n = 0;
- const int tid = threadIdx.y * 16 + threadIdx.x;
- const int lx = tid % 18;
- const int ly = tid / 18;
+ if (e == 2)
+ {
+ n += smem[threadIdx.y ][threadIdx.x ] == 1;
+ n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
+ n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
- if (ly < 14)
- smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+ n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
+ n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
- if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
- smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+ n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
+ n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
+ n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
+ }
- __syncthreads();
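+        // A definite edge that still touches 'maybe' pixels is queued so the
+        // global pass can continue propagation across block borders.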
+ if (n > 0)
+ {
+ const int ind = ::atomicAdd(&counter, 1);
+ st[ind] = make_ushort2(x, y);
+ }
+ }
+}
- if (i < rows && j < cols)
- {
- int n;
+namespace canny
+{
+ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
+ {
+ void* counter_ptr;
+ cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
- #pragma unroll
- for (int k = 0; k < 16; ++k)
- {
- n = 0;
+        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
- if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
- {
- n += smem[threadIdx.y ][threadIdx.x ] == 2;
- n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
- n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
+ const dim3 block(16, 16);
+ const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
- n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
- n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+ ::edgesHysteresisLocal<<<grid, block>>>(map, st1);
+ cudaSafeCall( cudaGetLastError() );
- n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
- n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
- n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
- }
+ cudaSafeCall( cudaDeviceSynchronize() );
+ }
+}
- if (n > 0)
- smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
- }
+//////////////////////////////////////////////////////////////////////////////////////////
- const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
+namespace
+{
+ __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
+ __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
- map.ptr(i + 1)[j + 1] = e;
+ __global__ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
+ {
+ const int stack_size = 512;
- n = 0;
+ __shared__ int s_counter;
+ __shared__ int s_ind;
+ __shared__ ushort2 s_st[stack_size];
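+    // Block-local stack of edge pixels awaiting expansion; leftovers are
+    // flushed to the global queue st2 at the end of the kernel.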
- if (e == 2)
- {
- n += smem[threadIdx.y ][threadIdx.x ] == 1;
- n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
- n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
+ if (threadIdx.x == 0)
+ s_counter = 0;
- n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
- n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+ __syncthreads();
- n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
- n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
- n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
- }
+ int ind = blockIdx.y * gridDim.x + blockIdx.x;
- if (n > 0)
- {
- const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
- st[ind] = make_ushort2(j + 1, i + 1);
- }
- }
+ if (ind >= count)
+ return;
- #endif
- }
+ ushort2 pos = st1[ind];
- void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
+ if (threadIdx.x < 8)
{
- void* counter_ptr;
- cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
- cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+ pos.x += c_dx[threadIdx.x];
+ pos.y += c_dy[threadIdx.x];
- dim3 block(16, 16, 1);
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+        if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
+ {
+ map(pos.y, pos.x) = 2;
- edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
- cudaSafeCall( cudaGetLastError() );
+ ind = Emulation::smem::atomicAdd(&s_counter, 1);
- cudaSafeCall( cudaDeviceSynchronize() );
+ s_st[ind] = pos;
+ }
}
- __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
- __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
+ __syncthreads();
- __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
+ while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
- #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120
+ const int subTaskIdx = threadIdx.x >> 3;
+ const int portion = ::min(s_counter, blockDim.x >> 3);
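+        // Eight threads cooperate on each popped pixel, one per neighbor
+        // direction (threadIdx.x & 7 indexes c_dx/c_dy).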
- const int stack_size = 512;
+ if (subTaskIdx < portion)
+ pos = s_st[s_counter - 1 - subTaskIdx];
- __shared__ unsigned int s_counter;
- __shared__ unsigned int s_ind;
- __shared__ ushort2 s_st[stack_size];
+ __syncthreads();
if (threadIdx.x == 0)
- s_counter = 0;
- __syncthreads();
+ s_counter -= portion;
- int ind = blockIdx.y * gridDim.x + blockIdx.x;
+ __syncthreads();
- if (ind < count)
+ if (subTaskIdx < portion)
{
- ushort2 pos = st1[ind];
+ pos.x += c_dx[threadIdx.x & 7];
+ pos.y += c_dy[threadIdx.x & 7];
- if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+            if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
- if (threadIdx.x < 8)
- {
- pos.x += c_dx[threadIdx.x];
- pos.y += c_dy[threadIdx.x];
-
- if (map.ptr(pos.y)[pos.x] == 1)
- {
- map.ptr(pos.y)[pos.x] = 2;
-
- ind = atomicInc(&s_counter, (unsigned int)(-1));
-
- s_st[ind] = pos;
- }
- }
- __syncthreads();
-
- while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
- {
- const int subTaskIdx = threadIdx.x >> 3;
- const int portion = ::min(s_counter, blockDim.x >> 3);
-
- pos.x = pos.y = 0;
-
- if (subTaskIdx < portion)
- pos = s_st[s_counter - 1 - subTaskIdx];
- __syncthreads();
-
- if (threadIdx.x == 0)
- s_counter -= portion;
- __syncthreads();
-
- if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
- {
- pos.x += c_dx[threadIdx.x & 7];
- pos.y += c_dy[threadIdx.x & 7];
-
- if (map.ptr(pos.y)[pos.x] == 1)
- {
- map.ptr(pos.y)[pos.x] = 2;
-
- ind = atomicInc(&s_counter, (unsigned int)(-1));
-
- s_st[ind] = pos;
- }
- }
- __syncthreads();
- }
-
- if (s_counter > 0)
- {
- if (threadIdx.x == 0)
- {
- ind = atomicAdd(&counter, s_counter);
- s_ind = ind - s_counter;
- }
- __syncthreads();
-
- ind = s_ind;
-
- for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
- {
- st2[ind + i] = s_st[i];
- }
- }
+ map(pos.y, pos.x) = 2;
+
+ ind = Emulation::smem::atomicAdd(&s_counter, 1);
+
+ s_st[ind] = pos;
}
}
- #endif
+ __syncthreads();
}
- void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
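+    // Flush whatever remains on the block-local stack to the global queue;
+    // a single atomicAdd reserves the block's output range in st2.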
+ if (s_counter > 0)
{
- void* counter_ptr;
- cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
- unsigned int count;
- cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
-
- while (count > 0)
+ if (threadIdx.x == 0)
{
- cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
-
- dim3 block(128, 1, 1);
- dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
- edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
- cudaSafeCall( cudaGetLastError() );
+ ind = ::atomicAdd(&counter, s_counter);
+ s_ind = ind - s_counter;
+ }
- cudaSafeCall( cudaDeviceSynchronize() );
+ __syncthreads();
- cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+ ind = s_ind;
- std::swap(st1, st2);
- }
+ for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+ st2[ind + i] = s_st[i];
}
+ }
+}
- __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
- {
- const int j = blockIdx.x * 16 + threadIdx.x;
- const int i = blockIdx.y * 16 + threadIdx.y;
+namespace canny
+{
+ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
+ {
+ void* counter_ptr;
+ cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, ::counter) );
- if (i < rows && j < cols)
- dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
- }
+ int count;
+ cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
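+    // Each pass expands the queued candidates; st1 and st2 ping-pong until a
+    // launch produces no new edge pixels.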
- void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
+ while (count > 0)
{
- dim3 block(16, 16, 1);
- dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+ cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+
+ const dim3 block(128);
+        const dim3 grid(::min(count, 65535), divUp(count, 65535), 1);
- getEdges<<<grid, block>>>(map, dst, rows, cols);
+ ::edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
+
+ cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+
+ std::swap(st1, st2);
}
- } // namespace canny
-}}} // namespace cv { namespace gpu { namespace device
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+namespace
+{
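+    // Final pass: 'definite edge' pixels (map value 2) become 255, everything
+    // else 0; -(e >> 1) is 0 for e in {0, 1} and 0xFF for e == 2.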
+ struct GetEdges : unary_function<int, uchar>
+ {
+ __device__ __forceinline__ uchar operator ()(int e) const
+ {
+ return (uchar)(-(e >> 1));
+ }
+
+ __device__ __forceinline__ GetEdges() {}
+ __device__ __forceinline__ GetEdges(const GetEdges&) {}
+ };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits<GetEdges> : DefaultTransformFunctorTraits<GetEdges>
+ {
+ enum { smart_shift = 4 };
+ };
+}}}
+
+namespace canny
+{
+ void getEdges(PtrStepSzi map, PtrStepSzb dst)
+ {
+ transform(map, dst, GetEdges(), WithOutMask(), 0);
+ }
+}
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_nogpu(); }
-cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); }
void cv::gpu::CannyBuf::release() { throw_nogpu(); }
//////////////////////////////////////////////////////////////////////////////
// Canny
-cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_) : dx(dx_), dy(dy_)
-{
- CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
-
- create(dx_.size(), -1);
-}
-
void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
{
- ensureSizeIsEnough(image_size, CV_32SC1, dx);
- ensureSizeIsEnough(image_size, CV_32SC1, dy);
-
- if (apperture_size == 3)
+ if (apperture_size > 0)
{
- ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
- ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
- }
- else if(apperture_size > 0)
- {
- if (!filterDX)
+ ensureSizeIsEnough(image_size, CV_32SC1, dx);
+ ensureSizeIsEnough(image_size, CV_32SC1, dy);
+
+ if (apperture_size != 3)
+ {
filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
- if (!filterDY)
filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+ }
}
- ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);
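+    // The magnitude and edge-label maps match the image size; st1/st2 hold
+    // the candidate queues for the two hysteresis passes.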
+ ensureSizeIsEnough(image_size, CV_32FC1, mag);
+ ensureSizeIsEnough(image_size, CV_32SC1, map);
- ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
- ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
+ ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
+ ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
}
void cv::gpu::CannyBuf::release()
{
dx.release();
dy.release();
- dx_buf.release();
- dy_buf.release();
- edgeBuf.release();
- trackBuf1.release();
- trackBuf2.release();
+ mag.release();
+ map.release();
+ st1.release();
+ st2.release();
}
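+
+// A minimal usage sketch: one CannyBuf can be reused across same-sized inputs
+// to avoid reallocating the intermediate buffers on every call:
+//
+//     cv::gpu::CannyBuf buf;
+//     cv::gpu::GpuMat edges;
+//     cv::gpu::Canny(frame, buf, edges, 50.0, 100.0); // frame is a CV_8UC1 GpuMat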
-namespace cv { namespace gpu { namespace device
+namespace canny
{
- namespace canny
- {
- void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
-
- void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
- void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
+ void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+ void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
- void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
+ void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
- void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
+ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
- void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
+ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
- void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
- }
-}}}
+ void getEdges(PtrStepSzi map, PtrStepSzb dst);
+}
namespace
{
- void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
+ void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
{
- using namespace ::cv::gpu::device::canny;
+ using namespace canny;
- calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
+ calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
- edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);
+ edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
- edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);
+ edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
- getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
+ getEdges(buf.map, dst);
}
}
void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
- CannyBuf buf(src.size(), apperture_size);
+ CannyBuf buf;
Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
}
void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
- using namespace ::cv::gpu::device::canny;
+ using namespace canny;
CV_Assert(src.type() == CV_8UC1);
    if (low_thresh > high_thresh)
        std::swap( low_thresh, high_thresh);
dst.create(src.size(), CV_8U);
- dst.setTo(Scalar::all(0));
-
buf.create(src.size(), apperture_size);
- buf.edgeBuf.setTo(Scalar::all(0));
if (apperture_size == 3)
{
- calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
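+        // Locate the parent image so the texture path can read true neighbor
+        // pixels just outside the ROI instead of replicating the ROI border.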
+ Size wholeSize;
+ Point ofs;
+ src.locateROI(wholeSize, ofs);
+ GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
- calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+ calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient);
}
else
{
buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
- calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+ calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
}
- CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+ CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
- CannyBuf buf(dx, dy);
+ CannyBuf buf;
Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
}
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
- using namespace ::cv::gpu::device::canny;
+ using namespace canny;
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
    if (low_thresh > high_thresh)
        std::swap( low_thresh, high_thresh);
dst.create(dx.size(), CV_8U);
- dst.setTo(Scalar::all(0));
-
- buf.dx = dx; buf.dy = dy;
buf.create(dx.size(), -1);
- buf.edgeBuf.setTo(Scalar::all(0));
- calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
+ calcMagnitude(dx, dy, buf.mag, L2gradient);
- CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+ CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
}
#endif /* !defined (HAVE_CUDA) */
-
-