static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float high_thresh,
int aperture_size, bool L2gradient, int cn, const Size & size)
{
- UMat dx(size, CV_16SC(cn)), dy(size, CV_16SC(cn));
+ UMat map;
+
+ const ocl::Device &dev = ocl::Device::getDefault();
+ int max_wg_size = (int)dev.maxWorkGroupSize();
+
+ int lSizeX = 32;
+ int lSizeY = max_wg_size / 32;
+
+ if (lSizeY == 0)
+ {
+ lSizeX = 16;
+ lSizeY = max_wg_size / 16;
+ }
+ if (lSizeY == 0)
+ {
+ lSizeY = 1;
+ }
if (L2gradient)
{
high_thresh *= high_thresh;
}
int low = cvFloor(low_thresh), high = cvFloor(high_thresh);
- Size esize(size.width + 2, size.height + 2);
-
- UMat mag;
- size_t globalsize[2] = { size.width, size.height }, localsize[2] = { 16, 16 };
if (aperture_size == 3 && !_src.isSubmatrix())
{
- // Sobel calculation
- char cvt[2][40];
- ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc,
- format("-D OP_SOBEL -D cn=%d -D shortT=%s -D ucharT=%s"
- " -D convertToIntT=%s -D intT=%s -D convertToShortT=%s", cn,
- ocl::typeToStr(CV_16SC(cn)),
- ocl::typeToStr(CV_8UC(cn)),
- ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
- ocl::typeToStr(CV_32SC(cn)),
- ocl::convertTypeStr(CV_32S, CV_16S, cn, cvt[1])));
- if (calcSobelRowPassKernel.empty())
+ /*
+ stage1_with_sobel:
+ Sobel operator
+ Calc magnitudes
+ Non maxima suppression
+ Double thresholding
+ */
+ char cvt[40];
+ ocl::Kernel with_sobel("stage1_with_sobel", ocl::imgproc::canny_oclsrc,
+ format("-D WITH_SOBEL -D cn=%d -D TYPE=%s -D convert_intN=%s -D intN=%s -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s",
+ cn, ocl::memopTypeToStr(_src.depth()),
+ ocl::convertTypeStr(_src.type(), CV_32SC(cn), cn, cvt),
+ ocl::memopTypeToStr(CV_32SC(cn)),
+ lSizeX, lSizeY,
+ L2gradient ? " -D L2GRAD" : ""));
+ if (with_sobel.empty())
return false;
- UMat src = _src.getUMat(), dxBuf(size, CV_16SC(cn)), dyBuf(size, CV_16SC(cn));
- calcSobelRowPassKernel.args(ocl::KernelArg::ReadOnly(src),
- ocl::KernelArg::WriteOnlyNoSize(dxBuf),
- ocl::KernelArg::WriteOnlyNoSize(dyBuf));
+ UMat src = _src.getUMat();
+ map.create(size, CV_32S);
+ with_sobel.args(ocl::KernelArg::ReadOnly(src),
+ ocl::KernelArg::WriteOnlyNoSize(map),
+ low, high);
- if (!calcSobelRowPassKernel.run(2, globalsize, localsize, false))
- return false;
-
- // magnitude calculation
- ocl::Kernel magnitudeKernel("calcMagnitude_buf", ocl::imgproc::canny_oclsrc,
- format("-D cn=%d%s -D OP_MAG_BUF -D shortT=%s -D convertToIntT=%s -D intT=%s",
- cn, L2gradient ? " -D L2GRAD" : "",
- ocl::typeToStr(CV_16SC(cn)),
- ocl::convertTypeStr(CV_16S, CV_32S, cn, cvt[0]),
- ocl::typeToStr(CV_32SC(cn))));
- if (magnitudeKernel.empty())
- return false;
-
- mag = UMat(esize, CV_32SC1, Scalar::all(0));
- dx.create(size, CV_16SC(cn));
- dy.create(size, CV_16SC(cn));
+ size_t globalsize[2] = { size.width, size.height },
+ localsize[2] = { lSizeX, lSizeY };
- magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dxBuf), ocl::KernelArg::ReadOnlyNoSize(dyBuf),
- ocl::KernelArg::WriteOnlyNoSize(dx), ocl::KernelArg::WriteOnlyNoSize(dy),
- ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width);
-
- if (!magnitudeKernel.run(2, globalsize, localsize, false))
+ if (!with_sobel.run(2, globalsize, localsize, false))
return false;
}
else
{
+ /*
+ stage1_without_sobel:
+ Calc magnitudes
+ Non maxima suppression
+ Double thresholding
+ */
+ UMat dx, dy;
Sobel(_src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
Sobel(_src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
- // magnitude calculation
- ocl::Kernel magnitudeKernel("calcMagnitude", ocl::imgproc::canny_oclsrc,
- format("-D OP_MAG -D cn=%d%s -D intT=int -D shortT=short -D convertToIntT=convert_int_sat",
- cn, L2gradient ? " -D L2GRAD" : ""));
- if (magnitudeKernel.empty())
+ ocl::Kernel without_sobel("stage1_without_sobel", ocl::imgproc::canny_oclsrc,
+ format("-D WITHOUT_SOBEL -D cn=%d -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s",
+ cn, lSizeX, lSizeY, L2gradient ? " -D L2GRAD" : ""));
+ if (without_sobel.empty())
return false;
- mag = UMat(esize, CV_32SC1, Scalar::all(0));
- magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
- ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width);
+ map.create(size, CV_32S);
+ without_sobel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
+ ocl::KernelArg::WriteOnly(map),
+ low, high);
+
+ size_t globalsize[2] = { size.width, size.height },
+ localsize[2] = { lSizeX, lSizeY };
- if (!magnitudeKernel.run(2, globalsize, NULL, false))
+ if (!without_sobel.run(2, globalsize, localsize, false))
return false;
}
- // map calculation
- ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc,
- format("-D OP_MAP -D cn=%d", cn));
- if (calcMapKernel.empty())
- return false;
-
- UMat map(esize, CV_32SC1);
- calcMapKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
- ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map),
- size.height, size.width, low, high);
-
- if (!calcMapKernel.run(2, globalsize, localsize, false))
- return false;
+ int PIX_PER_WI = 8;
+ /*
+ stage2:
+ hysteresis (add weak edges if they are connected with strong edges)
+ */
- // local hysteresis thresholding
- ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc,
- "-D OP_HYST_LOCAL");
- if (edgesHysteresisLocalKernel.empty())
- return false;
+ ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc,
+ format("-D STAGE2 -D PIX_PER_WI=%d", PIX_PER_WI));
- UMat stack(1, size.area(), CV_16UC2), counter(1, 1, CV_32SC1, Scalar::all(0));
- edgesHysteresisLocalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack),
- ocl::KernelArg::PtrReadWrite(counter), size.height, size.width);
- if (!edgesHysteresisLocalKernel.run(2, globalsize, localsize, false))
+ if (edgesHysteresis.empty())
return false;
- // global hysteresis thresholding
- UMat stack2(1, size.area(), CV_16UC2);
- int count;
-
- for ( ; ; )
- {
- ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc,
- "-D OP_HYST_GLOBAL");
- if (edgesHysteresisGlobalKernel.empty())
- return false;
-
- {
- Mat _counter = counter.getMat(ACCESS_RW);
- count = _counter.at<int>(0, 0);
- if (count == 0)
- break;
-
- _counter.at<int>(0, 0) = 0;
- }
-
- edgesHysteresisGlobalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack),
- ocl::KernelArg::PtrReadWrite(stack2), ocl::KernelArg::PtrReadWrite(counter),
- size.height, size.width, count);
+ edgesHysteresis.args(ocl::KernelArg::ReadWrite(map));
-#define divUp(total, grain) ((total + grain - 1) / grain)
- size_t localsize2[2] = { 128, 1 }, globalsize2[2] = { std::min(count, 65535) * 128, divUp(count, 65535) };
-#undef divUp
+ int sizey = lSizeY / PIX_PER_WI;
+ if (sizey == 0)
+ sizey = 1;
- if (!edgesHysteresisGlobalKernel.run(2, globalsize2, localsize2, false))
- return false;
+ size_t globalsize[2] = { size.width, size.height / PIX_PER_WI }, localsize[2] = { lSizeX, sizey };
- std::swap(stack, stack2);
- }
+ if (!edgesHysteresis.run(2, globalsize, localsize, false))
+ return false;
// get edges
- ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc, "-D OP_EDGES");
+
+ ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc,
+ format("-D GET_EDGES -D PIX_PER_WI=%d", PIX_PER_WI));
if (getEdgesKernel.empty())
return false;
_dst.create(size, CV_8UC1);
UMat dst = _dst.getUMat();
- getEdgesKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::WriteOnly(dst));
+ getEdgesKernel.args(ocl::KernelArg::ReadOnly(map), ocl::KernelArg::WriteOnlyNoSize(dst));
return getEdgesKernel.run(2, globalsize, NULL, false);
}
//
//M*/
-#ifdef OP_SOBEL
+#define TG22 0.4142135623730950488016887242097f
+#define TG67 2.4142135623730950488016887242097f
-#if cn != 3
-#define loadpix(addr) convertToIntT(*(__global const ucharT *)(addr))
-#define storepix(val, addr) *(__global shortT *)(addr) = convertToShortT(val)
-#define shortSize (int)sizeof(shortT)
+#ifdef WITH_SOBEL
+
+#if cn == 1
+#define loadpix(addr) convert_intN(*(__global const TYPE *)(addr))
#else
-#define loadpix(addr) convertToIntT(vload3(0, (__global const uchar *)(addr)))
-#define storepix(val, addr) vstore3(convertToShortT(val), 0, (__global short *)(addr))
-#define shortSize (int)sizeof(short) * cn
+#define loadpix(addr) convert_intN(vload3(0, (__global const TYPE *)(addr)))
#endif
+#define storepix(value, addr) *(__global int *)(addr) = (int)(value)
+
+/*
+ stage1_with_sobel:
+ Sobel operator
+ Calc magnitudes
+ Non maxima suppression
+ Double thresholding
+*/
+
+__constant int prev[4][2] = {
+ { 0, -1 },
+ { -1, 1 },
+ { -1, 0 },
+ { -1, -1 }
+};
-// Smoothing perpendicular to the derivative direction with a triangle filter
-// only support 3x3 Sobel kernel
-// h (-1) = 1, h (0) = 2, h (1) = 1
-// h'(-1) = -1, h'(0) = 0, h'(1) = 1
-// thus sobel 2D operator can be calculated as:
-// h'(x, y) = h'(x)h(y) for x direction
-//
-// src input 8bit single channel image data
-// dx_buf output dx buffer
-// dy_buf output dy buffer
+__constant int next[4][2] = {
+ { 0, 1 },
+ { 1, -1 },
+ { 1, 0 },
+ { 1, 1 }
+};
-__kernel void calcSobelRowPass(__global const uchar * src, int src_step, int src_offset, int rows, int cols,
- __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
- __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset)
+inline int3 sobel(int idx, __local const intN *smem)
{
- int gidx = get_global_id(0);
- int gidy = get_global_id(1);
+ // result: x, y, mag
+ int3 res;
- int lidx = get_local_id(0);
- int lidy = get_local_id(1);
+ intN dx = smem[idx + 2] - smem[idx]
+ + 2 * (smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4])
+ + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8];
- __local intT smem[16][18];
+ intN dy = smem[idx] - smem[idx + 2 * GRP_SIZEX + 8]
+ + 2 * (smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9])
+ + smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10];
- smem[lidy][lidx + 1] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(gidx, cn, src_offset)));
- if (lidx == 0)
+#ifdef L2GRAD
+ intN magN = dx * dx + dy * dy;
+#else
+ intN magN = convert_intN(abs(dx) + abs(dy));
+#endif
+#if cn == 1
+ res.z = magN;
+ res.x = dx;
+ res.y = dy;
+#else
+ res.z = max(magN.x, max(magN.y, magN.z));
+ if (res.z == magN.y)
{
- smem[lidy][0] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(max(gidx - 1, 0), cn, src_offset)));
- smem[lidy][17] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(min(gidx + 16, cols - 1), cn, src_offset)));
+ dx.x = dx.y;
+ dy.x = dy.y;
}
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (gidy < rows && gidx < cols)
+ else if (res.z == magN.z)
{
- storepix(smem[lidy][lidx + 2] - smem[lidy][lidx],
- dx_buf + mad24(gidy, dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
- storepix(mad24(2, smem[lidy][lidx + 1], smem[lidy][lidx] + smem[lidy][lidx + 2]),
- dy_buf + mad24(gidy, dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
+ dx.x = dx.z;
+ dy.x = dy.z;
}
-}
-
-#elif defined OP_MAG_BUF || defined OP_MAG
-
-inline intT calc(shortT x, shortT y)
-{
-#ifdef L2GRAD
- intT intx = convertToIntT(x), inty = convertToIntT(y);
- return intx * intx + inty * inty;
-#else
- return convertToIntT( (x >= (shortT)(0) ? x : -x) + (y >= (shortT)(0) ? y : -y) );
+ res.x = dx.x;
+ res.y = dy.x;
#endif
-}
-#ifdef OP_MAG
+ return res;
+}
-// calculate the magnitude of the filter pass combining both x and y directions
-// This is the non-buffered version(non-3x3 sobel)
-//
-// dx_buf dx buffer, calculated from calcSobelRowPass
-// dy_buf dy buffer, calculated from calcSobelRowPass
-// dx direvitive in x direction output
-// dy direvitive in y direction output
-// mag magnitude direvitive of xy output
-
-__kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_offset,
- __global const uchar * dyptr, int dy_step, int dy_offset,
- __global uchar * magptr, int mag_step, int mag_offset, int rows, int cols)
+__kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src_offset, int rows, int cols,
+ __global uchar *map, int map_step, int map_offset,
+ int low_thr, int high_thr)
{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (y < rows && x < cols)
- {
- int dx_index = mad24(dx_step, y, mad24(x, (int)sizeof(short) * cn, dx_offset));
- int dy_index = mad24(dy_step, y, mad24(x, (int)sizeof(short) * cn, dy_offset));
- int mag_index = mad24(mag_step, y + 1, mad24(x + 1, (int)sizeof(int), mag_offset));
+ __local intN smem[(GRP_SIZEX + 4) * (GRP_SIZEY + 4)];
- __global short * dx = (__global short *)(dxptr + dx_index);
- __global short * dy = (__global short *)(dyptr + dy_index);
- __global int * mag = (__global int *)(magptr + mag_index);
+ int lidx = get_local_id(0);
+ int lidy = get_local_id(1);
- int cmag = calc(dx[0], dy[0]);
-#if cn > 1
- short cx = dx[0], cy = dy[0];
- int pmag;
+ int start_x = GRP_SIZEX * get_group_id(0);
+ int start_y = GRP_SIZEY * get_group_id(1);
- #pragma unroll
- for (int i = 1; i < cn; ++i)
- {
- pmag = calc(dx[i], dy[i]);
- if (pmag > cmag)
- cmag = pmag, cx = dx[i], cy = dy[i];
- }
-
- dx[0] = cx, dy[0] = cy;
-#endif
- mag[0] = cmag;
+ int i = lidx + lidy * GRP_SIZEX;
+ for (int j = i; j < (GRP_SIZEX + 4) * (GRP_SIZEY + 4); j += GRP_SIZEX * GRP_SIZEY)
+ {
+ int x = clamp(start_x - 2 + (j % (GRP_SIZEX + 4)), 0, cols - 1);
+ int y = clamp(start_y - 2 + (j / (GRP_SIZEX + 4)), 0, rows - 1);
+ smem[j] = loadpix(src + mad24(y, src_step, mad24(x, cn * (int)sizeof(TYPE), src_offset)));
}
-}
-#elif defined OP_MAG_BUF
+ barrier(CLK_LOCAL_MEM_FENCE);
-#if cn != 3
-#define loadpix(addr) *(__global const shortT *)(addr)
-#define shortSize (int)sizeof(shortT)
-#else
-#define loadpix(addr) vload3(0, (__global const short *)(addr))
-#define shortSize (int)sizeof(short)*cn
-#endif
+ //// Sobel, Magnitude
+ //
-// calculate the magnitude of the filter pass combining both x and y directions
-// This is the buffered version(3x3 sobel)
-//
-// dx_buf dx buffer, calculated from calcSobelRowPass
-// dy_buf dy buffer, calculated from calcSobelRowPass
-// dx direvitive in x direction output
-// dy direvitive in y direction output
-// mag magnitude direvitive of xy output
-__kernel void calcMagnitude_buf(__global const uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
- __global const uchar * dy_buf, int dy_buf_step, int dy_buf_offset,
- __global uchar * dx, int dx_step, int dx_offset,
- __global uchar * dy, int dy_step, int dy_offset,
- __global uchar * mag, int mag_step, int mag_offset, int rows, int cols)
-{
- int gidx = get_global_id(0);
- int gidy = get_global_id(1);
+ __local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
- int lidx = get_local_id(0);
- int lidy = get_local_id(1);
+ lidx++;
+ lidy++;
- __local shortT sdx[18][16];
- __local shortT sdy[18][16];
-
- sdx[lidy + 1][lidx] = loadpix(dx_buf + mad24(min(gidy, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
- sdy[lidy + 1][lidx] = loadpix(dy_buf + mad24(min(gidy, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
- if (lidy == 0)
+ if (i < GRP_SIZEX + 2)
{
- sdx[0][lidx] = loadpix(dx_buf + mad24(clamp(gidy - 1, 0, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
- sdx[17][lidx] = loadpix(dx_buf + mad24(min(gidy + 16, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
-
- sdy[0][lidx] = loadpix(dy_buf + mad24(clamp(gidy - 1, 0, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
- sdy[17][lidx] = loadpix(dy_buf + mad24(min(gidy + 16, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
+ int grp_sizey = min(GRP_SIZEY + 1, rows - start_y);
+ mag[i] = (sobel(i, smem)).z;
+ mag[i + grp_sizey * (GRP_SIZEX + 2)] = (sobel(i + grp_sizey * (GRP_SIZEX + 4), smem)).z;
}
+ if (i < GRP_SIZEY + 2)
+ {
+ int grp_sizex = min(GRP_SIZEX + 1, cols - start_x);
+ mag[i * (GRP_SIZEX + 2)] = (sobel(i * (GRP_SIZEX + 4), smem)).z;
+ mag[i * (GRP_SIZEX + 2) + grp_sizex] = (sobel(i * (GRP_SIZEX + 4) + grp_sizex, smem)).z;
+ }
+
+ int idx = lidx + lidy * (GRP_SIZEX + 4);
+ i = lidx + lidy * (GRP_SIZEX + 2);
+
+ int3 res = sobel(idx, smem);
+ mag[i] = res.z;
+ int x = res.x;
+ int y = res.y;
+
barrier(CLK_LOCAL_MEM_FENCE);
- if (gidx < cols && gidy < rows)
- {
- shortT x = sdx[lidy + 1][lidx] * (shortT)(2) + sdx[lidy][lidx] + sdx[lidy + 2][lidx];
- shortT y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+ //// Threshold + Non maxima suppression
+ //
-#if cn == 1
- *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = x;
- *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = y;
+ /*
+ Sector numbers
- *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = calc(x, y);
-#elif cn == 3
- intT magv = calc(x, y);
- short cx = x.x, cy = y.x;
- int cmag = magv.x;
+ 3 2 1
+ * * *
+ * * *
+ 0*******0
+ * * *
+ * * *
+ 1 2 3
- if (cmag < magv.y)
- cx = x.y, cy = y.y, cmag = magv.y;
- if (cmag < magv.z)
- cx = x.z, cy = y.z, cmag = magv.z;
+ We need to determine arctg(dy / dx) to one of the four directions: 0, 45, 90 or 135 degrees.
+ Therefore if abs(dy / dx) belongs to the interval
+ [0, tg(22.5)] -> 0 direction
+ [tg(22.5), tg(67.5)] -> 1 or 3
+ [tg(67,5), +oo) -> 2
- *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = cx;
- *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = cy;
+ Since tg(67.5) = 1 / tg(22.5), if we take
+ a = abs(dy / dx) * tg(22.5) and b = abs(dy / dx) * tg(67.5)
+ we can get another intervals
- *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = cmag;
-#endif
- }
-}
+ in case a:
+ [0, tg(22.5)^2] -> 0
+ [tg(22.5)^2, 1] -> 1, 3
+ [1, +oo) -> 2
-#endif
+ in case b:
+ [0, 1] -> 0
+ [1, tg(67.5)^2] -> 1,3
+ [tg(67.5)^2, +oo) -> 2
-#elif defined OP_MAP
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// 0.4142135623730950488016887242097 is tan(22.5)
-
-#define CANNY_SHIFT 15
-#define TG22 (int)(0.4142135623730950488016887242097f*(1<<CANNY_SHIFT) + 0.5f)
-
-// First pass of edge detection and non-maximum suppression
-// edgetype is set to for each pixel:
-// 0 - below low thres, not an edge
-// 1 - maybe an edge
-// 2 - is an edge, either magnitude is greater than high thres, or
-// Given estimates of the image gradients, a search is then carried out
-// to determine if the gradient magnitude assumes a local maximum in the gradient direction.
-// if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
-// if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
-// if the rounded gradient angle is 135 degrees (i.e. the edge is in the north east-south west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north west and south east directions,
-// if the rounded gradient angle is 45 degrees (i.e. the edge is in the north west-south east direction)the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north east and south west directions.
-//
-// dx, dy direvitives of x and y direction
-// mag magnitudes calculated from calcMagnitude function
-// map output containing raw edge types
-
-__kernel void calcMap(__global const uchar * dx, int dx_step, int dx_offset,
- __global const uchar * dy, int dy_step, int dy_offset,
- __global const uchar * mag, int mag_step, int mag_offset,
- __global uchar * map, int map_step, int map_offset,
- int rows, int cols, int low_thresh, int high_thresh)
-{
- __local int smem[18][18];
+ that can help to find direction without conditions.
+
+ 0 - might belong to an edge
+ 1 - pixel doesn't belong to an edge
+ 2 - belong to an edge
+ */
int gidx = get_global_id(0);
int gidy = get_global_id(1);
- int lidx = get_local_id(0);
- int lidy = get_local_id(1);
-
- int grp_idx = get_global_id(0) & 0xFFFFF0;
- int grp_idy = get_global_id(1) & 0xFFFFF0;
+ if (gidx >= cols || gidy >= rows)
+ return;
- int tid = mad24(lidy, 16, lidx);
- int lx = tid % 18;
- int ly = tid / 18;
+ int mag0 = mag[i];
- mag += mag_offset;
- if (ly < 14)
- smem[ly][lx] = *(__global const int *)(mag +
- mad24(mag_step, min(grp_idy + ly, rows - 1), (int)sizeof(int) * (grp_idx + lx)));
- if (ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
- smem[ly + 14][lx] = *(__global const int *)(mag +
- mad24(mag_step, min(grp_idy + ly + 14, rows - 1), (int)sizeof(int) * (grp_idx + lx)));
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (gidy < rows && gidx < cols)
+ int value = 1;
+ if (mag0 > low_thr)
{
- // 0 - the pixel can not belong to an edge
- // 1 - the pixel might belong to an edge
- // 2 - the pixel does belong to an edge
- int edge_type = 0;
- int m = smem[lidy + 1][lidx + 1];
+ int a = (y / (float)x) * TG22;
+ int b = (y / (float)x) * TG67;
- if (m > low_thresh)
- {
- short xs = *(__global const short *)(dx + mad24(gidy, dx_step, mad24(gidx, (int)sizeof(short) * cn, dx_offset)));
- short ys = *(__global const short *)(dy + mad24(gidy, dy_step, mad24(gidx, (int)sizeof(short) * cn, dy_offset)));
- int x = abs(xs), y = abs(ys);
+ a = min((int)abs(a), 1) + 1;
+ b = min((int)abs(b), 1);
- int tg22x = x * TG22;
- y <<= CANNY_SHIFT;
+ // a = { 1, 2 }
+ // b = { 0, 1 }
+ // a * b = { 0, 1, 2 } - directions that we need ( + 3 if x ^ y < 0)
- if (y < tg22x)
- {
- if (m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
- edge_type = 1 + (int)(m > high_thresh);
- }
- else
- {
- int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
- if (y > tg67x)
- {
- if (m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
- edge_type = 1 + (int)(m > high_thresh);
- }
- else
- {
- int s = (xs ^ ys) < 0 ? -1 : 1;
- if (m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
- edge_type = 1 + (int)(m > high_thresh);
- }
- }
+ int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31); // if a = 1, b = 1, dy ^ dx < 0
+ int dir = a * b + 2 * dir3;
+ int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];
+ int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);
+
+ if (mag0 > prev_mag && mag0 >= next_mag)
+ {
+ value = (mag0 > high_thr) ? 2 : 0;
}
- *(__global int *)(map + mad24(map_step, gidy + 1, mad24(gidx + 1, (int)sizeof(int), + map_offset))) = edge_type;
}
+
+ storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));
}
-#undef CANNY_SHIFT
-#undef TG22
+#elif defined WITHOUT_SOBEL
-#elif defined OP_HYST_LOCAL
+/*
+ stage1_without_sobel:
+ Calc magnitudes
+ Non maxima suppression
+ Double thresholding
+*/
-struct PtrStepSz
-{
- __global uchar * ptr;
- int step, rows, cols;
-};
+#define loadpix(addr) (__global short *)(addr)
+#define storepix(val, addr) *(__global int *)(addr) = (int)(val)
-inline int get(struct PtrStepSz data, int y, int x)
-{
- return *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1)));
-}
+#ifdef L2GRAD
+#define dist(x, y) ((int)(x) * (x) + (int)(y) * (y))
+#else
+#define dist(x, y) (abs(x) + abs(y))
+#endif
-inline void set(struct PtrStepSz data, int y, int x, int value)
-{
- *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1))) = value;
-}
+__constant int prev[4][2] = {
+ { 0, -1 },
+ { -1, -1 },
+ { -1, 0 },
+ { -1, 1 }
+};
-// perform Hysteresis for pixel whose edge type is 1
-//
-// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
-// marked as edge. Each thread will iterate for 16 times to connect local edges.
-// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
-// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel.
-//
-// map raw edge type results calculated from calcMap.
-// stack the potiential edge points found in this kernel call
-// counter the number of potiential edge points
+__constant int next[4][2] = {
+ { 0, 1 },
+ { 1, 1 },
+ { 1, 0 },
+ { 1, -1 }
+};
-__kernel void edgesHysteresisLocal(__global uchar * map_ptr, int map_step, int map_offset,
- __global ushort2 * st, __global unsigned int * counter,
- int rows, int cols)
+__kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int dx_offset,
+ __global const uchar *dyptr, int dy_step, int dy_offset,
+ __global uchar *map, int map_step, int map_offset, int rows, int cols,
+ int low_thr, int high_thr)
{
- struct PtrStepSz map = { map_ptr + map_offset, map_step, rows + 1, cols + 1 };
-
- __local int smem[18][18];
-
- int2 blockIdx = (int2)(get_group_id(0), get_group_id(1));
- int2 blockDim = (int2)(get_local_size(0), get_local_size(1));
- int2 threadIdx = (int2)(get_local_id(0), get_local_id(1));
-
- const int x = blockIdx.x * blockDim.x + threadIdx.x;
- const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
- smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? get(map, y, x) : 0;
- if (threadIdx.y == 0)
- smem[0][threadIdx.x + 1] = x < map.cols ? get(map, y - 1, x) : 0;
- if (threadIdx.y == blockDim.y - 1)
- smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0;
- if (threadIdx.x == 0)
- smem[threadIdx.y + 1][0] = y < map.rows ? get(map, y, x - 1) : 0;
- if (threadIdx.x == blockDim.x - 1)
- smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols && y < map.rows ? get(map, y, x + 1) : 0;
- if (threadIdx.x == 0 && threadIdx.y == 0)
- smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0;
- if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
- smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? get(map, y - 1, x + 1) : 0;
- if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
- smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? get(map, y + 1, x - 1) : 0;
- if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
- smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? get(map, y + 1, x + 1) : 0;
-
- barrier(CLK_LOCAL_MEM_FENCE);
+ int start_x = get_group_id(0) * GRP_SIZEX;
+ int start_y = get_group_id(1) * GRP_SIZEY;
- if (x >= cols || y >= rows)
- return;
+ int lidx = get_local_id(0);
+ int lidy = get_local_id(1);
- int n;
+ __local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
+ __local short2 sigma[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
- #pragma unroll
- for (int k = 0; k < 16; ++k)
+#pragma unroll
+ for (int i = lidx + lidy * GRP_SIZEX; i < (GRP_SIZEX + 2) * (GRP_SIZEY + 2); i += GRP_SIZEX * GRP_SIZEY)
{
- n = 0;
+ int x = clamp(start_x - 1 + i % (GRP_SIZEX + 2), 0, cols - 1);
+ int y = clamp(start_y - 1 + i / (GRP_SIZEX + 2), 0, rows - 1);
- if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
- {
- n += smem[threadIdx.y ][threadIdx.x ] == 2;
- n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
- n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
+ int dx_index = mad24(y, dx_step, mad24(x, cn * (int)sizeof(short), dx_offset));
+ int dy_index = mad24(y, dy_step, mad24(x, cn * (int)sizeof(short), dy_offset));
- n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
- n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+ __global short *dx = loadpix(dxptr + dx_index);
+ __global short *dy = loadpix(dyptr + dy_index);
- n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
- n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
- n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+ int mag0 = dist(dx[0], dy[0]);
+#if cn > 1
+ short cdx = dx[0], cdy = dy[0];
+#pragma unroll
+ for (int j = 1; j < cn; ++j)
+ {
+ int mag1 = dist(dx[j], dy[j]);
+ if (mag1 > mag0)
+ {
+ mag0 = mag1;
+ cdx = dx[j];
+ cdy = dy[j];
+ }
}
-
- if (n > 0)
- smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+ dx[0] = cdx;
+ dy[0] = cdy;
+#endif
+ mag[i] = mag0;
+ sigma[i] = (short2)(dx[0], dy[0]);
}
- const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
- set(map, y, x, e);
- n = 0;
+ barrier(CLK_LOCAL_MEM_FENCE);
- if (e == 2)
- {
- n += smem[threadIdx.y ][threadIdx.x ] == 1;
- n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
- n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
+ int gidx = get_global_id(0);
+ int gidy = get_global_id(1);
- n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
- n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+ if (gidx >= cols || gidy >= rows)
+ return;
- n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
- n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
- n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
- }
+ lidx++;
+ lidy++;
+
+ int mag0 = mag[lidx + lidy * (GRP_SIZEX + 2)];
+ short x = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).x;
+ short y = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).y;
- if (n > 0)
+ int value = 1;
+ if (mag0 > low_thr)
{
- const int ind = atomic_inc(counter);
- st[ind] = (ushort2)(x + 1, y + 1);
+ int a = (y / (float)x) * TG22;
+ int b = (y / (float)x) * TG67;
+
+ a = min((int)abs(a), 1) + 1;
+ b = min((int)abs(b), 1);
+
+ int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31);
+ int dir = a * b + 2 * dir3;
+ int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];
+ int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);
+
+ if (mag0 > prev_mag && mag0 >= next_mag)
+ {
+ value = (mag0 > high_thr) ? 2 : 0;
+ }
}
+
+ storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));
}
-#elif defined OP_HYST_GLOBAL
+#undef TG22
+#undef CANNY_SHIFT
-__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
-__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
+#elif defined STAGE2
+/*
+ stage2:
+ hysteresis (add edges labeled 0 if they are connected with an edge labeled 2)
+*/
+#define loadpix(addr) *(__global int *)(addr)
+#define storepix(val, addr) *(__global int *)(addr) = (int)(val)
+#define l_stack_size 256
+#define p_stack_size 8
-#define stack_size 512
-#define map_index mad24(map_step, pos.y, pos.x * (int)sizeof(int))
+__constant short move_dir[2][8] = {
+ { -1, -1, -1, 0, 0, 1, 1, 1 },
+ { -1, 0, 1, -1, 1, -1, 0, 1 }
+};
-__kernel void edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
- __global ushort2 * st1, __global ushort2 * st2, __global int * counter,
- int rows, int cols, int count)
+__kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offset, int rows, int cols)
{
map += map_offset;
- int lidx = get_local_id(0);
+ int x = get_global_id(0);
+ int y0 = get_global_id(1) * PIX_PER_WI;
- int grp_idx = get_group_id(0);
- int grp_idy = get_group_id(1);
+ int lid = get_local_id(0) + get_local_id(1) * 32;
- __local unsigned int s_counter, s_ind;
- __local ushort2 s_st[stack_size];
+ __local ushort2 l_stack[l_stack_size];
+ __local int l_counter;
- if (lidx == 0)
- s_counter = 0;
+ if (lid == 0)
+ l_counter = 0;
barrier(CLK_LOCAL_MEM_FENCE);
- int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx);
-
- if (ind < count)
+ #pragma unroll
+ for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y)
{
- ushort2 pos = st1[ind];
- if (lidx < 8)
+ int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int)));
+ if (type == 2)
{
- pos.x += c_dx[lidx];
- pos.y += c_dy[lidx];
- if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1)
- {
- *(__global int *)(map + map_index) = 2;
- ind = atomic_inc(&s_counter);
- s_st[ind] = pos;
- }
+ l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);
}
- barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
- while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
- {
- const int subTaskIdx = lidx >> 3;
- const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
+ ushort2 p_stack[p_stack_size];
+ int p_counter = 0;
- if (subTaskIdx < portion)
- pos = s_st[s_counter - 1 - subTaskIdx];
- barrier(CLK_LOCAL_MEM_FENCE);
+ while(l_counter != 0)
+ {
+ int mod = l_counter % 64;
+ int pix_per_thr = l_counter / 64 + (lid < mod) ? 1 : 0;
- if (lidx == 0)
- s_counter -= portion;
- barrier(CLK_LOCAL_MEM_FENCE);
+ #pragma unroll
+ for (int i = 0; i < pix_per_thr; ++i)
+ {
+ ushort2 pos = l_stack[ atomic_dec(&l_counter) - 1 ];
- if (subTaskIdx < portion)
+ #pragma unroll
+ for (int j = 0; j < 8; ++j)
{
- pos.x += c_dx[lidx & 7];
- pos.y += c_dy[lidx & 7];
- if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1)
+ ushort posx = pos.x + move_dir[0][j];
+ ushort posy = pos.y + move_dir[1][j];
+ if (posx < 0 || posy < 0 || posx >= cols || posy >= rows)
+ continue;
+ __global uchar *addr = map + mad24(posy, map_step, posx * (int)sizeof(int));
+ int type = loadpix(addr);
+ if (type == 0)
{
- *(__global int *)(map + map_index) = 2;
- ind = atomic_inc(&s_counter);
- s_st[ind] = pos;
+ p_stack[p_counter++] = (ushort2)(posx, posy);
+ storepix(2, addr);
}
}
- barrier(CLK_LOCAL_MEM_FENCE);
}
+ barrier(CLK_LOCAL_MEM_FENCE);
- if (s_counter > 0)
+ while (p_counter > 0)
{
- if (lidx == 0)
- {
- ind = atomic_add(counter, s_counter);
- s_ind = ind - s_counter;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- ind = s_ind;
- for (int i = lidx; i < (int)s_counter; i += get_local_size(0))
- st2[ind + i] = s_st[i];
+ l_stack[ atomic_inc(&l_counter) ] = p_stack[--p_counter];
}
+ barrier(CLK_LOCAL_MEM_FENCE);
}
}
-#undef map_index
-#undef stack_size
-
-#elif defined OP_EDGES
+#elif defined GET_EDGES
// Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0.
-// map edge type mappings
-// dst edge output
+// map edge type mappings
+// dst edge output
-__kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offset,
- __global uchar * dst, int dst_step, int dst_offset, int rows, int cols)
+__kernel void getEdges(__global const uchar *mapptr, int map_step, int map_offset, int rows, int cols,
+ __global uchar *dst, int dst_step, int dst_offset)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y0 = get_global_id(1) * PIX_PER_WI;
- if (y < rows && x < cols)
+ #pragma unroll
+ for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y)
{
- int map_index = mad24(map_step, y + 1, mad24(x + 1, (int)sizeof(int), map_offset));
- int dst_index = mad24(dst_step, y, x + dst_offset);
+ int map_index = mad24(map_step, y, mad24(x, (int)sizeof(int), map_offset));
+ int dst_index = mad24(dst_step, y, x) + dst_offset;
__global const int * map = (__global const int *)(mapptr + map_index);
-
dst[dst_index] = (uchar)(-(map[0] >> 1));
}
}
-#endif
+#endif
\ No newline at end of file