From: peng xiao Date: Wed, 29 May 2013 06:15:26 +0000 (+0800) Subject: Fix 2.4 ocl Canny. X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1314^2~1283^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d015fa76fab95483ec7cdf0b44796c4caf70c31c;p=platform%2Fupstream%2Fopencv.git Fix 2.4 ocl Canny. This fix is a workaround for current 2.4 branch without introducing an additional oclMat buffer into CannyBuf object. Test case is cleaned up. Volatile keywords in kernels are removed for performance concern. --- diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp index cc7e60e..82bb01b 100644 --- a/modules/ocl/src/canny.cpp +++ b/modules/ocl/src/canny.cpp @@ -87,7 +87,7 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size) filterDY = createDerivFilter_GPU(CV_8U, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE); } } - ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf); + ensureSizeIsEnough(2 * (image_size.height + 2), image_size.width + 2, CV_32FC1, edgeBuf); ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1); ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2); @@ -141,13 +141,16 @@ namespace void CannyCaller(CannyBuf &buf, oclMat &dst, float low_thresh, float high_thresh) { using namespace ::cv::ocl::canny; - calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh); + oclMat magBuf = buf.edgeBuf(Rect(0, 0, buf.edgeBuf.cols, buf.edgeBuf.rows / 2)); + oclMat mapBuf = buf.edgeBuf(Rect(0, buf.edgeBuf.rows / 2, buf.edgeBuf.cols, buf.edgeBuf.rows / 2)); - edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1, buf.counter, dst.rows, dst.cols); + calcMap_gpu(buf.dx, buf.dy, magBuf, mapBuf, dst.rows, dst.cols, low_thresh, high_thresh); - edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1, buf.trackBuf2, buf.counter, dst.rows, dst.cols); + edgesHysteresisLocal_gpu(mapBuf, buf.trackBuf1, buf.counter, dst.rows, dst.cols); - getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols); + edgesHysteresisGlobal_gpu(mapBuf, buf.trackBuf1, buf.trackBuf2, buf.counter, dst.rows, dst.cols); + + getEdges_gpu(mapBuf, dst, dst.rows, dst.cols); } } @@ -172,18 +175,20 @@ void cv::ocl::Canny(const oclMat &src, CannyBuf &buf, oclMat &dst, double low_th buf.create(src.size(), apperture_size); buf.edgeBuf.setTo(Scalar::all(0)); + oclMat magBuf = buf.edgeBuf(Rect(0, 0, buf.edgeBuf.cols, buf.edgeBuf.rows / 2)); + if (apperture_size == 3) { calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols); - calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient); + calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, magBuf, src.rows, src.cols, L2gradient); } else { buf.filterDX->apply(src, buf.dx); buf.filterDY->apply(src, buf.dy); - calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient); + calcMagnitude_gpu(buf.dx, buf.dy, magBuf, src.rows, src.cols, L2gradient); } CannyCaller(buf, dst, static_cast(low_thresh), static_cast(high_thresh)); } @@ -209,7 +214,10 @@ void cv::ocl::Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &d buf.dy = dy; buf.create(dx.size(), -1); buf.edgeBuf.setTo(Scalar::all(0)); - calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient); + + oclMat magBuf = buf.edgeBuf(Rect(0, 0, buf.edgeBuf.cols, buf.edgeBuf.rows / 2)); + + calcMagnitude_gpu(buf.dx, buf.dy, magBuf, dx.rows, dx.cols, L2gradient); CannyCaller(buf, dst, static_cast(low_thresh), static_cast(high_thresh)); } @@ -234,7 +242,7 @@ void canny::calcSobelRowPass_gpu(const oclMat &src, oclMat &dx_buf, oclMat &dy_b size_t globalThreads[3] = {cols, rows, 1}; size_t localThreads[3] = {16, 16, 1}; - openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); } void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat &dx, oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad) @@ -264,12 +272,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat size_t globalThreads[3] = {cols, rows, 1}; size_t localThreads[3] = {16, 16, 1}; - char build_options [15] = ""; - if(L2Grad) - { - strcat(build_options, "-D L2GRAD"); - } - openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options); + const char * build_options = L2Grad ? "-D L2GRAD":""; + openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options); } void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad) { @@ -292,12 +296,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, i size_t globalThreads[3] = {cols, rows, 1}; size_t localThreads[3] = {16, 16, 1}; - char build_options [15] = ""; - if(L2Grad) - { - strcat(build_options, "-D L2GRAD"); - } - openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options); + const char * build_options = L2Grad ? "-D L2GRAD":""; + openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options); } void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh) @@ -328,7 +328,7 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro string kernelName = "calcMap"; size_t localThreads[3] = {16, 16, 1}; - openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); } void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols) @@ -348,7 +348,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in size_t globalThreads[3] = {cols, rows, 1}; size_t localThreads[3] = {16, 16, 1}; - openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); } void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols) @@ -378,7 +378,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi args.push_back( make_pair( sizeof(cl_int), (void *)&map.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset)); - openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE); + openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); std::swap(st1, st2); } @@ -403,5 +403,5 @@ void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols) size_t globalThreads[3] = {cols, rows, 1}; size_t localThreads[3] = {16, 16, 1}; - openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); } diff --git a/modules/ocl/src/opencl/imgproc_canny.cl b/modules/ocl/src/opencl/imgproc_canny.cl index ceaaed1..5402759 100644 --- a/modules/ocl/src/opencl/imgproc_canny.cl +++ b/modules/ocl/src/opencl/imgproc_canny.cl @@ -297,6 +297,9 @@ calcMap map_step /= sizeof(*map); map_offset /= sizeof(*map); + mag += mag_offset; + map += map_offset; + __local float smem[18][18]; int gidx = get_global_id(0); @@ -389,7 +392,7 @@ edgesHysteresisLocal ( __global int * map, __global ushort2 * st, - volatile __global unsigned int * counter, + __global unsigned int * counter, int rows, int cols, int map_step, @@ -399,6 +402,8 @@ edgesHysteresisLocal map_step /= sizeof(*map); map_offset /= sizeof(*map); + map += map_offset; + __local int smem[18][18]; int gidx = get_global_id(0); @@ -416,12 +421,12 @@ edgesHysteresisLocal if(ly < 14) { smem[ly][lx] = - map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step + map_offset]; + map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step]; } if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) { smem[ly + 14][lx] = - map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step + map_offset]; + map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step]; } barrier(CLK_LOCAL_MEM_FENCE); @@ -482,14 +487,17 @@ edgesHysteresisLocal __constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; __constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + #define stack_size 512 __kernel -void edgesHysteresisGlobal +void +__attribute__((reqd_work_group_size(128,1,1))) +edgesHysteresisGlobal ( __global int * map, __global ushort2 * st1, __global ushort2 * st2, - volatile __global int * counter, + __global int * counter, int rows, int cols, int count, @@ -501,6 +509,8 @@ void edgesHysteresisGlobal map_step /= sizeof(*map); map_offset /= sizeof(*map); + map += map_offset; + int gidx = get_global_id(0); int gidy = get_global_id(1); @@ -510,7 +520,7 @@ void edgesHysteresisGlobal int grp_idx = get_group_id(0); int grp_idy = get_group_id(1); - volatile __local unsigned int s_counter; + __local unsigned int s_counter; __local unsigned int s_ind; __local ushort2 s_st[stack_size]; @@ -564,9 +574,9 @@ void edgesHysteresisGlobal pos.x += c_dx[lidx & 7]; pos.y += c_dy[lidx & 7]; - if (map[pos.x + map_offset + pos.y * map_step] == 1) + if (map[pos.x + pos.y * map_step] == 1) { - map[pos.x + map_offset + pos.y * map_step] = 2; + map[pos.x + pos.y * map_step] = 2; ind = atomic_inc(&s_counter); @@ -621,6 +631,6 @@ void getEdges if(gidy < rows && gidx < cols) { - dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] >> 1)); + dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1)); } } diff --git a/modules/ocl/test/test_canny.cpp b/modules/ocl/test/test_canny.cpp index cac6b66..10032e8 100644 --- a/modules/ocl/test/test_canny.cpp +++ b/modules/ocl/test/test_canny.cpp @@ -45,7 +45,6 @@ #include "precomp.hpp" #ifdef HAVE_OPENCL -#define SHOW_RESULT 0 //////////////////////////////////////////////////////// // Canny @@ -59,13 +58,10 @@ PARAM_TEST_CASE(Canny, AppertureSize, L2gradient) bool useL2gradient; cv::Mat edges_gold; - //std::vector oclinfo; virtual void SetUp() { apperture_size = GET_PARAM(0); useL2gradient = GET_PARAM(1); - //int devnums = getDevice(oclinfo); - //CV_Assert(devnums > 0); } }; @@ -77,32 +73,18 @@ TEST_P(Canny, Accuracy) double low_thresh = 50.0; double high_thresh = 100.0; - cv::resize(img, img, cv::Size(512, 384)); cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img); cv::ocl::oclMat edges; cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient); - char filename [100]; - sprintf(filename, "G:/Valve_edges_a%d_L2Grad%d.jpg", apperture_size, (int)useL2gradient); - cv::Mat edges_gold; cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient); -#if SHOW_RESULT - cv::Mat edges_x2, ocl_edges(edges); - edges_x2.create(edges.rows, edges.cols * 2, edges.type()); - edges_x2.setTo(0); - cv::add(edges_gold, cv::Mat(edges_x2, cv::Rect(0, 0, edges_gold.cols, edges_gold.rows)), cv::Mat(edges_x2, cv::Rect(0, 0, edges_gold.cols, edges_gold.rows))); - cv::add(ocl_edges, cv::Mat(edges_x2, cv::Rect(edges_gold.cols, 0, edges_gold.cols, edges_gold.rows)), cv::Mat(edges_x2, cv::Rect(edges_gold.cols, 0, edges_gold.cols, edges_gold.rows))); - cv::namedWindow("Canny result (left: cpu, right: ocl)"); - cv::imshow("Canny result (left: cpu, right: ocl)", edges_x2); - cv::waitKey(); -#endif //OUTPUT_RESULT EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2); } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine( +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Canny, testing::Combine( testing::Values(AppertureSize(3), AppertureSize(5)), testing::Values(L2gradient(false), L2gradient(true)))); #endif \ No newline at end of file