From 1b6639aa3d73801179f9694fdec30580cf4cb2ad Mon Sep 17 00:00:00 2001 From: yao Date: Thu, 6 Sep 2012 14:39:20 +0800 Subject: [PATCH] A little optimization on ocl/pyrdown, ocl/canny --- modules/ocl/src/canny.cpp | 14 ++-- modules/ocl/src/kernels/pyr_down.cl | 135 ++++++++++-------------------------- modules/ocl/test/test_canny.cpp | 2 +- 3 files changed, 45 insertions(+), 106 deletions(-) diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp index 5881a4c..4fd5a3f 100644 --- a/modules/ocl/src/canny.cpp +++ b/modules/ocl/src/canny.cpp @@ -66,7 +66,7 @@ namespace cv } } -cv::ocl::CannyBuf::CannyBuf(const oclMat& dx_, const oclMat& dy_) : dx(dx_), dy(dy_) +cv::ocl::CannyBuf::CannyBuf(const oclMat& dx_, const oclMat& dy_) : dx(dx_), dy(dy_), counter(NULL) { CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size()); @@ -102,6 +102,10 @@ void cv::ocl::CannyBuf::create(const Size& image_size, int apperture_size) float counter_f [1] = { 0 }; int err = 0; + if(counter) + { + openCLFree(counter); + } counter = clCreateBuffer( Context::getContext()->impl->clContext, CL_MEM_COPY_HOST_PTR, sizeof(float), counter_f, &err ); openCLSafeCall(err); } @@ -322,15 +326,11 @@ void canny::calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int ro args.push_back( make_pair( sizeof(cl_int), (void *)&map.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset)); -#if CALCMAP_FIXED + size_t globalThreads[3] = {cols, rows, 1}; string kernelName = "calcMap"; size_t localThreads[3] = {16, 16, 1}; -#else - size_t globalThreads[3] = {cols, rows, 1}; - string kernelName = "calcMap_2"; - size_t localThreads[3] = {256, 1, 1}; -#endif + openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); } diff --git a/modules/ocl/src/kernels/pyr_down.cl b/modules/ocl/src/kernels/pyr_down.cl index 38b4ec7..19d631e 100644 --- a/modules/ocl/src/kernels/pyr_down.cl +++ b/modules/ocl/src/kernels/pyr_down.cl @@ -16,6 +16,7 @@ // // @Authors // Dachuan Zhao, dachuan@multicorewareinc.com +// Yao Wang, bitwangyaoyao@gmail.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -118,81 +119,19 @@ uchar4 round_uchar4_float4(float4 v) return round_uchar4_int4(iv); } - - - -int idx_row_low(int y, int last_row) -{ - if(y < 0) - { - y = -y; - } - return y % (last_row + 1); -} - -int idx_row_high(int y, int last_row) -{ - int i; - int j; - if(last_row - y < 0) - { - i = (y - last_row); - } - else - { - i = (last_row - y); - } - if(last_row - i < 0) - { - j = i - last_row; - } - else - { - j = last_row - i; - } - return j % (last_row + 1); -} +#define IDX_ROW_HIGH(y,last_row) (abs_diff((int)abs_diff(last_row,y),last_row) % ((last_row)+1)) +#define IDX_ROW_LOW(y,last_row) (abs(y) % ((last_row) + 1)) +#define IDX_COL_HIGH(x,last_col) abs_diff((int)abs_diff(x,last_col),last_col) +#define IDX_COL_LOW(x,last_col) (abs(x) % ((last_col) + 1)) int idx_row(int y, int last_row) { - return idx_row_low(idx_row_high(y, last_row), last_row); -} - -int idx_col_low(int x, int last_col) -{ - if(x < 0) - { - x = -x; - } - return x % (last_col + 1); -} - -int idx_col_high(int x, int last_col) -{ - int i; - int j; - if(last_col - x < 0) - { - i = (x - last_col); - } - else - { - i = (last_col - x); - } - if(last_col - i < 0) - { - j = i - last_col; - } - else - { - j = last_col - i; - } - return j % (last_col + 1); + return IDX_ROW_LOW(IDX_ROW_HIGH(y,last_row),last_row); } int idx_col(int x, int last_col) { - return idx_col_low(idx_col_high(x, last_col), last_col); + return IDX_COL_LOW(IDX_COL_HIGH(x,last_col),last_col); } __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstOffset, int dstCols) @@ -210,11 +149,11 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset sum = 0; - sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]); - sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]); - sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(x, last_col)]); - sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]); - sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]); + sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]; + sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]; + sum = sum + 0.375f * ((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(x, last_col)]; + sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]; + sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]; smem[2 + get_local_id(0)] = sum; @@ -224,11 +163,11 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset sum = 0; - sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]); - sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]); - sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(left_x, last_col)]); - sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]); - sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]); + sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]; + sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]; + sum = sum + 0.375f * ((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(left_x, last_col)]; + sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]; + sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]; smem[get_local_id(0)] = sum; } @@ -239,11 +178,11 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset sum = 0; - sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]); - sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]); - sum = sum + 0.375f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(right_x, last_col)]); - sum = sum + 0.25f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]); - sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]); + sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]; + sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]; + sum = sum + 0.375f * ((__global uchar*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[idx_col(right_x, last_col)]; + sum = sum + 0.25f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]; + sum = sum + 0.0625f * ((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]; smem[4 + get_local_id(0)] = sum; } @@ -288,11 +227,11 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffse sum = 0; - sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)])); - sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)])); - sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(x, last_col)])); - sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)])); - sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)])); + sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)]); + sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)]); + sum = sum + co1 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(x, last_col)]); + sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)]); + sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)]); smem[2 + get_local_id(0)] = sum; @@ -302,11 +241,11 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffse sum = 0; - sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)])); - sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)])); - sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(left_x, last_col)])); - sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)])); - sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)])); + sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]); + sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]); + sum = sum + co1 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(left_x, last_col)]); + sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]); + sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]); smem[get_local_id(0)] = sum; } @@ -317,11 +256,11 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffse sum = 0; - sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)])); - sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)])); - sum = sum + co1 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(right_x, last_col)])); - sum = sum + co2 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)])); - sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)])); + sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]); + sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]); + sum = sum + co1 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[idx_col(right_x, last_col)]); + sum = sum + co2 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]); + sum = sum + co3 * convert_float4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]); smem[4 + get_local_id(0)] = sum; } diff --git a/modules/ocl/test/test_canny.cpp b/modules/ocl/test/test_canny.cpp index e6fb885..e728c99 100644 --- a/modules/ocl/test/test_canny.cpp +++ b/modules/ocl/test/test_canny.cpp @@ -103,6 +103,6 @@ TEST_P(Canny, Accuracy) EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2); } -INSTANTIATE_TEST_CASE_P(ocl_ImgProc, Canny, testing::Combine( +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine( testing::Values(AppertureSize(3), AppertureSize(5)), testing::Values(L2gradient(false), L2gradient(true)))); -- 2.7.4