From: Aaron Kunze Date: Mon, 24 Mar 2014 20:35:56 +0000 (-0700) Subject: Optimizations to OpenCL bilateral filter. X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~3295^2~5 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b59c517f98fd835297013d783c62da69aea2cffa;p=platform%2Fupstream%2Fopencv.git Optimizations to OpenCL bilateral filter. --- diff --git a/modules/imgproc/src/opencl/bilateral.cl b/modules/imgproc/src/opencl/bilateral.cl index 013be80..ee0f0c7 100644 --- a/modules/imgproc/src/opencl/bilateral.cl +++ b/modules/imgproc/src/opencl/bilateral.cl @@ -54,9 +54,10 @@ #error "cn should be <= 4" #endif +//Read pixels as integers __kernel void bilateral(__global const uchar * src, int src_step, int src_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant float * color_weight, __constant float * space_weight, __constant int * space_ofs) + __constant float * space_weight, __constant int * space_ofs) { int x = get_global_id(0); int y = get_global_id(1); @@ -74,12 +75,69 @@ __kernel void bilateral(__global const uchar * src, int src_step, int src_offset for (int k = 0; k < maxk; k++ ) { int_t val = convert_int_t(loadpix(src + src_index + space_ofs[k])); - uint_t diff = abs(val - val0); - float w = space_weight[k] * color_weight[SUM(diff)]; - sum += convert_float_t(val) * (float_t)(w); + uint diff = (uint)SUM(abs(val - val0)); + float w = space_weight[k] * native_exp((float)(diff * diff * as_float(gauss_color_coeff))); + sum += convert_float_t(val) * (float_t)(w); wsum += w; } storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index); } } + +//Read pixels as floats +__kernel void bilateral_float(__global const uchar * src, int src_step, int src_offset, + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __constant float * space_weight, __constant int * space_ofs) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (y < dst_rows && x < dst_cols) + { + int src_index = mad24(y + radius, src_step, mad24(x + radius, TSIZE, src_offset)); + int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); + + float_t sum = (float_t)(0.0f); + float wsum = 0.0f; + float_t val0 = convert_float_t(loadpix(src + src_index)); + + for (int k = 0; k < maxk; k++ ) + { + float_t val = convert_float_t(loadpix(src + src_index + space_ofs[k])); + float i = SUM(fabs(val - val0)); + float w = space_weight[k] * native_exp(i * i * as_float(gauss_color_coeff)); + sum += val * w; + wsum += w; + } + storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index); + } +} + +//for single channgel x4 sized images. +__kernel void bilateral_float4(__global const uchar * src, int src_step, int src_offset, + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __constant float * space_weight, __constant int * space_ofs) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if (y < dst_rows && x < dst_cols / 4 ) + { + int src_index = ((y + radius) * src_step) + x * 4 + (radius + src_offset); + int dst_index = (y * dst_step) + x * 4 + dst_offset ; + float4 sum = 0.f, wsum = 0.f; + float4 val0 = convert_float4(vload4(0, src + src_index)); + #pragma unroll + for (int k = 0; k < maxk; k++ ) + { + float4 val = convert_float4(vload4(0, src + src_index + space_ofs[k])); + float spacew = space_weight[k]; + float4 w = spacew * native_exp((val - val0) * (val - val0) * as_float(gauss_color_coeff)); + sum += val * w; + wsum += w; + } + sum = sum / wsum + .5f; + vstore4(convert_uchar4_rtz(sum), 0, dst + dst_index); + } +} + diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 4318cd1..4f66d48 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -2341,56 +2341,65 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, return false; copyMakeBorder(src, temp, radius, radius, radius, radius, borderType); - - std::vector _color_weight(cn * 256); + std::vector _space_weight(d * d); std::vector _space_ofs(d * d); - float * const color_weight = &_color_weight[0]; + float * const space_weight = &_space_weight[0]; int * const space_ofs = &_space_ofs[0]; - // initialize color-related bilateral filter coefficients - for( i = 0; i < 256 * cn; i++ ) - color_weight[i] = (float)std::exp(i * i * gauss_color_coeff); - - // initialize space-related bilateral filter coefficients + // initialize space-related bilateral filter coefficients for( i = -radius, maxk = 0; i <= radius; i++ ) for( j = -radius; j <= radius; j++ ) { double r = std::sqrt((double)i * i + (double)j * j); if ( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); - space_ofs[maxk++] = (int)(i * temp.step + j * cn); - } - - char cvt[3][40]; - String cnstr = cn > 1 ? format("%d", cn) : ""; - ocl::Kernel k("bilateral", ocl::imgproc::bilateral_oclsrc, - format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" - " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s", - radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), - ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), - ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), + continue; + space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); + space_ofs[maxk++] = (int)(i * temp.step + j * cn); + } + + char cvt[3][40]; + String cnstr = cn > 1 ? format("%d", cn) : ""; + String kernelName("bilateral"); + size_t sizeDiv = 1; + if ((ocl::Device::getDefault().isIntel()) && + (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) + { + //Intel GPU + if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images. + { + kernelName = "bilateral_float4"; + sizeDiv = 4; + } + else + { + kernelName = "bilateral_float"; + } + } + ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, + format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" + " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=%f", + radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), + ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), + ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]), - ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]))); + ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff)); if (k.empty()) return false; - Mat mcolor_weight(1, cn * 256, CV_32FC1, color_weight); Mat mspace_weight(1, d * d, CV_32FC1, space_weight); Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); UMat ucolor_weight, uspace_weight, uspace_ofs; - mcolor_weight.copyTo(ucolor_weight); + mspace_weight.copyTo(uspace_weight); mspace_ofs.copyTo(uspace_ofs); k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(ucolor_weight), ocl::KernelArg::PtrReadOnly(uspace_weight), ocl::KernelArg::PtrReadOnly(uspace_ofs)); - size_t globalsize[2] = { dst.cols, dst.rows }; + size_t globalsize[2] = { dst.cols / sizeDiv, dst.rows }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index d2f5085..539e11a 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -62,12 +62,14 @@ PARAM_TEST_CASE(FilterTestBase, MatType, Size, // dx, dy BorderType, // border type double, // optional parameter - bool) // roi or not + bool, // roi or not + int) //width multiplier { int type, borderType, ksize; Size size; double param; bool useRoi; + int widthMultiple; TEST_DECLARE_INPUT_PARAMETER(src); TEST_DECLARE_OUTPUT_PARAMETER(dst); @@ -80,6 +82,7 @@ PARAM_TEST_CASE(FilterTestBase, MatType, borderType = GET_PARAM(3); param = GET_PARAM(4); useRoi = GET_PARAM(5); + widthMultiple = GET_PARAM(6); } void random_roi(int minSize = 1) @@ -88,6 +91,9 @@ PARAM_TEST_CASE(FilterTestBase, MatType, minSize = ksize; Size roiSize = randomSize(minSize, MAX_VALUE); + roiSize.width &= ~((widthMultiple * 2) - 1); + roiSize.width += widthMultiple; + Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256); @@ -312,7 +318,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine( Values(Size(0, 0)), // not used FILTER_BORDER_SET_NO_ISOLATED, Values(0.0), // not used - Bool())); + Bool(), + Values(1, 4))); OCL_INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine( FILTER_TYPES, @@ -320,7 +327,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine( Values(Size(0, 0)), // not used FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(1.0, 0.2, 3.0), // kernel scale - Bool())); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine( FILTER_TYPES, @@ -328,7 +336,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine( Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)), // dx, dy FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(0.0), // not used - Bool())); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine( FILTER_TYPES, @@ -336,7 +345,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine( Values(Size(0, 1), Size(1, 0)), // dx, dy FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(1.0, 0.2), // kernel scale - Bool())); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine( FILTER_TYPES, @@ -344,7 +354,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine( Values(Size(0, 0)), // not used FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, Values(0.0), // not used - Bool())); + Bool(), + Values(1))); // not used OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4), @@ -352,7 +363,9 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine( Values(Size(0,0)),//not used Values((BorderType)BORDER_CONSTANT),//not used Values(1.0, 2.0, 3.0), - Bool())); + Bool(), + Values(1))); // not used + OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4), @@ -360,7 +373,9 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine( Values(Size(0,0)),//not used Values((BorderType)BORDER_CONSTANT),//not used Values(1.0, 2.0, 3.0), - Bool())); + Bool(), + Values(1))); // not used + OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4), @@ -368,7 +383,10 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine( Values(Size(0, 0), Size(0, 1), Size(0, 2), Size(0, 3), Size(0, 4), Size(0, 5), Size(0, 6)), // used as generator of operations Values((BorderType)BORDER_CONSTANT),// not used Values(1.0, 2.0, 3.0), - Bool())); + Bool(), + Values(1))); // not used + + } } // namespace cvtest::ocl