Optimizations to OpenCL bilateral filter.
authorAaron Kunze <aaron.kunze@intel.com>
Mon, 24 Mar 2014 20:35:56 +0000 (13:35 -0700)
committerunknown <girish.ravunnikutty@intel.com>
Thu, 10 Apr 2014 19:15:13 +0000 (12:15 -0700)
modules/imgproc/src/opencl/bilateral.cl
modules/imgproc/src/smooth.cpp
modules/imgproc/test/ocl/test_filters.cpp

index 013be80..ee0f0c7 100644 (file)
 #error "cn should be <= 4"
 #endif
 
+//Read pixels as integers
 __kernel void bilateral(__global const uchar * src, int src_step, int src_offset,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                        __constant float * color_weight, __constant float * space_weight, __constant int * space_ofs)
+                        __constant float * space_weight, __constant int * space_ofs)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -74,12 +75,69 @@ __kernel void bilateral(__global const uchar * src, int src_step, int src_offset
         for (int k = 0; k < maxk; k++ )
         {
             int_t val = convert_int_t(loadpix(src + src_index + space_ofs[k]));
-            uint_t diff = abs(val - val0);
-            float w = space_weight[k] * color_weight[SUM(diff)];
-            sum += convert_float_t(val) * (float_t)(w);
+            uint diff = (uint)SUM(abs(val - val0));
+            float w = space_weight[k] * native_exp((float)(diff * diff * as_float(gauss_color_coeff)));
+                       sum += convert_float_t(val) * (float_t)(w);
             wsum += w;
         }
 
         storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index);
     }
 }
+
+//Read pixels as floats
+__kernel void bilateral_float(__global const uchar * src, int src_step, int src_offset,
+                        __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                        __constant float * space_weight, __constant int * space_ofs)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < dst_rows && x < dst_cols)
+    {
+        int src_index = mad24(y + radius, src_step, mad24(x + radius, TSIZE, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
+
+        float_t sum = (float_t)(0.0f);
+        float wsum = 0.0f;
+        float_t val0 = convert_float_t(loadpix(src + src_index));
+
+        for (int k = 0; k < maxk; k++ )
+        {
+            float_t val = convert_float_t(loadpix(src + src_index + space_ofs[k]));
+            float i = SUM(fabs(val - val0));
+            float w = space_weight[k] * native_exp(i * i * as_float(gauss_color_coeff));
+                       sum += val * w;
+            wsum += w;
+        }
+        storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index);
+    }
+}
+
+//for single channgel x4 sized images.
+__kernel void bilateral_float4(__global const uchar * src, int src_step, int src_offset,
+                               __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                               __constant float * space_weight, __constant int * space_ofs)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if (y < dst_rows && x < dst_cols / 4 )
+    {
+        int src_index = ((y + radius) * src_step) + x * 4  + (radius + src_offset);
+        int dst_index = (y  * dst_step) +  x * 4 + dst_offset ;
+        float4 sum = 0.f, wsum = 0.f;
+        float4 val0 = convert_float4(vload4(0, src + src_index));
+        #pragma unroll
+        for (int k = 0; k < maxk; k++ )
+        {
+            float4 val = convert_float4(vload4(0, src + src_index + space_ofs[k]));
+            float spacew = space_weight[k];
+            float4 w = spacew * native_exp((val - val0) * (val - val0) * as_float(gauss_color_coeff));
+            sum += val * w;
+            wsum += w;
+        }
+        sum = sum / wsum + .5f;
+        vstore4(convert_uchar4_rtz(sum), 0, dst + dst_index);
+    }
+}
+
index 4318cd1..4f66d48 100644 (file)
@@ -2341,56 +2341,65 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
         return false;
 
     copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
-
-    std::vector<float> _color_weight(cn * 256);
+           
     std::vector<float> _space_weight(d * d);
     std::vector<int> _space_ofs(d * d);
-    float * const color_weight = &_color_weight[0];
+    
     float * const space_weight = &_space_weight[0];
     int * const space_ofs = &_space_ofs[0];
 
-    // initialize color-related bilateral filter coefficients
-    for( i = 0; i < 256 * cn; i++ )
-        color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
-
-    // initialize space-related bilateral filter coefficients
+           // initialize space-related bilateral filter coefficients
     for( i = -radius, maxk = 0; i <= radius; i++ )
         for( j = -radius; j <= radius; j++ )
         {
             double r = std::sqrt((double)i * i + (double)j * j);
             if ( r > radius )
-                continue;
-            space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
-            space_ofs[maxk++] = (int)(i * temp.step + j * cn);
-        }
-
-    char cvt[3][40];
-    String cnstr = cn > 1 ? format("%d", cn) : "";
-    ocl::Kernel k("bilateral", ocl::imgproc::bilateral_oclsrc,
-                  format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
-                         " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s",
-                         radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
-                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
-                         ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
+                               continue;
+                       space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
+                       space_ofs[maxk++] = (int)(i * temp.step + j * cn);
+               }
+
+               char cvt[3][40];
+               String cnstr = cn > 1 ? format("%d", cn) : "";
+               String kernelName("bilateral");
+               size_t sizeDiv = 1;
+               if ((ocl::Device::getDefault().isIntel()) &&
+                       (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
+               {
+                       //Intel GPU
+                       if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images.
+                       {
+                               kernelName = "bilateral_float4";
+                               sizeDiv = 4;
+                       }
+                       else
+                       {
+                               kernelName = "bilateral_float";
+                       }
+               }
+               ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
+                       format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
+                       " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=%f",
+                       radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
+                       ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
+                       ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
                          ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
-                         ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2])));
+                         ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
     if (k.empty())
         return false;
 
-    Mat mcolor_weight(1, cn * 256, CV_32FC1, color_weight);
     Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
     Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
     UMat ucolor_weight, uspace_weight, uspace_ofs;
-    mcolor_weight.copyTo(ucolor_weight);
+
     mspace_weight.copyTo(uspace_weight);
     mspace_ofs.copyTo(uspace_ofs);
 
     k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
-           ocl::KernelArg::PtrReadOnly(ucolor_weight),
            ocl::KernelArg::PtrReadOnly(uspace_weight),
            ocl::KernelArg::PtrReadOnly(uspace_ofs));
 
-    size_t globalsize[2] = { dst.cols, dst.rows };
+    size_t globalsize[2] = { dst.cols / sizeDiv, dst.rows };
     return k.run(2, globalsize, NULL, false);
 }
 
index d2f5085..539e11a 100644 (file)
@@ -62,12 +62,14 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
                 Size, // dx, dy
                 BorderType, // border type
                 double, // optional parameter
-                bool) // roi or not
+                bool, // roi or not
+                int)  //width multiplier
 {
     int type, borderType, ksize;
     Size size;
     double param;
     bool useRoi;
+    int widthMultiple;
 
     TEST_DECLARE_INPUT_PARAMETER(src);
     TEST_DECLARE_OUTPUT_PARAMETER(dst);
@@ -80,6 +82,7 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
         borderType = GET_PARAM(3);
         param = GET_PARAM(4);
         useRoi = GET_PARAM(5);
+        widthMultiple = GET_PARAM(6);
     }
 
     void random_roi(int minSize = 1)
@@ -88,6 +91,9 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
             minSize = ksize;
 
         Size roiSize = randomSize(minSize, MAX_VALUE);
+        roiSize.width &= ~((widthMultiple * 2) - 1);
+        roiSize.width += widthMultiple;
+
         Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256);
 
@@ -312,7 +318,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine(
                             Values(Size(0, 0)), // not used
                             FILTER_BORDER_SET_NO_ISOLATED,
                             Values(0.0), // not used
-                            Bool()));
+                            Bool(),
+                            Values(1, 4)));
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine(
                             FILTER_TYPES,
@@ -320,7 +327,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine(
                             Values(Size(0, 0)), // not used
                             FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
                             Values(1.0, 0.2, 3.0), // kernel scale
-                            Bool()));
+                            Bool(),
+                            Values(1))); // not used
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine(
                             FILTER_TYPES,
@@ -328,7 +336,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine(
                             Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)), // dx, dy
                             FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
                             Values(0.0), // not used
-                            Bool()));
+                            Bool(),
+                            Values(1))); // not used
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine(
                             FILTER_TYPES,
@@ -336,7 +345,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine(
                             Values(Size(0, 1), Size(1, 0)), // dx, dy
                             FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
                             Values(1.0, 0.2), // kernel scale
-                            Bool()));
+                            Bool(),
+                            Values(1))); // not used
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
                             FILTER_TYPES,
@@ -344,7 +354,8 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
                             Values(Size(0, 0)), // not used
                             FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
                             Values(0.0), // not used
-                            Bool()));
+                            Bool(),
+                            Values(1))); // not used
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
                             Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
@@ -352,7 +363,9 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
                             Values(Size(0,0)),//not used
                             Values((BorderType)BORDER_CONSTANT),//not used
                             Values(1.0, 2.0, 3.0),
-                            Bool()));
+                            Bool(),
+                            Values(1))); // not used
+
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(
                             Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
@@ -360,7 +373,9 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(
                             Values(Size(0,0)),//not used
                             Values((BorderType)BORDER_CONSTANT),//not used
                             Values(1.0, 2.0, 3.0),
-                            Bool()));
+                            Bool(),
+                            Values(1))); // not used
+
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine(
                             Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
@@ -368,7 +383,10 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine(
                             Values(Size(0, 0), Size(0, 1), Size(0, 2), Size(0, 3), Size(0, 4), Size(0, 5), Size(0, 6)), // used as generator of operations
                             Values((BorderType)BORDER_CONSTANT),// not used
                             Values(1.0, 2.0, 3.0),
-                            Bool()));
+                            Bool(),
+                            Values(1))); // not used
+
+
 
 } } // namespace cvtest::ocl