added area fast mode to ocl::resize

author Ilya Lavrenov <ilya.lavrenov@itseez.com>

Mon, 18 Nov 2013 14:46:59 +0000 (18:46 +0400)

committer Ilya Lavrenov <ilya.lavrenov@itseez.com>

Tue, 19 Nov 2013 09:43:50 +0000 (13:43 +0400)
author Ilya Lavrenov <ilya.lavrenov@itseez.com>
Mon, 18 Nov 2013 14:46:59 +0000 (18:46 +0400)
committer Ilya Lavrenov <ilya.lavrenov@itseez.com>
Tue, 19 Nov 2013 09:43:50 +0000 (13:43 +0400)
diff --git a/modules/ocl/perf/perf_imgwarp.cpp b/modules/ocl/perf/perf_imgwarp.cpp

index 62f334549336571a9fb9df5b2424ba7c3a92fa22..e768d6621908a90d6701869b0515761872bc41ad 100644 (file)
--- a/modules/ocl/perf/perf_imgwarp.cpp
+++ b/modules/ocl/perf/perf_imgwarp.cpp
@@ -191,7 +191,7 @@ typedef TestBaseWithParam<resizeAreaParams> resizeAreaFixture;
  PERF_TEST_P(resizeAreaFixture, resize,
              ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
                                 OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                               ::testing::Values(0.6, 0.3)))
+                               ::testing::Values(0.3, 0.5, 0.6)))
  {
      const resizeAreaParams params = GetParam();
      const Size srcSize = get<0>(params);
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp

index d06adb0985de909f29cff41b62c4cd77816395f2..e8f42edea6fdf5fad73080dbc92187df4fe6d8c6 100644 (file)
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -318,28 +318,46 @@ namespace cv
              ofs_tab[dx] = k;
          }
  
+        static void computeResizeAreaFastTabs(int * dmap_tab, int * smap_tab, int scale, int dcols, int scol)
+        {
+            for (int i = 0; i < dcols; ++i)
+                dmap_tab[i] = scale * i;
+
+            for (int i = 0, size = dcols * scale; i < size; ++i)
+                smap_tab[i] = std::min(scol - 1, i);
+        }
+
          static void resize_gpu( const oclMat &src, oclMat &dst, double ifx, double ify, int interpolation)
          {
              float ifxf = (float)ifx, ifyf = (float)ify;
              int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
              int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-            int ocn = interpolation == INTER_LINEAR ? dst.oclchannels() : -1;
-            int depth = interpolation == INTER_LINEAR ? dst.depth() : -1;
+            int ocn = dst.oclchannels(), depth = dst.depth();
  
              const char * const interMap[] = { "NN", "LN", "CUBIC", "AREA", "LAN4" };
              std::string kernelName = std::string("resize") + interMap[interpolation];
  
              const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
              const char * const channelMap[] = { "" , "", "2", "4", "4" };
-            std::string buildOption = format("-D %s -D T=%s%s", interMap[interpolation], typeMap[dst.depth()], channelMap[dst.oclchannels()]);
+            std::string buildOption = format("-D %s -D T=%s%s", interMap[interpolation], typeMap[depth], channelMap[ocn]);
  
              int wdepth = std::max(src.depth(), CV_32F);
+
+            // check if fx, fy is integer and then we have inter area fast mode
+            int iscale_x = saturate_cast<int>(ifx);
+            int iscale_y = saturate_cast<int>(ify);
+
+            bool is_area_fast = std::abs(ifx - iscale_x) < DBL_EPSILON &&
+                std::abs(ify - iscale_y) < DBL_EPSILON;
+            if (is_area_fast)
+                wdepth = std::max(src.depth(), CV_32S);
+
              if (interpolation != INTER_NEAREST)
              {
                  buildOption += format(" -D WT=%s -D WTV=%s%s -D convertToWTV=convert_%s%s -D convertToT=convert_%s%s%s",
-                                      typeMap[wdepth], typeMap[wdepth], channelMap[dst.oclchannels()],
-                                      typeMap[wdepth], channelMap[dst.oclchannels()],
-                                      typeMap[src.depth()], channelMap[dst.oclchannels()], src.depth() <= CV_32S ? "_sat_rte" : "");
+                                      typeMap[wdepth], typeMap[wdepth], channelMap[ocn],
+                                      typeMap[wdepth], channelMap[ocn],
+                                      typeMap[src.depth()], channelMap[ocn], src.depth() <= CV_32S ? "_sat_rte" : "");
              }
  
              size_t blkSizeX = 16, blkSizeY = 16;
@@ -352,26 +370,48 @@ namespace cv
              else
                  glbSizeX = dst.cols;
  
-            static oclMat alphaOcl, mapOcl, tabofsOcl;
+            oclMat alphaOcl, mapOcl, tabofsOcl;
              if (interpolation == INTER_AREA)
              {
-                Size ssize = src.size(), dsize = dst.size();
-                int xytab_size = (ssize.width + ssize.height) << 1;
-                int tabofs_size = dsize.height + dsize.width + 2;
-
-                AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
-                AutoBuffer<float> _xyalpha_tab(xytab_size);
-                int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
-                float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
-                int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
-
-                computeResizeAreaTabs(ssize.width, dsize.width, ifx, xmap_tab, xalpha_tab, xofs_tab);
-                computeResizeAreaTabs(ssize.height, dsize.height, ify, ymap_tab, yalpha_tab, yofs_tab);
-
-                // loading precomputed arrays to GPU
-                alphaOcl = oclMat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab);
-                mapOcl = oclMat(1, xytab_size, CV_32SC1, (void *)_xymap_tab);
-                tabofsOcl = oclMat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab);
+                if (is_area_fast)
+                {
+                    kernelName += "_FAST";
+                    int wdepth2 = std::max(CV_32F, src.depth());
+                    buildOption += format(" -D WT2V=%s%s -D convertToWT2V=convert_%s%s -D AREA_FAST -D XSCALE=%d -D YSCALE=%d -D SCALE=%f",
+                                          typeMap[wdepth2], channelMap[ocn], typeMap[wdepth2], channelMap[ocn],
+                                          iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
+
+                    int smap_tab_size = dst.cols * iscale_x + dst.rows * iscale_y;
+                    AutoBuffer<int> dmap_tab(dst.cols + dst.rows), smap_tab(smap_tab_size);
+                    int * dxmap_tab = dmap_tab, * dymap_tab = dxmap_tab + dst.cols;
+                    int * sxmap_tab = smap_tab, * symap_tab = smap_tab + dst.cols * iscale_y;
+
+                    computeResizeAreaFastTabs(dxmap_tab, sxmap_tab, iscale_x, dst.cols, src.cols);
+                    computeResizeAreaFastTabs(dymap_tab, symap_tab, iscale_y, dst.rows, src.rows);
+
+                    tabofsOcl = oclMat(1, dst.cols + dst.rows, CV_32SC1, (void *)dmap_tab);
+                    mapOcl = oclMat(1, smap_tab_size, CV_32SC1, (void *)smap_tab);
+                }
+                else
+                {
+                    Size ssize = src.size(), dsize = dst.size();
+                    int xytab_size = (ssize.width + ssize.height) << 1;
+                    int tabofs_size = dsize.height + dsize.width + 2;
+
+                    AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
+                    AutoBuffer<float> _xyalpha_tab(xytab_size);
+                    int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
+                    float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
+                    int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
+
+                    computeResizeAreaTabs(ssize.width, dsize.width, ifx, xmap_tab, xalpha_tab, xofs_tab);
+                    computeResizeAreaTabs(ssize.height, dsize.height, ify, ymap_tab, yalpha_tab, yofs_tab);
+
+                    // loading precomputed arrays to GPU
+                    alphaOcl = oclMat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab);
+                    mapOcl = oclMat(1, xytab_size, CV_32SC1, (void *)_xymap_tab);
+                    tabofsOcl = oclMat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab);
+                }
              }
  
              size_t globalThreads[3] = { glbSizeX, dst.rows, 1 };
@@ -400,12 +440,18 @@ namespace cv
                  args.push_back( make_pair(sizeof(cl_float), (void *)&ifyf));
              }
  
-            if (interpolation == INTER_AREA)
-            {
+            // precomputed tabs
+            if (!tabofsOcl.empty())
                  args.push_back( make_pair(sizeof(cl_mem), (void *)&tabofsOcl.data));
+
+            if (!mapOcl.empty())
                  args.push_back( make_pair(sizeof(cl_mem), (void *)&mapOcl.data));
+
+            if (!alphaOcl.empty())
                  args.push_back( make_pair(sizeof(cl_mem), (void *)&alphaOcl.data));
-            }
+
+            ocn = interpolation == INTER_LINEAR ? ocn : -1;
+            depth = interpolation == INTER_LINEAR ? depth : -1;
  
              openCLExecuteKernel(src.clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args,
                                  ocn, depth, buildOption.c_str());
diff --git a/modules/ocl/src/opencl/imgproc_resize.cl b/modules/ocl/src/opencl/imgproc_resize.cl

index 5a69e87862292ebb7105d1fcc69e7eb0678d3db8..ebf8c712b7ef3188fcf0c8df2be7516794ca4716 100644 (file)
--- a/modules/ocl/src/opencl/imgproc_resize.cl
+++ b/modules/ocl/src/opencl/imgproc_resize.cl
@@ -315,7 +315,44 @@ __kernel void resizeNN(__global T * dst, __global T * src,
      }
  }
  
-#elif AREA
+#elif defined AREA
+
+#ifdef AREA_FAST
+
+__kernel void resizeAREA_FAST(__global T * dst, __global T * src,
+                         int dst_offset, int src_offset, int dst_step, int src_step,
+                         int src_cols, int src_rows, int dst_cols, int dst_rows, WT ifx, WT ify,
+                         __global const int * dmap_tab, __global const int * smap_tab)
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if (dx < dst_cols && dy < dst_rows)
+    {
+        int dst_index = mad24(dy, dst_step, dst_offset + dx);
+
+        __global const int * xmap_tab = dmap_tab;
+        __global const int * ymap_tab = dmap_tab + dst_cols;
+        __global const int * sxmap_tab = smap_tab;
+        __global const int * symap_tab = smap_tab + XSCALE * dst_cols;
+
+        int sx = xmap_tab[dx], sy = ymap_tab[dy];
+        WTV sum = (WTV)(0);
+
+        #pragma unroll
+        for (int y = 0; y < YSCALE; ++y)
+        {
+            int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
+            #pragma unroll
+            for (int x = 0; x < XSCALE; ++x)
+                sum += convertToWTV(src[src_index + sxmap_tab[sx + x]]);
+        }
+
+        dst[dst_index] = convertToT(convertToWT2V(sum) * (WT2V)(SCALE));
+    }
+}
+
+#else
  
  __kernel void resizeAREA(__global T * dst, __global T * src,
                           int dst_offset, int src_offset, int dst_step, int src_step,
@@ -364,3 +401,5 @@ __kernel void resizeAREA(__global T * dst, __global T * src,
  }
  
  #endif
+
+#endif
diff --git a/modules/ocl/test/utility.cpp b/modules/ocl/test/utility.cpp

index f98604292969ed950520749b8366e2c1a0851afe..c9cb0d8d67d5be074c83ff77b73102cf7380d6fb 100644 (file)
--- a/modules/ocl/test/utility.cpp
+++ b/modules/ocl/test/utility.cpp
@@ -231,7 +231,7 @@ double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& o
      return final_test_result;
  }
  
-void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow)
+void showDiff(const Mat& src, const Mat& gold, const Mat& actual, double eps, bool alwaysShow)
  {
      Mat diff, diff_thresh;
      absdiff(gold, actual, diff);
@@ -240,10 +240,18 @@ void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow)
  
      if (alwaysShow || cv::countNonZero(diff_thresh.reshape(1)) > 0)
      {
+#if 0
+        std::cout << "Src: " << std::endl << src << std::endl;
+        std::cout << "Reference: " << std::endl << gold << std::endl;
+        std::cout << "OpenCL: " << std::endl << actual << std::endl;
+#endif
+
+        namedWindow("src", WINDOW_NORMAL);
          namedWindow("gold", WINDOW_NORMAL);
          namedWindow("actual", WINDOW_NORMAL);
          namedWindow("diff", WINDOW_NORMAL);
  
+        imshow("src", src);
          imshow("gold", gold);
          imshow("actual", actual);
          imshow("diff", diff);
diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp

index bfbe579361914070fd850bfef82a4a828eee0f82..a1fe3ffb75e601513dcfd32259988ca672fee287 100644 (file)
--- a/modules/ocl/test/utility.hpp
+++ b/modules/ocl/test/utility.hpp
@@ -52,7 +52,7 @@ extern int LOOP_TIMES;
  
  namespace cvtest {
  
-void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow = false);
+void showDiff(const Mat& src, const Mat& gold, const Mat& actual, double eps, bool alwaysShow = false);
  
  cv::ocl::oclMat createMat_ocl(cv::RNG& rng, Size size, int type, bool useRoi);
  cv::ocl::oclMat loadMat_ocl(cv::RNG& rng, const Mat& m, bool useRoi);
author	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Mon, 18 Nov 2013 14:46:59 +0000 (18:46 +0400)
committer	Ilya Lavrenov <ilya.lavrenov@itseez.com>
	Tue, 19 Nov 2013 09:43:50 +0000 (13:43 +0400)
modules/ocl/perf/perf_imgwarp.cpp		patch \| blob \| history
modules/ocl/src/imgproc.cpp		patch \| blob \| history
modules/ocl/src/opencl/imgproc_resize.cl		patch \| blob \| history
modules/ocl/test/utility.cpp		patch \| blob \| history
modules/ocl/test/utility.hpp		patch \| blob \| history