Added workaround for Nvidia: take into account that a 3-channel vector type takes 4*elem_size in local memory

author Alexander Karsakov <alexander.karsakov@itseez.com>

Tue, 2 Sep 2014 08:38:02 +0000 (12:38 +0400)

committer Alexander Karsakov <alexander.karsakov@itseez.com>

Tue, 2 Sep 2014 08:38:02 +0000 (12:38 +0400)
author Alexander Karsakov <alexander.karsakov@itseez.com>
Tue, 2 Sep 2014 08:38:02 +0000 (12:38 +0400)
committer Alexander Karsakov <alexander.karsakov@itseez.com>
Tue, 2 Sep 2014 08:38:02 +0000 (12:38 +0400)
diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp

index dd8f124..48e9a6f 100644 (file)
--- a/modules/imgproc/src/deriv.cpp
+++ b/modules/imgproc/src/deriv.cpp
@@ -671,8 +671,11 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
  
      size_t wgs = dev.maxWorkGroupSize();
      size_t lmsz = dev.localMemSize();
-
      size_t src_step = _src.step(), src_offset = _src.offset();
+    
+    // workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory
+    int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn;
+
      if (((src_offset % src_step) % esz == 0) &&
          (
           (borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||
@@ -680,7 +683,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
            (_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols))
          ) &&
          (tileSizeX * tileSizeYmin  <= wgs) &&
-        (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, cn * 4) <= lmsz)
+        (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz)
         )
      {
          Size size = _src.size(), wholeSize;
@@ -689,7 +692,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
          int wdepth = CV_32F;
  
          size_t tileSizeY = wgs / tileSizeX;
-        while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, cn * 4) > lmsz))
+        while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz))
          {
              tileSizeY /= 2;
          }
author	Alexander Karsakov <alexander.karsakov@itseez.com>
	Tue, 2 Sep 2014 08:38:02 +0000 (12:38 +0400)
committer	Alexander Karsakov <alexander.karsakov@itseez.com>
	Tue, 2 Sep 2014 08:38:02 +0000 (12:38 +0400)