From a48c1c82486dc89cd8e2acbbdf6085fd6e5aa4ed Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Tue, 2 Sep 2014 12:38:02 +0400 Subject: [PATCH] Added workaround for Nvidia: take into account that 3-channel vector type takes 4*elem_size in local memory. --- modules/imgproc/src/deriv.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp index dd8f124..48e9a6f 100644 --- a/modules/imgproc/src/deriv.cpp +++ b/modules/imgproc/src/deriv.cpp @@ -671,8 +671,11 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, size_t wgs = dev.maxWorkGroupSize(); size_t lmsz = dev.localMemSize(); - size_t src_step = _src.step(), src_offset = _src.offset(); + + // workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory + int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn; + if (((src_offset % src_step) % esz == 0) && ( (borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) || @@ -680,7 +683,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, (_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols)) ) && (tileSizeX * tileSizeYmin <= wgs) && - (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, cn * 4) <= lmsz) + (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz) ) { Size size = _src.size(), wholeSize; @@ -689,7 +692,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, int wdepth = CV_32F; size_t tileSizeY = wgs / tileSizeX; - while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, cn * 4) > lmsz)) + while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz)) { tileSizeY /= 2; } -- 2.7.4