size_t wgs = dev.maxWorkGroupSize();
size_t lmsz = dev.localMemSize();
-
size_t src_step = _src.step(), src_offset = _src.offset();
+
+ // Workaround for NVIDIA: a 3-channel vector type occupies 4 * elem_size in local memory, so estimate local memory usage as if there were 4 channels.
+ int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn;
+
if (((src_offset % src_step) % esz == 0) &&
(
(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||
(_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols))
) &&
(tileSizeX * tileSizeYmin <= wgs) &&
- (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, cn * 4) <= lmsz)
+ (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz)
)
{
Size size = _src.size(), wholeSize;
int wdepth = CV_32F;
size_t tileSizeY = wgs / tileSizeX;
- while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, cn * 4) > lmsz))
+ while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz))
{
tileSizeY /= 2;
}