#ifdef HAVE_OPENCL
+#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
else
kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH;
+ ocl::Device dev = ocl::Device::getDefault();
+ int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
+
ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
- format( "-D T=%s -D T1=%s -D cn=%d", ocl::memopTypeToStr(type),
- ocl::memopTypeToStr(depth), cn));
+ format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d", ocl::memopTypeToStr(type),
+ ocl::memopTypeToStr(depth), cn, pxPerWIy));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(src),
ocl::KernelArg::WriteOnly(dst), rows, cols);
- size_t maxWorkGroupSize = ocl::Device::getDefault().maxWorkGroupSize();
+ size_t maxWorkGroupSize = dev.maxWorkGroupSize();
CV_Assert(maxWorkGroupSize % 4 == 0);
+
size_t globalsize[2] = { cols, rows }, localsize[2] = { maxWorkGroupSize / 4, 4 };
- return k.run(2, globalsize, flipType == FLIP_COLS ? localsize : NULL, false);
+ globalsize[1] = DIVUP(globalsize[1], pxPerWIy);
+
+ return k.run(2, globalsize, (flipType == FLIP_COLS) && (!dev.isIntel()) ? localsize : NULL, false);
}
#endif
int rows, int cols, int thread_rows, int thread_cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1)*PIX_PER_WI_Y;
- if (x < cols && y < thread_rows)
+ if (x < cols)
{
- T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
- T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset)));
+ #pragma unroll
+ for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y)
+ {
+ T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+ T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset)));
- storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
- storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset)));
+ storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+ storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset)));
+ }
}
}
int rows, int cols, int thread_rows, int thread_cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1)*PIX_PER_WI_Y;
- if (x < cols && y < thread_rows)
+ if (x < cols)
{
int x1 = cols - x - 1;
- T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
- T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset)));
+ #pragma unroll
+ for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y)
+ {
+ T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+ T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset)));
- storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset)));
- storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+ storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset)));
+ storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+ }
}
}
int rows, int cols, int thread_rows, int thread_cols)
{
int x = get_global_id(0);
- int y = get_global_id(1);
+ int y = get_global_id(1)*PIX_PER_WI_Y;
- if (x < thread_cols && y < rows)
+ if (x < thread_cols)
{
int x1 = cols - x - 1;
- T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
- T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset)));
+ #pragma unroll
+ for (int cy = 0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y)
+ {
+ T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+ T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset)));
- storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset)));
- storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+ storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset)));
+ storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+ }
}
}