From 0c0ebca85566fb2e327f4c0cd24dc71b98c61ea9 Mon Sep 17 00:00:00 2001 From: VBystricky Date: Wed, 4 Jun 2014 23:50:23 +0400 Subject: [PATCH] Read 4 pixel for aligned data with 1 channel --- modules/core/src/convert.cpp | 8 ++++++-- modules/core/src/opencl/lut.cl | 22 ++++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 1f53fa4..162eaac 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1548,10 +1548,12 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst) UMat src = _src.getUMat(), lut = _lut.getUMat(); _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn)); UMat dst = _dst.getUMat(); + bool bAligned = (1 == dcn) && (0 == (src.offset % 4)) && (0 == (src.cols % 4)); ocl::Kernel k("LUT", ocl::core::lut_oclsrc, - format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", dcn, lcn, - ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth) + format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s%s", dcn, lcn, + ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth), + bAligned ? " -D USE_ALIGNED" : "" )); if (k.empty()) return false; @@ -1560,6 +1562,8 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst) ocl::KernelArg::WriteOnly(dst)); size_t globalSize[2] = { dst.cols, (dst.rows + 3) / 4}; + if (bAligned) + globalSize[0] = (dst.cols + 3) / 4; return k.run(2, globalSize, NULL, false); } diff --git a/modules/core/src/opencl/lut.cl b/modules/core/src/opencl/lut.cl index a33d50c..295f0ae 100644 --- a/modules/core/src/opencl/lut.cl +++ b/modules/core/src/opencl/lut.cl @@ -57,10 +57,20 @@ dst[0] = lut_l[idx->x];\ dst[1] = lut_l[idx->y]; #elif dcn == 1 - #define LUT_OP(num)\ - uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\ - dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\ - dst[0] = lut_l[idx]; + #ifdef USE_ALIGNED + #define LUT_OP(num)\ + int idx = *(__global const int *)(srcptr + mad24(num, src_step, src_index));\ + dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\ + dst[0] = lut_l[idx & 0xff];\ + dst[1] = lut_l[(idx >> 8) & 0xff];\ + dst[2] = lut_l[(idx >> 16) & 0xff];\ + dst[3] = lut_l[(idx >> 24) & 0xff]; + #else + #define LUT_OP(num)\ + uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\ + dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\ + dst[0] = lut_l[idx]; + #endif #else #define LUT_OP(num)\ src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\ @@ -126,7 +136,11 @@ __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset, __local dstT lut_l[256 * lcn]; LOCAL_LUT_INIT; +#ifdef USE_ALIGNED + int x = 4 * get_global_id(0); +#else int x = get_global_id(0); +#endif int y = 4 * get_global_id(1); if (x < cols && y < rows) -- 2.7.4