args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
+ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
+ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
String kernelName = "calcLut";
size_t localThreads[3] = { 32, 8, 1 };
}
static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
- const int tilesX, const int tilesY, const cv::Size tileSize)
+ const int tilesX, const int tilesY, const Size & tileSize)
{
cl_int2 tile_size;
tile_size.s[0] = tileSize.width;
args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY ));
+ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
+ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
+ args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.offset ));
size_t localThreads[3] = { 32, 8, 1 };
size_t globalThreads[3] = { src.cols, src.rows, 1 };
}
else
{
- cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar());
+ ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0,
+ tilesX_ - (src.cols % tilesX_), BORDER_REFLECT_101, Scalar::all(0));
- tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
+ tileSize = Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
srcForLut = srcExt_;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0)
- {
for (int i = 1; i < 256; ++i)
- {
smem[i] += smem[i - 1];
- }
- }
barrier(CLK_LOCAL_MEM_FENCE);
return smem[tid];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
- {
smem[tid] = val += smem[tid + 128];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
- {
smem[tid] = val += smem[tid + 64];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
- {
smem[tid] += smem[tid + 32];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
- {
smem[tid] += smem[tid + 16];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
- {
smem[tid] += smem[tid + 8];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
- {
smem[tid] += smem[tid + 4];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
- {
smem[tid] += smem[tid + 2];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
- {
smem[256] = smem[tid] + smem[tid + 1];
- }
barrier(CLK_LOCAL_MEM_FENCE);
}
+
#else
+
void reduce(__local volatile int* smem, int val, int tid)
{
smem[tid] = val;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128)
- {
smem[tid] = val += smem[tid + 128];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64)
- {
smem[tid] = val += smem[tid + 64];
- }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 32)
smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 16) {
+
+ if (tid < 16)
+ {
#endif
smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16
- } barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 8) {
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (tid < 8)
+ {
#endif
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
const int srcStep, const int dstStep,
const int2 tileSize, const int tilesX,
- const int clipLimit, const float lutScale)
+ const int clipLimit, const float lutScale,
+ const int src_offset, const int dst_offset)
{
__local int smem[512];
for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
{
- __global const uchar* srcPtr = src + mad24( ty * tileSize.y + i,
- srcStep, tx * tileSize.x );
+ __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
{
const int data = srcPtr[j];
atomic_inc(&smem[data]);
}
}
-
barrier(CLK_LOCAL_MEM_FENCE);
int tHistVal = smem[tid];
-
barrier(CLK_LOCAL_MEM_FENCE);
if (clipLimit > 0)
{
// clip histogram bar
-
int clipped = 0;
if (tHistVal > clipLimit)
{
}
// find number of overall clipped samples
-
reduce(smem, clipped, tid);
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef CPU
const int lutVal = calc_lut(smem, tHistVal, tid);
uint ires = (uint)convert_int_rte(lutScale * lutVal);
- lut[(ty * tilesX + tx) * dstStep + tid] =
+ lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
convert_uchar(clamp(ires, (uint)0, (uint)255));
}
const int srcStep, const int dstStep, const int lutStep,
const int cols, const int rows,
const int2 tileSize,
- const int tilesX, const int tilesY)
+ const int tilesX, const int tilesY,
+ const int src_offset, const int dst_offset, int lut_offset)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
tx1 = max(tx1, 0);
tx2 = min(tx2, tilesX - 1);
- const int srcVal = src[mad24(y, srcStep, x)];
+ const int srcVal = src[mad24(y, srcStep, x + src_offset)];
float res = 0;
- res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya));
- res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya));
- res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya));
- res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya));
+ res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
+ res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
+ res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
+ res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
uint ires = (uint)convert_int_rte(res);
- dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
+ dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
}