static bool ocl_equalizeHist(InputArray _src, OutputArray _dst)
{
- size_t wgs = std::min<size_t>(ocl::Device::getDefault().maxWorkGroupSize(), BINS);
+ const ocl::Device & dev = ocl::Device::getDefault();
+ int compunits = dev.maxComputeUnits();
+ size_t wgs = dev.maxWorkGroupSize();
+ Size size = _src.size();
+ bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0;
+ int kercn = dev.isAMD() && use16 ? 16 : std::min(4, ocl::predictOptimalVectorWidth(_src));
- // calculation of histogram
- UMat hist;
- if (!ocl_calcHist1(_src, hist))
+ ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
+ format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D kercn=%d -D T=%s%s",
+ BINS, compunits, wgs, kercn,
+ kercn == 4 ? "int" : ocl::typeToStr(CV_8UC(kercn)),
+ _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
+ if (k1.empty())
return false;
- UMat lut(1, 256, CV_8UC1);
- ocl::Kernel k("calcLUT", ocl::imgproc::histogram_oclsrc,
- format("-D BINS=%d -D HISTS_COUNT=1 -D WGS=%d", BINS, (int)wgs));
- if (k.empty())
+ UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1);
+
+ k1.args(ocl::KernelArg::ReadOnly(src),
+ ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total());
+
+ size_t globalsize = compunits * wgs;
+ if (!k1.run(1, &globalsize, &wgs, false))
return false;
- k.args(ocl::KernelArg::PtrWriteOnly(lut),
- ocl::KernelArg::PtrReadOnly(hist), (int)_src.total());
+ wgs = std::min<size_t>(ocl::Device::getDefault().maxWorkGroupSize(), BINS);
+ UMat lut(1, 256, CV_8UC1);
+ ocl::Kernel k2("calcLUT", ocl::imgproc::histogram_oclsrc,
+ format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d",
+ BINS, compunits, (int)wgs));
+ k2.args(ocl::KernelArg::PtrWriteOnly(lut),
+ ocl::KernelArg::PtrReadOnly(ghist), (int)_src.total());
// calculation of LUT
- if (!k.run(1, &wgs, &wgs, false))
+ if (!k2.run(1, &wgs, &wgs, false))
return false;
// execute LUT transparently
#endif
}
-__kernel void calcLUT(__global uchar * dst, __constant int * hist, int total)
+__kernel void calcLUT(__global uchar * dst, __global const int * ghist, int total)
{
int lid = get_local_id(0);
__local int sumhist[BINS];
__local float scale;
- sumhist[lid] = hist[lid];
+#if WGS >= BINS
+ int res = 0;
+#else
+ #pragma unroll
+ for (int i = lid; i < BINS; i += WGS)
+ sumhist[i] = 0;
+#endif
+
+ #pragma unroll
+ for (int i = 0; i < HISTS_COUNT; ++i)
+ {
+ #pragma unroll
+ for (int j = lid; j < BINS; j += WGS)
+#if WGS >= BINS
+ res += ghist[j];
+#else
+ sumhist[j] += ghist[j];
+#endif
+ ghist += BINS;
+ }
+
+#if WGS >= BINS
+ if (lid < BINS)
+ sumhist[lid] = res;
+#endif
barrier(CLK_LOCAL_MEM_FENCE);
if (lid == 0)