The merge_histogram kernel only needs "BINS" threads to accumulate the
histograms, so it is inefficient to use maxGroupSize directly as the
local size when maxGroupSize is far greater than BINS.
if (!k1.run(1, &globalsize, &wgs, false))
return false;
+ wgs = std::min<size_t>(ocl::Device::getDefault().maxWorkGroupSize(), BINS);
char cvt[40];
ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc,
format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D convertToHT=%s -D HT=%s",