From 1d8cd3a717160f2097bcb7765097ed6196b3b535 Mon Sep 17 00:00:00 2001 From: Peng Xiao Date: Mon, 10 Jun 2013 18:37:48 +0800 Subject: [PATCH] Add ocl CLACH implementation. Test cases (accuracy and performance) are provided. --- modules/ocl/include/opencv2/ocl/ocl.hpp | 17 ++ modules/ocl/perf/perf_imgproc.cpp | 49 +++++- modules/ocl/src/imgproc.cpp | 185 +++++++++++++++++++++ modules/ocl/src/opencl/imgproc_clahe.cl | 275 ++++++++++++++++++++++++++++++++ modules/ocl/test/test_imgproc.cpp | 45 ++++++ 5 files changed, 570 insertions(+), 1 deletion(-) create mode 100644 modules/ocl/src/opencl/imgproc_clahe.cl diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 01b0f72..4a5debf 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -483,6 +483,23 @@ namespace cv CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist); //! only 8UC1 and 256 bins is supported now CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst); + + //! only 8UC1 is supported now + class CV_EXPORTS CLAHE + { + public: + virtual void apply(const oclMat &src, oclMat &dst) = 0; + + virtual void setClipLimit(double clipLimit) = 0; + virtual double getClipLimit() const = 0; + + virtual void setTilesGridSize(Size tileGridSize) = 0; + virtual Size getTilesGridSize() const = 0; + + virtual void collectGarbage() = 0; + }; + CV_EXPORTS Ptr createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8)); + //! bilateralFilter // supports 8UC1 8UC4 CV_EXPORTS void bilateralFilter(const oclMat& src, oclMat& dst, int d, double sigmaColor, double sigmaSpave, int borderType=BORDER_DEFAULT); diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp index 0aef8b2..e87e821 100644 --- a/modules/ocl/perf/perf_imgproc.cpp +++ b/modules/ocl/perf/perf_imgproc.cpp @@ -921,4 +921,51 @@ PERFTEST(remap) } } -} \ No newline at end of file +} +///////////// CLAHE //////////////////////// +PERFTEST(CLAHE) +{ + Mat src, dst, ocl_dst; + cv::ocl::oclMat d_src, d_dst; + int all_type[] = {CV_8UC1}; + std::string type_name[] = {"CV_8UC1"}; + + double clipLimit = 40.0; + + cv::Ptr clahe = cv::createCLAHE(clipLimit); + cv::Ptr d_clahe = cv::ocl::createCLAHE(clipLimit); + + for (int size = Min_Size; size <= Max_Size; size *= Multiple) + { + for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++) + { + SUBTEST << size << 'x' << size << "; " << type_name[j] ; + + gen(src, size, size, all_type[j], 0, 256); + + CPU_ON; + clahe->apply(src, dst); + CPU_OFF; + + d_src.upload(src); + + WARMUP_ON; + d_clahe->apply(d_src, d_dst); + WARMUP_OFF; + + ocl_dst = d_dst; + + TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0); + + GPU_ON; + d_clahe->apply(d_src, d_dst); + GPU_OFF; + + GPU_FULL_ON; + d_src.upload(src); + d_clahe->apply(d_src, d_dst); + d_dst.download(dst); + GPU_FULL_OFF; + } + } +} diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index ef48b8e..3dbd68d 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -25,6 +25,7 @@ // Xu Pang, pangxu010@163.com // Wu Zailong, bullet@yeah.net // Wenju He, wenju@multicorewareinc.com +// Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -80,6 +81,7 @@ namespace cv extern const char *imgproc_calcHarris; extern const char *imgproc_calcMinEigenVal; extern const char *imgproc_convolve; + extern const char *imgproc_clahe; ////////////////////////////////////OpenCL call wrappers//////////////////////////// template struct index_and_sizeof; @@ -1511,6 +1513,189 @@ namespace cv openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1); LUT(mat_src, lut, mat_dst); } + + //////////////////////////////////////////////////////////////////////// + // CLAHE + namespace clahe + { + inline int divUp(int total, int grain) + { + return (total + grain - 1) / grain * grain; + } + + static void calcLut(const oclMat &src, oclMat &dst, + const int tilesX, const int tilesY, const cv::Size tileSize, + const int clipLimit, const float lutScale) + { + cl_int2 tile_size; + tile_size.s[0] = tileSize.width; + tile_size.s[1] = tileSize.height; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step )); + args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit )); + args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale )); + + String kernelName = "calcLut"; + size_t localThreads[3] = { 32, 8, 1 }; + size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 }; + bool is_cpu = queryDeviceInfo(); + if (is_cpu) + { + openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU"); + } + else + { + cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName); + int wave_size = queryDeviceInfo(kernel); + openCLSafeCall(clReleaseKernel(kernel)); + + static char opt[20] = {0}; + sprintf(opt, " -D WAVE_SIZE=%d", wave_size); + openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt); + } + } + + static void transform(const oclMat &src, oclMat &dst, const oclMat &lut, + const int tilesX, const int tilesY, const cv::Size tileSize) + { + cl_int2 tile_size; + tile_size.s[0] = tileSize.width; + tile_size.s[1] = tileSize.height; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.step )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows )); + args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY )); + + String kernelName = "transform"; + size_t localThreads[3] = { 32, 8, 1 }; + size_t globalThreads[3] = { divUp(src.cols, localThreads[0]), divUp(src.rows, localThreads[1]), 1 }; + + openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1); + } + } + + namespace + { + class CLAHE_Impl : public cv::ocl::CLAHE + { + public: + CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8); + + cv::AlgorithmInfo* info() const; + + void apply(const oclMat &src, oclMat &dst); + + void setClipLimit(double clipLimit); + double getClipLimit() const; + + void setTilesGridSize(cv::Size tileGridSize); + cv::Size getTilesGridSize() const; + + void collectGarbage(); + + private: + double clipLimit_; + int tilesX_; + int tilesY_; + + oclMat srcExt_; + oclMat lut_; + }; + + CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) : + clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY) + { + } + + void CLAHE_Impl::apply(const oclMat &src, oclMat &dst) + { + CV_Assert( src.type() == CV_8UC1 ); + + dst.create( src.size(), src.type() ); + + const int histSize = 256; + + ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_); + + cv::Size tileSize; + oclMat srcForLut; + + if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0) + { + tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_); + srcForLut = src; + } + else + { + cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar()); + + tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_); + srcForLut = srcExt_; + } + + const int tileSizeTotal = tileSize.area(); + const float lutScale = static_cast(histSize - 1) / tileSizeTotal; + + int clipLimit = 0; + if (clipLimit_ > 0.0) + { + clipLimit = static_cast(clipLimit_ * tileSizeTotal / histSize); + clipLimit = std::max(clipLimit, 1); + } + + clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale); + //finish(); + clahe::transform(src, dst, lut_, tilesX_, tilesY_, tileSize); + } + + void CLAHE_Impl::setClipLimit(double clipLimit) + { + clipLimit_ = clipLimit; + } + + double CLAHE_Impl::getClipLimit() const + { + return clipLimit_; + } + + void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize) + { + tilesX_ = tileGridSize.width; + tilesY_ = tileGridSize.height; + } + + cv::Size CLAHE_Impl::getTilesGridSize() const + { + return cv::Size(tilesX_, tilesY_); + } + + void CLAHE_Impl::collectGarbage() + { + srcExt_.release(); + lut_.release(); + } + } + + cv::Ptr createCLAHE(double clipLimit, cv::Size tileGridSize) + { + return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height); + } + //////////////////////////////////bilateralFilter//////////////////////////////////////////////////// static void oclbilateralFilter_8u( const oclMat &src, oclMat &dst, int d, diff --git a/modules/ocl/src/opencl/imgproc_clahe.cl b/modules/ocl/src/opencl/imgproc_clahe.cl new file mode 100644 index 0000000..0d010f7 --- /dev/null +++ b/modules/ocl/src/opencl/imgproc_clahe.cl @@ -0,0 +1,275 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Sen Liu, swjtuls1987@126.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif + +int calc_lut(__local int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid == 0) + { + for (int i = 1; i < 256; ++i) + { + smem[i] += smem[i - 1]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + return smem[tid]; +} + +#ifdef CPU +void reduce(volatile __local int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + { + smem[tid] = val += smem[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + { + smem[tid] = val += smem[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + smem[tid] += smem[tid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem[tid] += smem[tid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem[tid] += smem[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem[tid] += smem[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem[tid] += smem[tid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem[256] = smem[tid] + smem[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} +#else +void reduce(__local volatile int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + { + smem[tid] = val += smem[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + { + smem[tid] = val += smem[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + smem[tid] += smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { +#endif + smem[tid] += smem[tid + 16]; +#if WAVE_SIZE < 16 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) { +#endif + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } +} +#endif + +__kernel void calcLut(__global __const uchar * src, __global uchar * lut, + const int srcStep, const int dstStep, + const int2 tileSize, const int tilesX, + const int clipLimit, const float lutScale) +{ + __local int smem[512]; + + const int tx = get_group_id(0); + const int ty = get_group_id(1); + const unsigned int tid = get_local_id(1) * get_local_size(0) + + get_local_id(0); + + smem[tid] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) + { + __global const uchar* srcPtr = src + mad24( ty * tileSize.y + i, + srcStep, tx * tileSize.x ); + for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) + { + const int data = srcPtr[j]; + atomic_inc(&smem[data]); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + int tHistVal = smem[tid]; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (clipLimit > 0) + { + // clip histogram bar + + int clipped = 0; + if (tHistVal > clipLimit) + { + clipped = tHistVal - clipLimit; + tHistVal = clipLimit; + } + + // find number of overall clipped samples + + reduce(smem, clipped, tid); + barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + clipped = smem[256]; +#else + clipped = smem[0]; +#endif + + // broadcast evaluated value + + __local int totalClipped; + + if (tid == 0) + totalClipped = clipped; + barrier(CLK_LOCAL_MEM_FENCE); + + // redistribute clipped samples evenly + + int redistBatch = totalClipped / 256; + tHistVal += redistBatch; + + int residual = totalClipped - redistBatch * 256; + if (tid < residual) + ++tHistVal; + } + + const int lutVal = calc_lut(smem, tHistVal, tid); + uint ires = (uint)convert_int_rte(lutScale * lutVal); + lut[(ty * tilesX + tx) * dstStep + tid] = + convert_uchar(clamp(ires, (uint)0, (uint)255)); +} + +__kernel void transform(__global __const uchar * src, + __global uchar * dst, + __global uchar * lut, + const int srcStep, const int dstStep, const int lutStep, + const int cols, const int rows, + const int2 tileSize, + const int tilesX, const int tilesY) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= cols || y >= rows) + return; + + const float tyf = (convert_float(y) / tileSize.y) - 0.5f; + int ty1 = convert_int_rtn(tyf); + int ty2 = ty1 + 1; + const float ya = tyf - ty1; + ty1 = max(ty1, 0); + ty2 = min(ty2, tilesY - 1); + + const float txf = (convert_float(x) / tileSize.x) - 0.5f; + int tx1 = convert_int_rtn(txf); + int tx2 = tx1 + 1; + const float xa = txf - tx1; + tx1 = max(tx1, 0); + tx2 = min(tx2, tilesX - 1); + + const int srcVal = src[mad24(y, srcStep, x)]; + + float res = 0; + + res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya)); + res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya)); + res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya)); + res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya)); + + uint ires = (uint)convert_int_rte(res); + dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); +} diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp index 664f8a3..b9f4740 100644 --- a/modules/ocl/test/test_imgproc.cpp +++ b/modules/ocl/test/test_imgproc.cpp @@ -23,6 +23,7 @@ // Rock Li, Rock.Li@amd.com // Wu Zailong, bullet@yeah.net // Xu Pang, pangxu010@163.com +// Sen Liu, swjtuls1987@126.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -1393,6 +1394,46 @@ TEST_P(calcHist, Mat) EXPECT_MAT_NEAR(dst_hist, cpu_hist, 0.0); } } +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// CLAHE +namespace +{ + IMPLEMENT_PARAM_CLASS(ClipLimit, double) +} + +PARAM_TEST_CASE(CLAHE, cv::Size, ClipLimit) +{ + cv::Size size; + double clipLimit; + + cv::Mat src; + cv::Mat dst_gold; + + cv::ocl::oclMat g_src; + cv::ocl::oclMat g_dst; + + virtual void SetUp() + { + size = GET_PARAM(0); + clipLimit = GET_PARAM(1); + + cv::RNG &rng = TS::ptr()->get_rng(); + src = randomMat(rng, size, CV_8UC1, 0, 256, false); + g_src.upload(src); + } +}; + +TEST_P(CLAHE, Accuracy) +{ + cv::Ptr clahe = cv::ocl::createCLAHE(clipLimit); + clahe->apply(g_src, g_dst); + cv::Mat dst(g_dst); + + cv::Ptr clahe_gold = cv::createCLAHE(clipLimit); + clahe_gold->apply(src, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 1.0); +} ///////////////////////////Convolve////////////////////////////////// PARAM_TEST_CASE(ConvolveTestBase, MatType, bool) @@ -1643,6 +1684,10 @@ INSTANTIATE_TEST_CASE_P(histTestBase, calcHist, Combine( ONE_TYPE(CV_32SC1) //no use )); +INSTANTIATE_TEST_CASE_P(ImgProc, CLAHE, Combine( + Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(1300, 1300)), + Values(0.0, 40.0))); + //INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine( // Values(CV_32FC1, CV_32FC1), // Values(false))); // Values(false) is the reserved parameter -- 2.7.4