From 9a4265a8d08412acbb0a8a5bf203fca1a6a50f0f Mon Sep 17 00:00:00 2001 From: Anatoly Baksheev Date: Thu, 4 Oct 2012 19:36:48 +0400 Subject: [PATCH] fast nlm (class version) --- modules/gpu/doc/image_processing.rst | 72 ++++++++- modules/gpu/include/opencv2/gpu/gpu.hpp | 19 ++- modules/gpu/perf/perf_denoising.cpp | 91 ++++++++--- modules/gpu/src/cuda/copy_make_border.cu | 2 +- modules/gpu/src/cuda/nlm.cu | 248 ++++++++++++++++------------- modules/gpu/src/denoising.cpp | 265 +++++++------------------ modules/gpu/src/imgproc.cpp | 4 +- modules/gpu/test/test_denoising.cpp | 34 ++-- 8 files changed, 376 insertions(+), 359 deletions(-) diff --git a/modules/gpu/doc/image_processing.rst b/modules/gpu/doc/image_processing.rst index 2b4bb2e..3d6bc76 100644 --- a/modules/gpu/doc/image_processing.rst +++ b/modules/gpu/doc/image_processing.rst @@ -849,15 +849,15 @@ gpu::nonLocalMeans ------------------- Performs pure non local means denoising without any simplification, and thus it is not fast. -.. ocv:function:: void nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_widow_size = 11, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null()) +.. ocv:function:: void nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null()) :param src: Source image. Supports only CV_8UC1, CV_8UC2 and CV_8UC3. - :param dst: Destination imagwe. + :param dst: Destination image. :param h: Filter sigma regulating filter strength for color. - :param search_widow_size: Size of search window. + :param search_window: Size of search window. :param block_size: Size of block used for computing weights. @@ -868,7 +868,73 @@ Performs pure non local means denoising without any simplification, and thus it .. seealso:: :ocv:func:`fastNlMeansDenoising` + +gpu::FastNonLocalMeansDenoising +------------------------------- +.. ocv:class:: gpu::FastNonLocalMeansDenoising + + class FastNonLocalMeansDenoising + { + public: + //! Simple method, recommended for grayscale images (though it supports multichannel images) + void simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null()); + + //! Processes luminance and color components separately + void labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null()); + }; + +The class implements the fast approximate Non-local Means Denoising algorithm. + +gpu::FastNonLocalMeansDenoising::simpleMethod() +----------------------------------------------- +Performs image denoising using the Non-local Means Denoising algorithm http://www.ipol.im/pub/algo/bcm_non_local_means_denoising with several computational optimizations. Noise is expected to be Gaussian white noise. + +.. ocv:function:: void gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null()); + + :param src: Input 8-bit 1-channel, 2-channel or 3-channel image. + + :param dst: Output image with the same size and type as ``src`` . + + :param h: Parameter regulating filter strength. A big h value perfectly removes noise but also removes image details; a smaller h value preserves details but also preserves some noise. + + :param search_window: Size in pixels of the window that is used to compute a weighted average for a given pixel. Should be odd. 
Affects performance linearly: the greater the search_window, the greater the denoising time. Recommended value: 21 pixels. + + :param block_size: Size in pixels of the template patch that is used to compute weights. Should be odd. Recommended value: 7 pixels. + + :param stream: Stream for the asynchronous invocations. + +This function is expected to be applied to grayscale images. For colored images, look at ``FastNonLocalMeansDenoising::labMethod``. + +.. seealso:: + + :ocv:func:`fastNlMeansDenoising` + +gpu::FastNonLocalMeansDenoising::labMethod() +-------------------------------------------- +Modification of ``FastNonLocalMeansDenoising::simpleMethod`` for color images. + +.. ocv:function:: void gpu::FastNonLocalMeansDenoising::labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null()); + + :param src: Input 8-bit 3-channel image. + + :param dst: Output image with the same size and type as ``src`` . + + :param h_luminance: Parameter regulating filter strength. A big h value perfectly removes noise but also removes image details; a smaller h value preserves details but also preserves some noise. + + :param h_color: The same as h_luminance but for color components. For most images a value of 10 is enough to remove colored noise without distorting colors. + + :param search_window: Size in pixels of the window that is used to compute a weighted average for a given pixel. Should be odd. Affects performance linearly: the greater the search_window, the greater the denoising time. Recommended value: 21 pixels. + + :param block_size: Size in pixels of the template patch that is used to compute weights. Should be odd. Recommended value: 7 pixels. + + :param stream: Stream for the asynchronous invocations. + +The function converts the image to the CIELAB colorspace and then separately denoises the L and AB components with the given h parameters using the ``FastNonLocalMeansDenoising::simpleMethod`` function. + +.. seealso:: + :ocv:func:`fastNlMeansDenoisingColored` + gpu::alphaComp ------------------- Composites two images using alpha opacity values contained in each image. diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 408648a..f62b9a4 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -774,11 +774,24 @@ CV_EXPORTS void bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, int borderMode = BORDER_DEFAULT, Stream& stream = Stream::Null()); //! Brute force non-local means algorith (slow but universal) -CV_EXPORTS void nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, - int search_widow_size = 11, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null()); +CV_EXPORTS void nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null()); //! Fast (but approximate)version of non-local means algorith similar to CPU function (running sums technique) -CV_EXPORTS void fastNlMeansDenoising( const GpuMat& src, GpuMat& dst, float h, int search_radius = 10, int block_radius = 3, Stream& s = Stream::Null()); +class CV_EXPORTS FastNonLocalMeansDenoising +{ +public: + //! Simple method, recommended for grayscale images (though it supports multichannel images) + void simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null()); + + //! 
Processes luminance and color components separatelly + void labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window = 21, int block_size = 7, Stream& s = Stream::Null()); + +private: + + GpuMat buffer, extended_src_buffer; + GpuMat lab, l, ab; +}; + struct CV_EXPORTS CannyBuf; diff --git a/modules/gpu/perf/perf_denoising.cpp b/modules/gpu/perf/perf_denoising.cpp index 9d195ef..ba3dd07 100644 --- a/modules/gpu/perf/perf_denoising.cpp +++ b/modules/gpu/perf/perf_denoising.cpp @@ -3,16 +3,18 @@ using namespace std; using namespace testing; +#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::szXGA, perf::sz720p, perf::sz1080p) + ////////////////////////////////////////////////////////////////////// // BilateralFilter -DEF_PARAM_TEST(Sz_Depth_Cn_KernelSz, cv::Size, MatDepth , int, int); +DEF_PARAM_TEST(Sz_Depth_Cn_KernelSz, cv::Size, MatDepth, int, int); PERF_TEST_P(Sz_Depth_Cn_KernelSz, Denoising_BilateralFilter, - Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F), GPU_CHANNELS_1_3_4, Values(3, 5, 9))) + Combine(GPU_DENOISING_IMAGE_SIZES, Values(CV_8U, CV_32F), testing::Values(1, 3), Values(3, 5, 9))) { - declare.time(30.0); + declare.time(60.0); cv::Size size = GET_PARAM(0); int depth = GET_PARAM(1); @@ -57,16 +59,16 @@ PERF_TEST_P(Sz_Depth_Cn_KernelSz, Denoising_BilateralFilter, ////////////////////////////////////////////////////////////////////// // nonLocalMeans -DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth , int, int, int); +DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, int, int, int); PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans, - Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U), Values(1), Values(21), Values(5, 7))) + Combine(GPU_DENOISING_IMAGE_SIZES, Values(CV_8U), Values(1, 3), Values(21), Values(5, 7))) { - declare.time(30.0); + declare.time(60.0); cv::Size size = GET_PARAM(0); - int depth = GET_PARAM(1); - int channels = GET_PARAM(2); + int depth = GET_PARAM(1); + int channels = GET_PARAM(2); int search_widow_size = GET_PARAM(3); int block_size = GET_PARAM(4); @@ -101,22 +103,67 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans, ////////////////////////////////////////////////////////////////////// // fastNonLocalMeans -DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth , int, int, int); +DEF_PARAM_TEST(Sz_Depth_WinSz_BlockSz, cv::Size, MatDepth, int, int); -PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans, - Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U), Values(1), Values(21), Values(5, 7))) +PERF_TEST_P(Sz_Depth_WinSz_BlockSz, Denoising_FastNonLocalMeans, + Combine(GPU_DENOISING_IMAGE_SIZES, Values(CV_8U), Values(21), Values(7))) { - declare.time(30.0); + declare.time(150.0); + + cv::Size size = GET_PARAM(0); + int depth = GET_PARAM(1); + + int search_widow_size = GET_PARAM(2); + int block_size = GET_PARAM(3); + + float h = 10; + int type = CV_MAKE_TYPE(depth, 1); + + cv::Mat src(size, type); + fillRandom(src); + if (runOnGpu) + { + cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat d_dst; + cv::gpu::FastNonLocalMeansDenoising fnlmd; + + fnlmd.simpleMethod(d_src, d_dst, h, search_widow_size, block_size); + + TEST_CYCLE() + { + fnlmd.simpleMethod(d_src, d_dst, h, search_widow_size, block_size); + } + } + else + { + cv::Mat dst; + cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size); + + TEST_CYCLE() + { + cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size); + } + } +} + 
+////////////////////////////////////////////////////////////////////// +// fastNonLocalMeans (colored) + + +PERF_TEST_P(Sz_Depth_WinSz_BlockSz, Denoising_FastNonLocalMeansColored, + Combine(GPU_DENOISING_IMAGE_SIZES, Values(CV_8U), Values(21), Values(7))) +{ + declare.time(350.0); + cv::Size size = GET_PARAM(0); int depth = GET_PARAM(1); - int channels = GET_PARAM(2); - int search_widow_size = GET_PARAM(3); - int block_size = GET_PARAM(4); + int search_widow_size = GET_PARAM(2); + int block_size = GET_PARAM(3); float h = 10; - int type = CV_MAKE_TYPE(depth, channels); + int type = CV_MAKE_TYPE(depth, 3); cv::Mat src(size, type); fillRandom(src); @@ -124,22 +171,24 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans, if (runOnGpu) { cv::gpu::GpuMat d_src(src); - cv::gpu::GpuMat d_dst; - cv::gpu::fastNlMeansDenoising(d_src, d_dst, h, search_widow_size/2, block_size/2); + cv::gpu::GpuMat d_dst; + cv::gpu::FastNonLocalMeansDenoising fnlmd; + + fnlmd.labMethod(d_src, d_dst, h, h, search_widow_size, block_size); TEST_CYCLE() { - cv::gpu::fastNlMeansDenoising(d_src, d_dst, h, search_widow_size/2, block_size/2); + fnlmd.labMethod(d_src, d_dst, h, h, search_widow_size, block_size); } } else { cv::Mat dst; - cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size); + cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size); TEST_CYCLE() { - cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size); + cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size); } } } \ No newline at end of file diff --git a/modules/gpu/src/cuda/copy_make_border.cu b/modules/gpu/src/cuda/copy_make_border.cu index 74120ba..73a2ec6 100644 --- a/modules/gpu/src/cuda/copy_make_border.cu +++ b/modules/gpu/src/cuda/copy_make_border.cu @@ -97,7 +97,7 @@ namespace cv { namespace gpu { namespace device } template void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream); - //template void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream); + template void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream); template void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream); template void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream); diff --git a/modules/gpu/src/cuda/nlm.cu b/modules/gpu/src/cuda/nlm.cu index 6ee7c8f..a1e323a 100644 --- a/modules/gpu/src/cuda/nlm.cu +++ b/modules/gpu/src/cuda/nlm.cu @@ -68,68 +68,70 @@ namespace cv { namespace gpu { namespace device __device__ __forceinline__ float norm2(const float4& v) { return v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w; } template - __global__ void nlm_kernel(const PtrStepSz src, PtrStep dst, const B b, int search_radius, int block_radius, float h2_inv_half) + __global__ void nlm_kernel(const PtrStep src, PtrStepSz dst, const B b, int search_radius, int block_radius, float noise_mult) { typedef typename TypeVec::cn>::vec_type value_type; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (x >= src.cols || y >= src.rows) + const int i = blockDim.y * blockIdx.y + 
threadIdx.y; + const int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j >= dst.cols || i >= dst.rows) return; - - float block_radius2_inv = -1.f/(block_radius * block_radius); + + int bsize = search_radius + block_radius; + int search_window = 2 * search_radius + 1; + float minus_search_window2_inv = -1.f/(search_window * search_window); value_type sum1 = VecTraits::all(0); float sum2 = 0.f; - if (x - search_radius - block_radius >=0 && y - search_radius - block_radius >=0 && - x + search_radius + block_radius < src.cols && y + search_radius + block_radius < src.rows) + if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows) { - - for(float cy = -search_radius; cy <= search_radius; ++cy) - for(float cx = -search_radius; cx <= search_radius; ++cx) - { - float color2 = 0; - for(float by = -block_radius; by <= block_radius; ++by) - for(float bx = -block_radius; bx <= block_radius; ++bx) + for(float y = -search_radius; y <= search_radius; ++y) + for(float x = -search_radius; x <= search_radius; ++x) + { + float dist2 = 0; + for(float ty = -block_radius; ty <= block_radius; ++ty) + for(float tx = -block_radius; tx <= block_radius; ++tx) { - value_type v1 = saturate_cast(src(y + by, x + bx)); - value_type v2 = saturate_cast(src(y + cy + by, x + cx + bx)); - color2 += norm2(v1 - v2); + value_type bv = saturate_cast(src(i + y + ty, j + x + tx)); + value_type av = saturate_cast(src(i + ty, j + tx)); + + dist2 += norm2(av - bv); } - float dist2 = cx * cx + cy * cy; - float w = __expf(color2 * h2_inv_half + dist2 * block_radius2_inv); + float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv); + + /*if (i == 255 && j == 255) + printf("%f %f\n", w, dist2 * minus_h2_inv + (x * x + y * y) * minus_search_window2_inv);*/ - sum1 = sum1 + saturate_cast(src(y + cy, x + cy)) * w; + sum1 = sum1 + w * saturate_cast(src(i + y, j + x)); sum2 += w; } } else { - for(float cy = -search_radius; cy <= search_radius; ++cy) - for(float cx = -search_radius; cx <= search_radius; ++cx) + for(float y = -search_radius; y <= search_radius; ++y) + for(float x = -search_radius; x <= search_radius; ++x) { - float color2 = 0; - for(float by = -block_radius; by <= block_radius; ++by) - for(float bx = -block_radius; bx <= block_radius; ++bx) + float dist2 = 0; + for(float ty = -block_radius; ty <= block_radius; ++ty) + for(float tx = -block_radius; tx <= block_radius; ++tx) { - value_type v1 = saturate_cast(b.at(y + by, x + bx, src.data, src.step)); - value_type v2 = saturate_cast(b.at(y + cy + by, x + cx + bx, src.data, src.step)); - color2 += norm2(v1 - v2); + value_type bv = saturate_cast(b.at(i + y + ty, j + x + tx, src)); + value_type av = saturate_cast(b.at(i + ty, j + tx, src)); + dist2 += norm2(av - bv); } + + float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv); - float dist2 = cx * cx + cy * cy; - float w = __expf(color2 * h2_inv_half + dist2 * block_radius2_inv); - - sum1 = sum1 + saturate_cast(b.at(y + cy, x + cy, src.data, src.step)) * w; + sum1 = sum1 + w * saturate_cast(b.at(i + y, j + x, src)); sum2 += w; } } - dst(y, x) = saturate_cast(sum1 / sum2); + dst(i, j) = saturate_cast(sum1 / sum2); } @@ -141,10 +143,12 @@ namespace cv { namespace gpu { namespace device B b(src.rows, src.cols); - float h2_inv_half = -0.5f/(h * h * VecTraits::cn); - + int block_window = 2 * block_radius + 1; + float minus_h2_inv = -1.f/(h * h * VecTraits::cn); + float noise_mult = minus_h2_inv/(block_window * block_window); + cudaSafeCall( 
cudaFuncSetCacheConfig (nlm_kernel >, cudaFuncCachePreferL1) ); - nlm_kernel<<>>((PtrStepSz)src, (PtrStepSz)dst, b, search_radius, block_radius, h2_inv_half); + nlm_kernel<<>>((PtrStepSz)src, (PtrStepSz)dst, b, search_radius, block_radius, noise_mult); cudaSafeCall ( cudaGetLastError () ); if (stream == 0) @@ -184,18 +188,13 @@ namespace cv { namespace gpu { namespace device __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); } __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); } - - template struct FastNonLocalMenas { enum { - CTA_SIZE = 256, - - //TILE_COLS = 256, - //TILE_ROWS = 32, - - TILE_COLS = 256, + CTA_SIZE = 128, + + TILE_COLS = 128, TILE_ROWS = 32, STRIDE = CTA_SIZE @@ -203,7 +202,7 @@ namespace cv { namespace gpu { namespace device struct plus { - __device__ __forceinline float operator()(float v1, float v2) const { return v1 + v2; } + __device__ __forceinline__ float operator()(float v1, float v2) const { return v1 + v2; } }; int search_radius; @@ -219,14 +218,14 @@ namespace cv { namespace gpu { namespace device PtrStep src; mutable PtrStepi buffer; - __device__ __forceinline__ void initSums_TileFistColumn(int i, int j, int* dist_sums, PtrStepi& col_dist_sums, PtrStepi& up_col_dist_sums) const + __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const { for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) { dist_sums[index] = 0; for(int tx = 0; tx < block_window; ++tx) - col_dist_sums(tx, index) = 0; + col_sums(tx, index) = 0; int y = index / search_window; int x = index - y * search_window; @@ -240,17 +239,15 @@ namespace cv { namespace gpu { namespace device #if 1 for (int tx = -block_radius; tx <= block_radius; ++tx) { - int col_dist_sums_tx_block_radius_index = 0; - + int col_sum = 0; for (int ty = -block_radius; ty <= block_radius; ++ty) { int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx)); dist_sums[index] += dist; - col_dist_sums_tx_block_radius_index += dist; + col_sum += dist; } - - col_dist_sums(tx + block_radius, index) = col_dist_sums_tx_block_radius_index; + col_sums(tx + block_radius, index) = col_sum; } #else for (int ty = -block_radius; ty <= block_radius; ++ty) @@ -259,16 +256,16 @@ namespace cv { namespace gpu { namespace device int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx)); dist_sums[index] += dist; - col_dist_sums(tx + block_radius, index) += dist; + col_sums(tx + block_radius, index) += dist; } #endif - up_col_dist_sums(j, index) = col_dist_sums(block_window - 1, index); + up_col_sums(j, index) = col_sums(block_window - 1, index); } } - __device__ __forceinline__ void shiftLeftSums_TileFirstRow(int i, int j, int first_col, int* dist_sums, PtrStepi& col_dist_sums, PtrStepi& up_col_dist_sums) const - { + __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const + { for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) { int y = index / search_window; @@ -280,54 +277,46 @@ namespace cv { namespace gpu { namespace device int by = i + y - search_radius; int bx = j + x - search_radius + block_radius; - int col_dist_sum = 0; + int col_sum = 0; for (int ty = -block_radius; ty <= block_radius; ++ty) - col_dist_sum += calcDist(src(ay + ty, ax), 
src(by + ty, bx)); - - int old_dist_sums = dist_sums[index]; - int old_col_sum = col_dist_sums(first_col, index); - dist_sums[index] += col_dist_sum - old_col_sum; - + col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx)); - col_dist_sums(first_col, index) = col_dist_sum; - up_col_dist_sums(j, index) = col_dist_sum; + dist_sums[index] += col_sum - col_sums(first, index); + + col_sums(first, index) = col_sum; + up_col_sums(j, index) = col_sum; } } - __device__ __forceinline__ void shiftLeftSums_UsingUpSums(int i, int j, int first_col, int* dist_sums, PtrStepi& col_dist_sums, PtrStepi& up_col_dist_sums) const + __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const { int ay = i; int ax = j + block_radius; - int start_by = i - search_radius; - int start_bx = j - search_radius + block_radius; - T a_up = src(ay - block_radius - 1, ax); T a_down = src(ay + block_radius, ax); for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) - { - dist_sums[index] -= col_dist_sums(first_col, index); - + { int y = index / search_window; int x = index - y * search_window; - int by = start_by + y; - int bx = start_bx + x; + int by = i + y - search_radius; + int bx = j + x - search_radius + block_radius; T b_up = src(by - block_radius - 1, bx); T b_down = src(by + block_radius, bx); - int col_dist_sums_first_col_index = up_col_dist_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up); - - col_dist_sums(first_col, index) = col_dist_sums_first_col_index; - dist_sums[index] += col_dist_sums_first_col_index; - up_col_dist_sums(j, index) = col_dist_sums_first_col_index; + int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up); + + dist_sums[index] += col_sum - col_sums(first, index); + col_sums(first, index) = col_sum; + up_col_sums(j, index) = col_sum; } } - __device__ __forceinline__ void convolve_search_window(int i, int j, const int* dist_sums, PtrStepi& col_dist_sums, PtrStepi& up_col_dist_sums, T& dst) const + __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums, T& dst) const { typedef typename TypeVec::cn>::vec_type sum_type; @@ -336,8 +325,8 @@ namespace cv { namespace gpu { namespace device float bw2_inv = 1.f/(block_window * block_window); - int start_x = j - search_radius; - int start_y = i - search_radius; + int sx = j - search_radius; + int sy = i - search_radius; for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) { @@ -348,7 +337,7 @@ namespace cv { namespace gpu { namespace device float weight = __expf(avg_dist * minus_h2_inv); weights_sum += weight; - sum = sum + weight * saturate_cast(src(start_y + y, start_x + x)); + sum = sum + weight * saturate_cast(src(sy + y, sx + x)); } volatile __shared__ float cta_buffer[CTA_SIZE]; @@ -357,21 +346,19 @@ namespace cv { namespace gpu { namespace device cta_buffer[tid] = weights_sum; __syncthreads(); - Block::reduce(cta_buffer, plus()); - - if (tid == 0) - weights_sum = cta_buffer[0]; + Block::reduce(cta_buffer, plus()); + weights_sum = cta_buffer[0]; __syncthreads(); + for(int n = 0; n < VecTraits::cn; ++n) { cta_buffer[tid] = reinterpret_cast(&sum)[n]; __syncthreads(); - Block::reduce(cta_buffer, plus()); - - if (tid == 0) - reinterpret_cast(&sum)[n] = cta_buffer[0]; + Block::reduce(cta_buffer, plus()); + reinterpret_cast(&sum)[n] = cta_buffer[0]; + __syncthreads(); } @@ -387,17 +374,17 @@ 
namespace cv { namespace gpu { namespace device int tex = ::min(tbx + TILE_COLS, dst.cols); int tey = ::min(tby + TILE_ROWS, dst.rows); - PtrStepi col_dist_sums; - col_dist_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window; - col_dist_sums.step = buffer.step; + PtrStepi col_sums; + col_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window; + col_sums.step = buffer.step; - PtrStepi up_col_dist_sums; - up_col_dist_sums.data = buffer.data + blockIdx.y * search_window * search_window; - up_col_dist_sums.step = buffer.step; + PtrStepi up_col_sums; + up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window; + up_col_sums.step = buffer.step; extern __shared__ int dist_sums[]; //search_window * search_window - int first_col = -1; + int first = 0; for (int i = tby; i < tey; ++i) for (int j = tbx; j < tex; ++j) @@ -406,22 +393,22 @@ namespace cv { namespace gpu { namespace device if (j == tbx) { - initSums_TileFistColumn(i, j, dist_sums, col_dist_sums, up_col_dist_sums); - first_col = 0; + initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums); + first = 0; } else { if (i == tby) - shiftLeftSums_TileFirstRow(i, j, first_col, dist_sums, col_dist_sums, up_col_dist_sums); + shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums); else - shiftLeftSums_UsingUpSums(i, j, first_col, dist_sums, col_dist_sums, up_col_dist_sums); + shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums); - first_col = (first_col + 1) % block_window; + first = (first + 1) % block_window; } __syncthreads(); - convolve_search_window(i, j, dist_sums, col_dist_sums, up_col_dist_sums, dst(i, j)); + convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j)); } } @@ -463,6 +450,55 @@ namespace cv { namespace gpu { namespace device template void nlm_fast_gpu(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t); template void nlm_fast_gpu(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t); template void nlm_fast_gpu(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t); + + + + __global__ void fnlm_split_kernel(const PtrStepSz lab, PtrStepb l, PtrStep ab) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x < lab.cols && y < lab.rows) + { + uchar3 p = lab(y, x); + ab(y,x) = make_uchar2(p.y, p.z); + l(y,x) = p.x; + } + } + + void fnlm_split_channels(const PtrStepSz& lab, PtrStepb l, PtrStep ab, cudaStream_t stream) + { + dim3 b(32, 8); + dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y)); + + fnlm_split_kernel<<>>(lab, l, ab); + cudaSafeCall ( cudaGetLastError () ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + __global__ void fnlm_merge_kernel(const PtrStepb l, const PtrStep ab, PtrStepSz lab) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x < lab.cols && y < lab.rows) + { + uchar2 p = ab(y, x); + lab(y, x) = make_uchar3(l(y, x), p.x, p.y); + } + } + + void fnlm_merge_channels(const PtrStepb& l, const PtrStep& ab, PtrStepSz lab, cudaStream_t stream) + { + dim3 b(32, 8); + dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y)); + + fnlm_merge_kernel<<>>(l, ab, lab); + cudaSafeCall ( cudaGetLastError () ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } } }}} diff --git a/modules/gpu/src/denoising.cpp b/modules/gpu/src/denoising.cpp index 0f56c6f..b140011 100644 --- 
a/modules/gpu/src/denoising.cpp +++ b/modules/gpu/src/denoising.cpp @@ -104,7 +104,7 @@ void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, f func(src, dst, kernel_size, sigma_spatial, sigma_color, gpuBorderType, StreamAccessor::getStream(s)); } -void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window_size, int block_size, int borderMode, Stream& s) +void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, int borderMode, Stream& s) { using cv::gpu::device::imgproc::nlm_bruteforce_gpu; typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream); @@ -121,12 +121,9 @@ void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_ int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType)); - - int search_radius = search_window_size/2; - int block_radius = block_size/2; - + dst.create(src.size(), src.type()); - func(src, dst, search_radius, block_radius, h, gpuBorderType, StreamAccessor::getStream(s)); + func(src, dst, search_window/2, block_window/2, h, gpuBorderType, StreamAccessor::getStream(s)); } @@ -143,220 +140,76 @@ namespace cv { namespace gpu { namespace device template void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer, int search_window, int block_window, float h, cudaStream_t stream); - - } + + void fnlm_split_channels(const PtrStepSz& lab, PtrStepb l, PtrStep ab, cudaStream_t stream); + void fnlm_merge_channels(const PtrStepb& l, const PtrStep& ab, PtrStepSz lab, cudaStream_t stream); + } }}} - - -//class CV_EXPORTS FastNonLocalMeansDenoising -//{ -//public: -// FastNonLocalMeansDenoising(float h, int search_radius, int block_radius, const Size& image_size = Size()) -// { -// if (size.area() != 0) -// allocate_buffers(image_size); -// } - -// void operator()(const GpuMat& src, GpuMat& dst); - -//private: -// void allocate_buffers(const Size& image_size) -// { -// col_dist_sums.create(block_window_, search_window_ * search_window_, CV_32S); -// up_col_dist_sums.create(image_size.width, search_window_ * search_window_, CV_32S); -// } - -// int search_radius_; -// int block_radius; -// GpuMat col_dist_sums_; -// GpuMat up_col_dist_sums_; -//}; - -void cv::gpu::fastNlMeansDenoising( const GpuMat& src, GpuMat& dst, float h, int search_radius, int block_radius, Stream& s) +void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s) { - dst.create(src.size(), src.type()); CV_Assert(src.depth() == CV_8U && src.channels() < 4); + + int border_size = search_window/2 + block_window/2; + Size esize = src.size() + Size(border_size, border_size) * 2; + + cv::gpu::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer); + GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step); - GpuMat extended_src, src_hdr; - int border_size = search_radius + block_radius; cv::gpu::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s); - src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size())); + GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size())); + + int bcols, brows; + device::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows); + buffer.create(brows, bcols, CV_32S); using namespace 
cv::gpu::device::imgproc; typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t); - static const nlm_fast_t funcs[] = { nlm_fast_gpu, nlm_fast_gpu, nlm_fast_gpu, 0 }; + static const nlm_fast_t funcs[] = { nlm_fast_gpu, nlm_fast_gpu, nlm_fast_gpu, 0}; + + dst.create(src.size(), src.type()); + funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s)); +} + +void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s) +{ +#if (CUDA_VERSION < 5000) + (void)src; + (void)dst; + (void)h_luminance; + (void)h_color; + (void)search_window; + (void)block_window; + (void)s; + + CV_Error( CV_GpuApiCallError, "Lab method required CUDA 5.0 and higher" ); +#else + + + CV_Assert(src.type() == CV_8UC3); + + lab.create(src.size(), src.type()); + cv::gpu::cvtColor(src, lab, CV_BGR2Lab, 0, s); - int search_window = 2 * search_radius + 1; - int block_window = 2 * block_radius + 1; + /*Mat t; + cv::cvtColor(Mat(src), t, CV_BGR2Lab); + lab.upload(t);*/ + + l.create(src.size(), CV_8U); + ab.create(src.size(), CV_8UC2); + device::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s)); - int bcols, brows; - nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows); + simpleMethod(l, l, h_luminance, search_window, block_window, s); + simpleMethod(ab, ab, h_color, search_window, block_window, s); - //GpuMat col_dist_sums(block_window * gx, search_window * search_window * gy, CV_32S); - //GpuMat up_col_dist_sums(src.cols, search_window * search_window * gy, CV_32S); - GpuMat buffer(brows, bcols, CV_32S); + device::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s)); + cv::gpu::cvtColor(lab, dst, CV_Lab2BGR, 0, s); - funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s)); -} + /*cv::cvtColor(Mat(lab), t, CV_Lab2BGR); + dst.upload(t);*/ -//void cv::gpu::fastNlMeansDenoisingColored( const GpuMat& src, GpuMat& dst, float h, float hForColorComponents, int templateWindowSize, int searchWindowSize) -//{ -// Mat src = _src.getMat(); -// _dst.create(src.size(), src.type()); -// Mat dst = _dst.getMat(); - -// if (src.type() != CV_8UC3) { -// CV_Error(CV_StsBadArg, "Type of input image should be CV_8UC3!"); -// return; -// } - -// Mat src_lab; -// cvtColor(src, src_lab, CV_LBGR2Lab); - -// Mat l(src.size(), CV_8U); -// Mat ab(src.size(), CV_8UC2); -// Mat l_ab[] = { l, ab }; -// int from_to[] = { 0,0, 1,1, 2,2 }; -// mixChannels(&src_lab, 1, l_ab, 2, from_to, 3); - -// fastNlMeansDenoising(l, l, h, templateWindowSize, searchWindowSize); -// fastNlMeansDenoising(ab, ab, hForColorComponents, templateWindowSize, searchWindowSize); - -// Mat l_ab_denoised[] = { l, ab }; -// Mat dst_lab(src.size(), src.type()); -// mixChannels(l_ab_denoised, 2, &dst_lab, 1, from_to, 3); - -// cvtColor(dst_lab, dst, CV_Lab2LBGR); -//} - -//static void fastNlMeansDenoisingMultiCheckPreconditions( -// const std::vector& srcImgs, -// int imgToDenoiseIndex, int temporalWindowSize, -// int templateWindowSize, int searchWindowSize) -//{ -// int src_imgs_size = (int)srcImgs.size(); -// if (src_imgs_size == 0) { -// CV_Error(CV_StsBadArg, "Input images vector should not be empty!"); -// } - -// if (temporalWindowSize % 2 == 0 || -// searchWindowSize % 2 == 0 || -// templateWindowSize % 2 == 0) { -// CV_Error(CV_StsBadArg, "All windows sizes should be odd!"); 
-// } - -// int temporalWindowHalfSize = temporalWindowSize / 2; -// if (imgToDenoiseIndex - temporalWindowHalfSize < 0 || -// imgToDenoiseIndex + temporalWindowHalfSize >= src_imgs_size) -// { -// CV_Error(CV_StsBadArg, -// "imgToDenoiseIndex and temporalWindowSize " -// "should be choosen corresponding srcImgs size!"); -// } - -// for (int i = 1; i < src_imgs_size; i++) { -// if (srcImgs[0].size() != srcImgs[i].size() || srcImgs[0].type() != srcImgs[i].type()) { -// CV_Error(CV_StsBadArg, "Input images should have the same size and type!"); -// } -// } -//} - -//void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst, -// int imgToDenoiseIndex, int temporalWindowSize, -// float h, int templateWindowSize, int searchWindowSize) -//{ -// vector srcImgs; -// _srcImgs.getMatVector(srcImgs); - -// fastNlMeansDenoisingMultiCheckPreconditions( -// srcImgs, imgToDenoiseIndex, -// temporalWindowSize, templateWindowSize, searchWindowSize -// ); -// _dst.create(srcImgs[0].size(), srcImgs[0].type()); -// Mat dst = _dst.getMat(); - -// switch (srcImgs[0].type()) { -// case CV_8U: -// parallel_for(cv::BlockedRange(0, srcImgs[0].rows), -// FastNlMeansMultiDenoisingInvoker( -// srcImgs, imgToDenoiseIndex, temporalWindowSize, -// dst, templateWindowSize, searchWindowSize, h)); -// break; -// case CV_8UC2: -// parallel_for(cv::BlockedRange(0, srcImgs[0].rows), -// FastNlMeansMultiDenoisingInvoker( -// srcImgs, imgToDenoiseIndex, temporalWindowSize, -// dst, templateWindowSize, searchWindowSize, h)); -// break; -// case CV_8UC3: -// parallel_for(cv::BlockedRange(0, srcImgs[0].rows), -// FastNlMeansMultiDenoisingInvoker( -// srcImgs, imgToDenoiseIndex, temporalWindowSize, -// dst, templateWindowSize, searchWindowSize, h)); -// break; -// default: -// CV_Error(CV_StsBadArg, -// "Unsupported matrix format! 
Only uchar, Vec2b, Vec3b are supported"); -// } -//} - -//void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputArray _dst, -// int imgToDenoiseIndex, int temporalWindowSize, -// float h, float hForColorComponents, -// int templateWindowSize, int searchWindowSize) -//{ -// vector srcImgs; -// _srcImgs.getMatVector(srcImgs); - -// fastNlMeansDenoisingMultiCheckPreconditions( -// srcImgs, imgToDenoiseIndex, -// temporalWindowSize, templateWindowSize, searchWindowSize -// ); - -// _dst.create(srcImgs[0].size(), srcImgs[0].type()); -// Mat dst = _dst.getMat(); - -// int src_imgs_size = (int)srcImgs.size(); - -// if (srcImgs[0].type() != CV_8UC3) { -// CV_Error(CV_StsBadArg, "Type of input images should be CV_8UC3!"); -// return; -// } - -// int from_to[] = { 0,0, 1,1, 2,2 }; - -// // TODO convert only required images -// vector src_lab(src_imgs_size); -// vector l(src_imgs_size); -// vector ab(src_imgs_size); -// for (int i = 0; i < src_imgs_size; i++) { -// src_lab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC3); -// l[i] = Mat::zeros(srcImgs[0].size(), CV_8UC1); -// ab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC2); -// cvtColor(srcImgs[i], src_lab[i], CV_LBGR2Lab); - -// Mat l_ab[] = { l[i], ab[i] }; -// mixChannels(&src_lab[i], 1, l_ab, 2, from_to, 3); -// } - -// Mat dst_l; -// Mat dst_ab; - -// fastNlMeansDenoisingMulti( -// l, dst_l, imgToDenoiseIndex, temporalWindowSize, -// h, templateWindowSize, searchWindowSize); - -// fastNlMeansDenoisingMulti( -// ab, dst_ab, imgToDenoiseIndex, temporalWindowSize, -// hForColorComponents, templateWindowSize, searchWindowSize); - -// Mat l_ab_denoised[] = { dst_l, dst_ab }; -// Mat dst_lab(srcImgs[0].size(), srcImgs[0].type()); -// mixChannels(l_ab_denoised, 2, &dst_lab, 1, from_to, 3); - -// cvtColor(dst_lab, dst, CV_Lab2LBGR); -//} +#endif +} #endif diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 09d0d1f..eb1fff7 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -329,11 +329,11 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream); static const caller_t callers[6][4] = { - { copyMakeBorder_caller , 0/*copyMakeBorder_caller*/ , copyMakeBorder_caller , copyMakeBorder_caller}, + { copyMakeBorder_caller , copyMakeBorder_caller , copyMakeBorder_caller , copyMakeBorder_caller}, {0/*copyMakeBorder_caller*/, 0/*copyMakeBorder_caller*/ , 0/*copyMakeBorder_caller*/, 0/*copyMakeBorder_caller*/}, { copyMakeBorder_caller , 0/*copyMakeBorder_caller*/, copyMakeBorder_caller , copyMakeBorder_caller}, { copyMakeBorder_caller , 0/*copyMakeBorder_caller*/ , copyMakeBorder_caller , copyMakeBorder_caller}, - {0/*copyMakeBorder_caller*/ , 0/*copyMakeBorder_caller*/ , 0/*copyMakeBorder_caller*/ , 0/*copyMakeBorder_caller*/}, + {0/*copyMakeBorder_caller*/, 0/*copyMakeBorder_caller*/ , 0/*copyMakeBorder_caller*/, 0/*copyMakeBorder_caller*/}, { copyMakeBorder_caller , 0/*copyMakeBorder_caller*/ , copyMakeBorder_caller , copyMakeBorder_caller} }; diff --git a/modules/gpu/test/test_denoising.cpp b/modules/gpu/test/test_denoising.cpp index 3cec317..b4d177e 100644 --- a/modules/gpu/test/test_denoising.cpp +++ b/modules/gpu/test/test_denoising.cpp @@ -72,9 +72,7 @@ PARAM_TEST_CASE(BilateralFilter, cv::gpu::DeviceInfo, cv::Size, MatType) TEST_P(BilateralFilter, Accuracy) { cv::Mat src = randomMat(size, type); - //cv::Mat src = 
readImage("hog/road.png", cv::IMREAD_GRAYSCALE); - //cv::Mat src = readImage("csstereobp/aloe-R.png", cv::IMREAD_GRAYSCALE); - + src.convertTo(src, type); cv::gpu::GpuMat dst; @@ -118,16 +116,16 @@ TEST_P(BruteForceNonLocalMeans, Regression) cv::cvtColor(bgr, gray, CV_BGR2GRAY); GpuMat dbgr, dgray; - cv::gpu::nonLocalMeans(GpuMat(bgr), dbgr, 10); - cv::gpu::nonLocalMeans(GpuMat(gray), dgray, 10); + cv::gpu::nonLocalMeans(GpuMat(bgr), dbgr, 20); + cv::gpu::nonLocalMeans(GpuMat(gray), dgray, 20); #if 0 - dumpImage("denoising/denoised_lena_bgr.png", cv::Mat(dbgr)); - dumpImage("denoising/denoised_lena_gray.png", cv::Mat(dgray)); + dumpImage("denoising/nlm_denoised_lena_bgr.png", cv::Mat(dbgr)); + dumpImage("denoising/nlm_denoised_lena_gray.png", cv::Mat(dgray)); #endif - cv::Mat bgr_gold = readImage("denoising/denoised_lena_bgr.png", cv::IMREAD_COLOR); - cv::Mat gray_gold = readImage("denoising/denoised_lena_gray.png", cv::IMREAD_GRAYSCALE); + cv::Mat bgr_gold = readImage("denoising/nlm_denoised_lena_bgr.png", cv::IMREAD_COLOR); + cv::Mat gray_gold = readImage("denoising/nlm_denoised_lena_gray.png", cv::IMREAD_GRAYSCALE); ASSERT_FALSE(bgr_gold.empty() || gray_gold.empty()); EXPECT_MAT_NEAR(bgr_gold, dbgr, 1e-4); @@ -156,27 +154,29 @@ TEST_P(FastNonLocalMeans, Regression) { using cv::gpu::GpuMat; - cv::Mat bgr = readImage("denoising/lena_noised_gaussian_sigma=20_multi_0.png", cv::IMREAD_COLOR); + cv::Mat bgr = readImage("denoising/lena_noised_gaussian_sigma=20_multi_0.png", cv::IMREAD_COLOR); ASSERT_FALSE(bgr.empty()); cv::Mat gray; cv::cvtColor(bgr, gray, CV_BGR2GRAY); GpuMat dbgr, dgray; - cv::gpu::fastNlMeansDenoising(GpuMat(gray), dgray, 10); + cv::gpu::FastNonLocalMeansDenoising fnlmd; + + fnlmd.simpleMethod(GpuMat(gray), dgray, 20); + fnlmd.labMethod(GpuMat(bgr), dbgr, 20, 10); #if 0 //dumpImage("denoising/fnlm_denoised_lena_bgr.png", cv::Mat(dbgr)); - dumpImage("denoising/fnlm_denoised_lena_gray.png", cv::Mat(dgray)); + //dumpImage("denoising/fnlm_denoised_lena_gray.png", cv::Mat(dgray)); #endif - //cv::Mat bgr_gold = readImage("denoising/denoised_lena_bgr.png", cv::IMREAD_COLOR); + cv::Mat bgr_gold = readImage("denoising/fnlm_denoised_lena_bgr.png", cv::IMREAD_COLOR); cv::Mat gray_gold = readImage("denoising/fnlm_denoised_lena_gray.png", cv::IMREAD_GRAYSCALE); - ASSERT_FALSE(/*bgr_gold.empty() || */gray_gold.empty()); - - //EXPECT_MAT_NEAR(bgr_gold, dbgr, 1e-4); - EXPECT_MAT_NEAR(gray_gold, dgray, 1e-4); + ASSERT_FALSE(bgr_gold.empty() || gray_gold.empty()); + EXPECT_MAT_NEAR(bgr_gold, dbgr, 1); + EXPECT_MAT_NEAR(gray_gold, dgray, 1); } INSTANTIATE_TEST_CASE_P(GPU_Denoising, FastNonLocalMeans, ALL_DEVICES); -- 2.7.4