From 2f1cc018c9be148ebe55976e41ab70faafeee45d Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 21 Aug 2012 17:16:06 +0400 Subject: [PATCH] enabled SSE3 by default; integrated SSE3-optimized bilateral filter (by Grigoriy Frolov); modified API of non-local means (use Input/OutputArrays) --- CMakeLists.txt | 2 +- modules/imgproc/src/smooth.cpp | 237 +++++++++++++++++++-- modules/photo/include/opencv2/photo/denoising.hpp | 28 +-- modules/photo/src/denoising.cpp | 35 ++- .../src/fast_nlmeans_multi_denoising_invoker.hpp | 7 +- 5 files changed, 261 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80cf798..c81cbd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,7 +191,7 @@ OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF (MSVC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF (MSVC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" OFF IF (CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF (MSVC OR CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF (CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 0587685..0fe1b6b 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1294,28 +1294,64 @@ class BilateralFilter_8u_Invoker : public: BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk, int* _space_ofs, float *_space_weight, float *_color_weight) : - ParallelLoopBody(), dest(&_dest), temp(&_temp), radius(_radius), + temp(&_temp), dest(&_dest), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight) { } - + virtual void operator() (const Range& range) const { int i, j, cn = dest->channels(), k; Size size = dest->size(); - + #if CV_SSE3 + int CV_DECL_ALIGNED(16) buf[4]; + float CV_DECL_ALIGNED(16) bufSum[4]; + static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3); + #endif + for( i = range.start; i < range.end; i++ ) { const uchar* sptr = temp->ptr(i+radius) + radius*cn; uchar* dptr = dest->ptr(i); - + if( cn == 1 ) { for( j = 0; j < size.width; j++ ) { float sum = 0, wsum = 0; int val0 = sptr[j]; - for( k = 0; k < maxk; k++ ) + k = 0; + #if CV_SSE3 + if( haveSSE3 ) + { + __m128 _val0 = _mm_set1_ps(val0); + const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); + + for( ; k <= maxk - 4; k += 4 ) + { + __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]], + sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]); + + __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0)); + _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val)); + + __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]], + color_weight[buf[1]],color_weight[buf[0]]); + __m128 _sw = _mm_loadu_ps(space_weight+k); + __m128 _w = _mm_mul_ps(_cw, _sw); + _cw = _mm_mul_ps(_w, _valF); + + _sw = _mm_hadd_ps(_w, _cw); + _sw = _mm_hadd_ps(_sw, _sw); + _mm_storel_pi((__m64*)bufSum, _sw); + + sum += bufSum[1]; + wsum += bufSum[0]; + } + } + #endif + for( ; k < maxk; k++ ) { int val = sptr[j + space_ofs[k]]; float w = space_weight[k]*color_weight[std::abs(val - val0)]; @@ -1333,7 +1369,57 @@ public: { float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; - for( k = 0; k < maxk; k++ ) + k = 0; + #if CV_SSE3 + if( haveSSE3 ) + { + const __m128 _b0 = _mm_set1_ps(b0); + const __m128 _g0 = _mm_set1_ps(g0); + const __m128 _r0 = _mm_set1_ps(r0); + const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); + + for( ; k <= maxk - 4; k += 4 ) + { + const uchar* sptr_k = sptr + j + space_ofs[k]; + const uchar* sptr_k1 = sptr + j + space_ofs[k+1]; + const uchar* sptr_k2 = sptr + j + space_ofs[k+2]; + const uchar* sptr_k3 = sptr + j + space_ofs[k+3]; + + __m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]); + __m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]); + __m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]); + + __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0)); + __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0)); + __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0)); + + bt =_mm_add_ps(rt, _mm_add_ps(bt, gt)); + _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt)); + + __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]], + color_weight[buf[1]],color_weight[buf[0]]); + __m128 _sw = _mm_loadu_ps(space_weight+k); + + _w = _mm_mul_ps(_w,_sw); + _b = _mm_mul_ps(_b, _w); + _g = _mm_mul_ps(_g, _w); + _r = _mm_mul_ps(_r, _w); + + _w = _mm_hadd_ps(_w, _b); + _g = _mm_hadd_ps(_g, _r); + + _w = _mm_hadd_ps(_w, _g); + _mm_store_ps(bufSum, _w); + + wsum += bufSum[0]; + sum_b += bufSum[1]; + sum_g += bufSum[2]; + sum_r += bufSum[3]; + } + } + #endif + + for( ; k < maxk; k++ ) { const uchar* sptr_k = sptr + j + space_ofs[k]; int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; @@ -1351,10 +1437,10 @@ public: } } } - + private: - Mat *dest; const Mat *temp; + Mat *dest; int radius, maxk, *space_ofs; float *space_weight, *color_weight; }; @@ -1364,46 +1450,51 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, double sigma_color, double sigma_space, int borderType ) { + int cn = src.channels(); int i, j, maxk, radius; Size size = src.size(); - + CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.type() == dst.type() && src.size() == dst.size() && src.data != dst.data ); - + if( sigma_color <= 0 ) sigma_color = 1; if( sigma_space <= 0 ) sigma_space = 1; - + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - + if( d <= 0 ) radius = cvRound(sigma_space*1.5); else radius = d/2; radius = MAX(radius, 1); d = radius*2 + 1; - + Mat temp; copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - + vector _color_weight(cn*256); vector _space_weight(d*d); vector _space_ofs(d*d); float* color_weight = &_color_weight[0]; float* space_weight = &_space_weight[0]; int* space_ofs = &_space_ofs[0]; - + // initialize color-related bilateral filter coefficients + for( i = 0; i < 256*cn; i++ ) color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); - + // initialize space-related bilateral filter coefficients for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) + { + j = -radius; + + for( ;j <= radius; j++ ) { double r = std::sqrt((double)i*i + (double)j*j); if( r > radius ) @@ -1411,7 +1502,8 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); space_ofs[maxk++] = (int)(i*temp.step + j*cn); } - + } + BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); parallel_for_(Range(0, size.height), body); } @@ -1424,7 +1516,7 @@ public: BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs, const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) : - ParallelLoopBody(), cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), + cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT) { } @@ -1433,6 +1525,12 @@ public: { int i, j, k; Size size = dest->size(); + #if CV_SSE3 + int CV_DECL_ALIGNED(16) idxBuf[4]; + float CV_DECL_ALIGNED(16) bufSum32[4]; + static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3); + #endif for( i = range.start; i < range.end; i++ ) { @@ -1445,7 +1543,44 @@ public: { float sum = 0, wsum = 0; float val0 = sptr[j]; - for( k = 0; k < maxk; k++ ) + k = 0; + #if CV_SSE3 + if( haveSSE3 ) + { + const __m128 _val0 = _mm_set1_ps(sptr[j]); + const __m128 _scale_index = _mm_set1_ps(scale_index); + const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); + + for( ; k <= maxk - 4 ; k += 4 ) + { + __m128 _sw = _mm_loadu_ps(space_weight + k); + __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]], + sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]); + __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index); + + __m128i _idx = _mm_cvtps_epi32(_alpha); + _mm_store_si128((__m128i*)idxBuf, _idx); + _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx)); + + __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], + expLUT[idxBuf[1]], expLUT[idxBuf[0]]); + __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], + expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]); + + __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut)))); + _val = _mm_mul_ps(_w, _val); + + _sw = _mm_hadd_ps(_w, _val); + _sw = _mm_hadd_ps(_sw, _sw); + _mm_storel_pi((__m64*)bufSum32, _sw); + + sum += bufSum32[1]; + wsum += bufSum32[0]; + } + } + #endif + + for( ; k < maxk; k++ ) { float val = sptr[j + space_ofs[k]]; float alpha = (float)(std::abs(val - val0)*scale_index); @@ -1465,7 +1600,64 @@ public: { float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; - for( k = 0; k < maxk; k++ ) + k = 0; + #if CV_SSE3 + if( haveSSE3 ) + { + const __m128 _b0 = _mm_set1_ps(b0); + const __m128 _g0 = _mm_set1_ps(g0); + const __m128 _r0 = _mm_set1_ps(r0); + const __m128 _scale_index = _mm_set1_ps(scale_index); + const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); + + for( ; k <= maxk-4; k += 4 ) + { + __m128 _sw = _mm_loadu_ps(space_weight + k); + + const float* sptr_k = sptr + j + space_ofs[k]; + const float* sptr_k1 = sptr + j + space_ofs[k+1]; + const float* sptr_k2 = sptr + j + space_ofs[k+2]; + const float* sptr_k3 = sptr + j + space_ofs[k+3]; + + __m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]); + __m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]); + __m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]); + + __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0)); + __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0)); + __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0)); + + __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt))); + + __m128i _idx = _mm_cvtps_epi32(_alpha); + _mm_store_si128((__m128i*)idxBuf, _idx); + _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx)); + + __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]); + __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]); + + __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut)))); + + _b = _mm_mul_ps(_b, _w); + _g = _mm_mul_ps(_g, _w); + _r = _mm_mul_ps(_r, _w); + + _w = _mm_hadd_ps(_w, _b); + _g = _mm_hadd_ps(_g, _r); + + _w = _mm_hadd_ps(_w, _g); + _mm_store_ps(bufSum32, _w); + + wsum += bufSum32[0]; + sum_b += bufSum32[1]; + sum_g += bufSum32[2]; + sum_r += bufSum32[3]; + } + + } + #endif + + for(; k < maxk; k++ ) { const float* sptr_k = sptr + j + space_ofs[k]; float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; @@ -1493,6 +1685,7 @@ private: Mat *dest; float scale_index, *space_weight, *expLUT; }; + static void bilateralFilter_32f( const Mat& src, Mat& dst, int d, @@ -1569,7 +1762,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, } // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) + for( i = -radius, maxk = 0; i <= radius; i++ ) for( j = -radius; j <= radius; j++ ) { double r = std::sqrt((double)i*i + (double)j*j); diff --git a/modules/photo/include/opencv2/photo/denoising.hpp b/modules/photo/include/opencv2/photo/denoising.hpp index b322c31..1fc35a6 100644 --- a/modules/photo/include/opencv2/photo/denoising.hpp +++ b/modules/photo/include/opencv2/photo/denoising.hpp @@ -55,23 +55,23 @@ namespace cv { -CV_EXPORTS void fastNlMeansDenoising( const Mat& src, Mat& dst, - int templateWindowSize, int searchWindowSize, int h); +CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, + int templateWindowSize, int searchWindowSize, int h); -CV_EXPORTS void fastNlMeansDenoisingColored( const Mat& src, Mat& dst, - int templateWindowSize, int searchWindowSize, - int h, int hForColorComponents); +CV_EXPORTS_W void fastNlMeansDenoisingColored( InputArray src, OutputArray dst, + int templateWindowSize, int searchWindowSize, + int h, int hForColorComponents); -CV_EXPORTS void fastNlMeansDenoisingMulti( const std::vector& srcImgs, - int imgToDenoiseIndex, int temporalWindowSize, - Mat& dst, - int templateWindowSize, int searchWindowSize, int h); +CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, + int imgToDenoiseIndex, int temporalWindowSize, + OutputArray dst, + int templateWindowSize, int searchWindowSize, int h); -CV_EXPORTS void fastNlMeansDenoisingColoredMulti( const std::vector& srcImgs, - int imgToDenoiseIndex, int temporalWindowSize, - Mat& dst, - int templateWindowSize, int searchWindowSize, - int h, int hForColorComponents); +CV_EXPORTS_W void fastNlMeansDenoisingColoredMulti( InputArrayOfArrays srcImgs, + int imgToDenoiseIndex, int temporalWindowSize, + OutputArray dst, + int templateWindowSize, int searchWindowSize, + int h, int hForColorComponents); } #endif diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp index b980b40..7452d0d 100644 --- a/modules/photo/src/denoising.cpp +++ b/modules/photo/src/denoising.cpp @@ -45,9 +45,13 @@ #include "fast_nlmeans_denoising_invoker.hpp" #include "fast_nlmeans_multi_denoising_invoker.hpp" -void cv::fastNlMeansDenoising( const cv::Mat& src, cv::Mat& dst, +void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, int templateWindowSize, int searchWindowSize, int h) -{ +{ + Mat src = _src.getMat(); + _dst.create(src.size(), src.type()); + Mat dst = _dst.getMat(); + switch (src.type()) { case CV_8U: parallel_for(cv::BlockedRange(0, src.rows), @@ -70,10 +74,14 @@ void cv::fastNlMeansDenoising( const cv::Mat& src, cv::Mat& dst, } } -void cv::fastNlMeansDenoisingColored( const cv::Mat& src, cv::Mat& dst, +void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst, int templateWindowSize, int searchWindowSize, int h, int hForColorComponents) { + Mat src = _src.getMat(); + _dst.create(src.size(), src.type()); + Mat dst = _dst.getMat(); + if (src.type() != CV_8UC3) { CV_Error(CV_StsBadArg, "Type of input image should be CV_8UC3!"); return; @@ -130,15 +138,20 @@ static void fastNlMeansDenoisingMultiCheckPreconditions( } } -void cv::fastNlMeansDenoisingMulti( const std::vector& srcImgs, +void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, int imgToDenoiseIndex, int temporalWindowSize, - cv::Mat& dst, + OutputArray _dst, int templateWindowSize, int searchWindowSize, int h) -{ +{ + vector srcImgs; + _srcImgs.getMatVector(srcImgs); + fastNlMeansDenoisingMultiCheckPreconditions( srcImgs, imgToDenoiseIndex, temporalWindowSize, templateWindowSize, searchWindowSize ); + _dst.create(srcImgs[0].size(), srcImgs[0].type()); + Mat dst = _dst.getMat(); switch (srcImgs[0].type()) { case CV_8U: @@ -165,16 +178,22 @@ void cv::fastNlMeansDenoisingMulti( const std::vector& srcImgs, } } -void cv::fastNlMeansDenoisingColoredMulti( const std::vector& srcImgs, +void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, int imgToDenoiseIndex, int temporalWindowSize, - cv::Mat& dst, + OutputArray _dst, int templateWindowSize, int searchWindowSize, int h, int hForColorComponents) { + vector srcImgs; + _srcImgs.getMatVector(srcImgs); + fastNlMeansDenoisingMultiCheckPreconditions( srcImgs, imgToDenoiseIndex, temporalWindowSize, templateWindowSize, searchWindowSize ); + + _dst.create(srcImgs[0].size(), srcImgs[0].type()); + Mat dst = _dst.getMat(); int src_imgs_size = (int)srcImgs.size(); diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp index 02185d1..602007e 100644 --- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp @@ -270,9 +270,9 @@ void FastNlMeansMultiDenoisingInvoker::operator() (const BlockedRange& range) estimation[channel_num] = 0; } for (int d = 0; d < temporal_window_size_; d++) { + const Mat& esrc_d = extended_srcs_[d]; for (int y = 0; y < search_window_size_; y++) { - const T* cur_row_ptr = - extended_srcs_[d].ptr(border_size_ + search_window_y + y); + const T* cur_row_ptr = esrc_d.ptr(border_size_ + search_window_y + y); int* dist_sums_row = dist_sums.row_ptr(d, y); @@ -298,7 +298,8 @@ void FastNlMeansMultiDenoisingInvoker::operator() (const BlockedRange& range) dst_.at(i,j) = saturateCastFromArray(estimation); } else { // weights_sum == 0 - dst_.at(i,j) = extended_srcs_[temporal_window_half_size_].at(i,j); + const Mat& esrc = extended_srcs_[temporal_window_half_size_]; + dst_.at(i,j) = esrc.at(i,j); } } } -- 2.7.4