From 61f21078297802bb46377ff3b32229a82f31389a Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 9 Sep 2013 16:13:39 +0400 Subject: [PATCH] added IPP optimization of separable 32f filters; fixed IPP version check in DFT; fixed conditions in IPP optimization of norm functions. --- modules/core/src/dxt.cpp | 2 +- modules/core/src/stat.cpp | 18 +++++++++----- modules/imgproc/src/filter.cpp | 52 +++++++++++++++++++++++++++++++++++++---- modules/imgproc/src/imgwarp.cpp | 8 +++---- 4 files changed, 63 insertions(+), 17 deletions(-) diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index a802868..e6fed4e 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -50,7 +50,7 @@ namespace cv # pragma warning(disable: 4748) #endif -#if defined HAVE_IPP && IPP_VERSION_MAJOR >= 7 +#if defined HAVE_IPP && IPP_VERSION_MAJOR*100 + IPP_VERSION_MINOR >= 701 #define USE_IPP_DFT 1 #else #undef USE_IPP_DFT diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index b3fa82c..ff84a34 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -1607,13 +1607,15 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) int depth = src.depth(), cn = src.channels(); normType &= 7; - CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR || + CV_Assert( normType == NORM_INF || normType == NORM_L1 || + normType == NORM_L2 || normType == NORM_L2SQR || ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src.type() == CV_8U) ); #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) size_t total_size = src.total(); int rows = src.size[0], cols = (int)(total_size/rows); - if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) + if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous())) + && cols > 0 && (size_t)rows*cols == total_size && (normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ) { IppiSize sz = { cols, rows }; @@ -1900,8 +1902,10 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) ); size_t total_size = src1.total(); int rows = src1.size[0], cols = (int)(total_size/rows); - if( src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) - && (normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ) + if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous())) + && cols > 0 && (size_t)rows*cols == total_size + && (normType == NORM_INF || normType == NORM_L1 || + normType == NORM_L2 || normType == NORM_L2SQR) ) { IppiSize sz = { cols, rows }; int type = src1.type(); @@ -1974,13 +1978,15 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m CV_Assert( src1.size == src2.size && src1.type() == src2.type() ); normType &= 7; - CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR || + CV_Assert( normType == NORM_INF || normType == NORM_L1 || + normType == NORM_L2 || normType == NORM_L2SQR || ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) ); #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) size_t total_size = src1.total(); int rows = src1.size[0], cols = (int)(total_size/rows); - if( src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) + if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous())) + && cols > 0 && (size_t)rows*cols == total_size && (normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ) { IppiSize sz = { cols, rows }; diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index a2bfa6a..1d05d3c 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -46,6 +46,12 @@ Base Image Filter \****************************************************************************************/ +#if defined HAVE_IPP && IPP_VERSION_MAJOR*100 + IPP_VERSION_MINOR >= 701 +#define USE_IPP_SEP_FILTERS 1 +#else +#undef USE_IPP_SEP_FILTERS +#endif + /* Various border types, image boundaries are denoted with '|' @@ -1445,21 +1451,53 @@ struct RowVec_32f RowVec_32f( const Mat& _kernel ) { kernel = _kernel; + haveSSE = checkHardwareSupport(CV_CPU_SSE); +#ifdef USE_IPP_SEP_FILTERS + bufsz = -1; +#endif } int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; + int _ksize = kernel.rows + kernel.cols - 1; + const float* src0 = (const float*)_src; float* dst = (float*)_dst; const float* _kx = (const float*)kernel.data; + +#ifdef USE_IPP_SEP_FILTERS + IppiSize roisz = { width, 1 }; + if( (cn == 1 || cn == 3) && width >= _ksize*8 ) + { + if( bufsz < 0 ) + { + if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) || + (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0)) + return 0; + } + AutoBuffer buf(bufsz + 64); + uchar* bufptr = alignPtr((uchar*)buf, 32); + int step = (int)(width*sizeof(dst[0])*cn); + float borderValue[] = {0.f, 0.f, 0.f}; + // here is the trick. IPP needs border type and extrapolates the row. We did it already. + // So we pass anchor=0 and ignore the right tail of results since they are incorrect there. + if( (cn == 1 && ippiFilterRowBorderPipeline_32f_C1R(src0, step, &dst, roisz, _kx, _ksize, 0, + ippBorderRepl, borderValue[0], bufptr) < 0) || + (cn == 3 && ippiFilterRowBorderPipeline_32f_C3R(src0, step, &dst, roisz, _kx, _ksize, 0, + ippBorderRepl, borderValue, bufptr) < 0)) + return 0; + return width - _ksize + 1; + } +#endif + + if( !haveSSE ) + return 0; + + int i = 0, k; width *= cn; for( ; i <= width - 8; i += 8 ) { - const float* src = (const float*)_src + i; + const float* src = src0 + i; __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1; for( k = 0; k < _ksize; k++, src += cn ) { @@ -1478,6 +1516,10 @@ struct RowVec_32f } Mat kernel; + bool haveSSE; +#ifdef USE_IPP_SEP_FILTERS + mutable int bufsz; +#endif }; diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index a4fda28..3bbfe69 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1689,12 +1689,10 @@ public: IppiRect dstroi = { 0, dsty, dstwidth, dstheight - dsty }; int bufsize; ippiResizeGetBufSize( srcroi, dstroi, cn, mode, &bufsize ); - Ipp8u *buf; - buf = ippsMalloc_8u( bufsize ); - IppStatus sts; - if( func( src.data, ippiSize(src.cols, src.rows), (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, inv_scale_x, inv_scale_y, 0, 0, mode, buf ) < 0 ) + AutoBuffer buf(bufsize + 64); + uchar* bufptr = alignPtr((uchar*)buf, 32); + if( func( src.data, ippiSize(src.cols, src.rows), (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, inv_scale_x, inv_scale_y, 0, 0, mode, bufptr ) < 0 ) *ok = false; - ippsFree(buf); } private: Mat &src; -- 2.7.4