From 2875ce60ea1c10ed72e393e725c5df6c6238e64f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 20 Mar 2014 18:38:31 +0400 Subject: [PATCH] added 3-channels support to optimized version --- modules/core/include/opencv2/core/mat.hpp | 2 ++ modules/core/src/matrix.cpp | 34 +++++++++++++-------- modules/imgproc/src/filter.cpp | 35 +++++++++++----------- modules/imgproc/src/opencl/filterSep_singlePass.cl | 17 +++++++++-- modules/imgproc/test/ocl/test_sepfilter2D.cpp | 11 +------ 5 files changed, 57 insertions(+), 42 deletions(-) diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 33167fa..6b8368f 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -118,6 +118,8 @@ public: virtual int kind() const; virtual int dims(int i=-1) const; + virtual int cols(int i=-1) const; + virtual int rows(int i=-1) const; virtual Size size(int i=-1) const; virtual int sizend(int* sz, int i=-1) const; virtual bool sameSize(const _InputArray& arr) const; diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index db1ce76..0a8d87d 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -1416,6 +1416,16 @@ int _InputArray::kind() const return flags & KIND_MASK; } +int _InputArray::rows(int i) const +{ + return size(i).height; +} + +int _InputArray::cols(int i) const +{ + return size(i).width; +} + Size _InputArray::size(int i) const { int k = kind(); @@ -2078,45 +2088,45 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); } -void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const +void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const { int k = kind(); if( k == MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { - CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(cols, rows)); + CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(_cols, _rows)); CV_Assert(!fixedType() || ((Mat*)obj)->type() == mtype); - ((Mat*)obj)->create(rows, cols, mtype); + ((Mat*)obj)->create(_rows, _cols, mtype); return; } if( k == UMAT && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { - CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(cols, rows)); + CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(_cols, _rows)); CV_Assert(!fixedType() || ((UMat*)obj)->type() == mtype); - ((UMat*)obj)->create(rows, cols, mtype); + ((UMat*)obj)->create(_rows, _cols, mtype); return; } if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { - CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(cols, rows)); + CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows)); CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype); - ((cuda::GpuMat*)obj)->create(rows, cols, mtype); + ((cuda::GpuMat*)obj)->create(_rows, _cols, mtype); return; } if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { - CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(cols, rows)); + CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(_cols, _rows)); CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype); - ((ogl::Buffer*)obj)->create(rows, cols, mtype); + ((ogl::Buffer*)obj)->create(_rows, _cols, mtype); return; } if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 ) { - CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(cols, rows)); + CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(_cols, _rows)); CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype); - ((cuda::CudaMem*)obj)->create(rows, cols, mtype); + ((cuda::CudaMem*)obj)->create(_rows, _cols, mtype); return; } - int sizes[] = {rows, cols}; + int sizes[] = {_rows, _cols}; create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); } diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 2d7c740..024f611 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3428,7 +3428,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY const int optimizedSepFilterLocalSize = 16; static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, - InputArray _row_kernel, InputArray _col_kernel, + Mat row_kernel, Mat col_kernel, int borderType, int ddepth) { Size size = _src.size(), wholeSize; @@ -3439,7 +3439,7 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, size_t src_step = _src.step(), src_offset = _src.offset(); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if ((src_offset % src_step) % esz != 0 || (!doubleSupport && sdepth == CV_64F) || + if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE || borderType == BORDER_REFLECT || borderType == BORDER_WRAP || borderType == BORDER_REFLECT_101)) @@ -3454,10 +3454,10 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D cn=%d", (int)lt2[0], (int)lt2[1], - _row_kernel.size().height / 2, _col_kernel.size().height / 2, - ocl::kernelToStr(_row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(_col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), + " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1], + row_kernel.cols / 2, col_kernel.cols / 2, + ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], @@ -3486,12 +3486,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY, Point anchor, double delta, int borderType ) { + const ocl::Device & d = ocl::Device::getDefault(); Size imgSize = _src.size(); if (abs(delta)> FLT_MIN) return false; - int type = _src.type(), cn = CV_MAT_CN(type); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); if (cn > 4) return false; @@ -3502,21 +3503,21 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, if (kernelY.cols % 2 != 1) return false; - int sdepth = CV_MAT_DEPTH(type); + if (ddepth < 0) + ddepth = sdepth; + + CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && + imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) && + imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) && + (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), + ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, borderType, ddepth), true) + if (anchor.x < 0) anchor.x = kernelX.cols >> 1; if (anchor.y < 0) anchor.y = kernelY.cols >> 1; - if (ddepth < 0) - ddepth = sdepth; - - CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 && - imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) && - imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1) && - (borderType & BORDER_ISOLATED) != 0, - ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true) - UMat src = _src.getUMat(); Size srcWholeSize; Point srcOffset; src.locateROI(srcWholeSize, srcOffset); diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index 705150f..82ccd24 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -74,8 +74,19 @@ #error No extrapolation method #endif -#define SRC(_x,_y) convertToWT(((global srcT*)(Src+(_y)*src_step))[_x]) -#define DST(_x,_y) (((global dstT*)(Dst+dst_offset+(_y)*dst_step))[_x]) +#if CN != 3 +#define loadpix(addr) *(__global const srcT *)(addr) +#define storepix(val, addr) *(__global dstT *)(addr) = val +#define SRCSIZE (int)sizeof(srcT) +#define DSTSIZE (int)sizeof(dstT) +#else +#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) +#define SRCSIZE (int)sizeof(srcT1)*3 +#define DSTSIZE (int)sizeof(dstT1)*3 +#endif + +#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x))) #ifdef BORDER_CONSTANT // CCCCCC|abcdefgh|CCCCCCC @@ -172,5 +183,5 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); //store result into destination image - DST(x,y) = convertToDstT(sum); + storepix(convertToDstT(sum), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); } diff --git a/modules/imgproc/test/ocl/test_sepfilter2D.cpp b/modules/imgproc/test/ocl/test_sepfilter2D.cpp index b724641..0aafb8a 100644 --- a/modules/imgproc/test/ocl/test_sepfilter2D.cpp +++ b/modules/imgproc/test/ocl/test_sepfilter2D.cpp @@ -86,16 +86,7 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1); Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE); - int rest = roiSize.width % 4; - if (rest != 0) - roiSize.width += (4 - rest); Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); - rest = srcBorder.lef % 4; - if (rest != 0) - srcBorder.lef += (4 - rest); - rest = srcBorder.rig % 4; - if (rest != 0) - srcBorder.rig += (4 - rest); randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE); Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); @@ -115,7 +106,7 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) OCL_TEST_P(SepFilter2D, Mat) { - for (int j = 0; j < test_loop_times + 1; j++) + for (int j = 0; j < test_loop_times + 3; j++) { random_roi(); -- 2.7.4