//! returns the separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
- const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
+ const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1));
//! returns the separable filter engine with the specified filters
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
const Ptr<BaseColumnFilter_GPU> &columnFilter);
//! returns the Gaussian filter engine
- CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
+ CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1));
//! returns filter engine for the generalized Sobel operator
- CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
+ CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) );
//! applies Laplacian operator to the image
// supports only ksize = 1 and ksize = 3
CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
-
/////////////////////////////////// ML ///////////////////////////////////////////
//! Compute closest centers for each lines in source and lable it after center's index
f->apply(src, dst);
}
+const int optimizedSepFilterLocalSize = 16;
+static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst,
+ const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT)
+{
+ size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1};
+ size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1};
+
+ unsigned int src_pitch = src.step;
+ unsigned int dst_pitch = dst.step;
+
+ int src_offset_x = (src.offset % src.step) / src.elemSize();
+ int src_offset_y = src.offset / src.step;
+
+ std::vector<std::pair<size_t , const void *> > args;
+ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
+ args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
+
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x ));
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y ));
+
+ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset ));
+ args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
+
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols ));
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows ));
+
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
+
+ string option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1],
+ row_kernel.rows / 2, col_kernel.rows / 2 );
+
+ option += " -D KERNEL_MATRIX_X=";
+ for(int i=0; i<row_kernel.rows; i++)
+ option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &row_kernel.at<float>(i) ) );
+ option += "0x0";
+
+ option += " -D KERNEL_MATRIX_Y=";
+ for(int i=0; i<col_kernel.rows; i++)
+ option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &col_kernel.at<float>(i) ) );
+ option += "0x0";
+
+ switch(src.type())
+ {
+ case CV_8UC1:
+ option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float";
+ break;
+ case CV_32FC1:
+ option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float";
+ break;
+ case CV_8UC2:
+ option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2";
+ break;
+ case CV_32FC2:
+ option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2";
+ break;
+ case CV_8UC3:
+ option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3";
+ break;
+ case CV_32FC3:
+ option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3";
+ break;
+ case CV_8UC4:
+ option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4";
+ break;
+ case CV_32FC4:
+ option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4";
+ break;
+ default:
+ CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!");
+ break;
+ }
+ switch(dst.type())
+ {
+ case CV_8UC1:
+ option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat";
+ break;
+ case CV_8UC2:
+ option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat";
+ break;
+ case CV_8UC3:
+ option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat";
+ break;
+ case CV_8UC4:
+ option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat";
+ break;
+ case CV_32FC1:
+ option += " -D DSTTYPE=float -D CONVERT_DSTTYPE=";
+ break;
+ case CV_32FC2:
+ option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE=";
+ break;
+ case CV_32FC3:
+ option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE=";
+ break;
+ case CV_32FC4:
+ option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE=";
+ break;
+ default:
+ CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!");
+ break;
+ }
+ switch(bordertype)
+ {
+ case cv::BORDER_CONSTANT:
+ option += " -D BORDER_CONSTANT";
+ break;
+ case cv::BORDER_REPLICATE:
+ option += " -D BORDER_REPLICATE";
+ break;
+ case cv::BORDER_REFLECT:
+ option += " -D BORDER_REFLECT";
+ break;
+ case cv::BORDER_REFLECT101:
+ option += " -D BORDER_REFLECT_101";
+ break;
+ case cv::BORDER_WRAP:
+ option += " -D BORDER_WRAP";
+ break;
+ default:
+ CV_Error(CV_StsBadFlag, "BORDER type is not supported!");
+ break;
+ }
+
+ openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args,
+ -1, -1, option.c_str() );
+}
+
////////////////////////////////////////////////////////////////////////////////////////////////////
// SeparableFilter
return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter));
}
+namespace
+{
+class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU
+{
+public:
+ SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype )
+ {
+ bordertype = btype;
+ rowKernel = rowKernel_;
+ columnKernel = columnKernel_;
+ }
+
+ virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
+ {
+ normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size());
+
+ oclMat srcROI = src(roi);
+ oclMat dstROI = dst(roi);
+
+ sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype);
+ }
+
+ Mat rowKernel;
+ Mat columnKernel;
+ int bordertype;
+};
+}
+
+
static void GPUFilterBox(const oclMat &src, oclMat &dst,
Size &ksize, const Point anchor, const int borderType)
{
}
Ptr<FilterEngine_GPU> cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType,
- const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype)
+ const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize )
{
int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
int cn = CV_MAT_CN(srcType);
int bdepth = std::max(std::max(sdepth, ddepth), CV_32F);
int bufType = CV_MAKETYPE(bdepth, cn);
- Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype);
- Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta);
+ //if image size is non-degenerate and large enough
+ //and if filter support is reasonable to satisfy larger local memory requirements,
+ //then we can use single pass routine to avoid extra runtime calls overhead
+ if( rowKernel.rows <= 21 && columnKernel.rows <= 21 &&
+ (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 &&
+ imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) &&
+ imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) )
+ {
+ return Ptr<FilterEngine_GPU>(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype));
+ }
+ else
+ {
+ Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype);
+ Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta);
- return createSeparableFilter_GPU(rowFilter, columnFilter);
+ return createSeparableFilter_GPU(rowFilter, columnFilter);
+ }
}
void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype)
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
- Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
+ Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size());
f->apply(src, dst);
}
-Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType)
+Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize )
{
Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
return createSeparableLinearFilter_GPU(srcType, dstType,
- kx, ky, Point(-1, -1), 0, borderType);
+ kx, ky, Point(-1, -1), 0, borderType, imgSize);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
// Gaussian Filter
-Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype)
+Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize)
{
int depth = CV_MAT_DEPTH(type);
else
ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
- return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
+ return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize);
}
void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype)
dst.create(src.size(), src.type());
- Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
+ Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size());
f->apply(src, dst);
}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#ifdef BORDER_CONSTANT
+//CCCCCC|abcdefgh|CCCCCCC
+#define EXTRAPOLATE(x, maxV)
+#elif defined BORDER_REPLICATE
+//aaaaaa|abcdefgh|hhhhhhh
+#define EXTRAPOLATE(x, maxV) \
+ { \
+ (x) = max(min((x), (maxV) - 1), 0); \
+ }
+#elif defined BORDER_WRAP
+//cdefgh|abcdefgh|abcdefg
+#define EXTRAPOLATE(x, maxV) \
+ { \
+ (x) = ( (x) + (maxV) ) % (maxV); \
+ }
+#elif defined BORDER_REFLECT
+//fedcba|abcdefgh|hgfedcb
+#define EXTRAPOLATE(x, maxV) \
+ { \
+ (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
+ }
+#elif defined BORDER_REFLECT_101
+//gfedcb|abcdefgh|gfedcba
+#define EXTRAPOLATE(x, maxV) \
+ { \
+ (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
+ }
+#else
+#error No extrapolation method
+#endif
+
+#define SRC(_x,_y) CONVERT_SRCTYPE(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x])
+
+#ifdef BORDER_CONSTANT
+//CCCCCC|abcdefgh|CCCCCCC
+#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
+#else
+#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
+#endif
+
+#define DST(_x,_y) (((global DSTTYPE*)(Dst+DstOffset+(_y)*DstPitch))[_x])
+
+//horizontal and vertical filter kernels
+//should be defined on host during compile time to avoid overhead
+__constant uint mat_kernelX[] = {KERNEL_MATRIX_X};
+__constant uint mat_kernelY[] = {KERNEL_MATRIX_Y};
+
+__kernel __attribute__((reqd_work_group_size(BLK_X,BLK_Y,1))) void sep_filter_singlepass
+ (
+ __global uchar* Src,
+ const uint SrcPitch,
+ const int srcOffsetX,
+ const int srcOffsetY,
+ __global uchar* Dst,
+ const int DstOffset,
+ const uint DstPitch,
+ int width,
+ int height,
+ int dstWidth,
+ int dstHeight
+ )
+{
+ //RADIUSX, RADIUSY are filter dimensions
+ //BLK_X, BLK_Y are local wrogroup sizes
+ //all these should be defined on host during compile time
+ //first lsmem array for source pixels used in first pass,
+ //second lsmemDy for storing first pass results
+ __local WORKTYPE lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX];
+ __local WORKTYPE lsmemDy[BLK_Y][BLK_X+2*RADIUSX];
+
+ //get local and global ids - used as image and local memory array indexes
+ int lix = get_local_id(0);
+ int liy = get_local_id(1);
+
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+
+ //calculate pixel position in source image taking image offset into account
+ int srcX = x + srcOffsetX - RADIUSX;
+ int srcY = y + srcOffsetY - RADIUSY;
+ int xb = srcX;
+ int yb = srcY;
+
+ //extrapolate coordinates, if needed
+ //and read my own source pixel into local memory
+ //with account for extra border pixels, which will be read by starting workitems
+ int clocY = liy;
+ int cSrcY = srcY;
+ do
+ {
+ int yb = cSrcY;
+ EXTRAPOLATE(yb, (height));
+
+ int clocX = lix;
+ int cSrcX = srcX;
+ do
+ {
+ int xb = cSrcX;
+ EXTRAPOLATE(xb,(width));
+ lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
+
+ clocX += BLK_X;
+ cSrcX += BLK_X;
+ }
+ while(clocX < BLK_X+(RADIUSX*2));
+
+ clocY += BLK_Y;
+ cSrcY += BLK_Y;
+ }
+ while(clocY < BLK_Y+(RADIUSY*2));
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //do vertical filter pass
+ //and store intermediate results to second local memory array
+ int i;
+ WORKTYPE sum = 0.0f;
+ int clocX = lix;
+ do
+ {
+ sum = 0.0f;
+ for(i=0; i<=2*RADIUSY; i++)
+ sum = mad(lsmem[liy+i][clocX], as_float(mat_kernelY[i]), sum);
+ lsmemDy[liy][clocX] = sum;
+ clocX += BLK_X;
+ }
+ while(clocX < BLK_X+(RADIUSX*2));
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //if this pixel happened to be out of image borders because of global size rounding,
+ //then just return
+ if( x >= dstWidth || y >=dstHeight ) return;
+
+ //do second horizontal filter pass
+ //and calculate final result
+ sum = 0.0f;
+ for(i=0; i<=2*RADIUSX; i++)
+ sum = mad(lsmemDy[liy][lix+i], as_float(mat_kernelX[i]), sum);
+
+ //store result into destination image
+ DST(x,y) = CONVERT_DSTTYPE(sum);
+}