From 0abe7f319687883669f113ff2716cf2fa7e67886 Mon Sep 17 00:00:00 2001 From: niko Date: Fri, 21 Sep 2012 16:51:45 +0800 Subject: [PATCH] many bugs fix for intel's HD4000 from my svn 855 --- .../ocl/include/opencv2/ocl/matrix_operations.hpp | 11 +- modules/ocl/include/opencv2/ocl/ocl.hpp | 9 +- modules/ocl/perf/perf_imgproc.cpp | 32 +- modules/ocl/src/arithm.cpp | 35 +- modules/ocl/src/filtering.cpp | 68 +- modules/ocl/src/imgproc.cpp | 622 +++++---- modules/ocl/src/initialization.cpp | 4 +- modules/ocl/src/kernels/arithm_addWeighted.cl | 13 +- modules/ocl/src/kernels/arithm_div.cl | 8 +- modules/ocl/src/kernels/arithm_minMaxLoc.cl | 147 +-- modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl | 112 +- modules/ocl/src/kernels/arithm_mul.cl | 13 +- modules/ocl/src/kernels/arithm_pow.cl | 15 +- modules/ocl/src/kernels/convertC3C4.cl | 1 - modules/ocl/src/kernels/filtering_boxFilter.cl | 29 +- modules/ocl/src/kernels/img_proc.cl | 1331 -------------------- modules/ocl/src/kernels/imgproc_bilateral.cl | 95 +- modules/ocl/src/kernels/imgproc_calcHarris.cl | 14 +- modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl | 14 +- modules/ocl/src/kernels/imgproc_canny.cl | 3 +- modules/ocl/src/kernels/imgproc_copymakeboder.cl | 342 +++-- modules/ocl/src/kernels/imgproc_histogram.cl | 8 +- modules/ocl/src/kernels/imgproc_integral_sum.cl | 4 +- modules/ocl/src/kernels/imgproc_resize.cl | 18 +- modules/ocl/src/kernels/imgproc_warpPerspective.cl | 867 +++++++------ modules/ocl/src/matrix_operations.cpp | 14 +- modules/ocl/src/precomp.hpp | 6 +- modules/ocl/src/split_merge.cpp | 32 +- modules/ocl/test/main.cpp | 2 +- modules/ocl/test/test_arithm.cpp | 18 +- modules/ocl/test/test_filters.cpp | 8 +- modules/ocl/test/test_imgproc.cpp | 145 ++- modules/ocl/test/test_split_merge.cpp | 25 +- modules/ocl/test/utility.hpp | 7 +- 34 files changed, 1491 insertions(+), 2581 deletions(-) delete mode 100644 modules/ocl/src/kernels/img_proc.cl diff --git a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp index 89fce5a..7db34f8 100644 --- a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp +++ b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp @@ -98,8 +98,8 @@ namespace cv } inline oclMat::oclMat(int _rows, int _cols, int _type, void *_data, size_t _step) - : flags(Mat::MAGIC_VAL + (_type &TYPE_MASK)), rows(_rows), cols(_cols), step(_step), data((uchar *)_data), refcount(0), - datastart((uchar *)_data), dataend((uchar *)_data), offset(0), wholerows(_rows), wholecols(_cols), download_channels(CV_MAT_CN(_type)) + : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), + datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0), download_channels(0) { cv::Mat m(_rows,_cols,_type,_data,_step); upload(m); @@ -119,9 +119,9 @@ namespace cv } inline oclMat::oclMat(Size _size, int _type, void *_data, size_t _step) - : flags(Mat::MAGIC_VAL + (_type &TYPE_MASK)), rows(_size.height), cols(_size.width), - step(_step), data((uchar *)_data), refcount(0), - datastart((uchar *)_data), dataend((uchar *)_data), offset(0), wholerows(_size.height), wholecols(_size.width), download_channels(CV_MAT_CN(_type)) + : flags(0), rows(0), cols(0), + step(0), data(0), refcount(0), + datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0), download_channels(0) { cv::Mat m(_size,_type,_data,_step); upload(m); @@ -327,6 +327,7 @@ namespace cv std::swap( dataend, b.dataend ); std::swap( refcount, b.refcount ); std::swap( offset, b.offset ); + std::swap( clCxt, b.clCxt ); std::swap( wholerows, b.wholerows ); std::swap( wholecols, b.wholecols ); std::swap( download_channels, b.download_channels); diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index b0d6d83..961831a 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -615,14 +615,17 @@ namespace cv //! erodes the image (applies the local minimum operator) // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 - CV_EXPORTS void erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1); + CV_EXPORTS void erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, + int borderType=BORDER_CONSTANT,const Scalar& borderValue=morphologyDefaultBorderValue()); //! dilates the image (applies the local maximum operator) // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 - CV_EXPORTS void dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1); + CV_EXPORTS void dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, + int borderType=BORDER_CONSTANT,const Scalar& borderValue=morphologyDefaultBorderValue()); //! applies an advanced morphological operation to the image - CV_EXPORTS void morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1); + CV_EXPORTS void morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, + int borderType=BORDER_CONSTANT,const Scalar& borderValue=morphologyDefaultBorderValue()); ////////////////////////////// Image processing ////////////////////////////// //! Does mean shift filtering on GPU. diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp index c1faf6f..9b2b995 100644 --- a/modules/ocl/perf/perf_imgproc.cpp +++ b/modules/ocl/perf/perf_imgproc.cpp @@ -432,10 +432,13 @@ struct CopyMakeBorder : ImgprocTestBase {}; TEST_P(CopyMakeBorder, Mat) { - int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; + int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE,cv::BORDER_REFLECT,cv::BORDER_WRAP,cv::BORDER_REFLECT_101}; //const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; - - if ((mat1.type() != CV_8UC1 && mat1.type() != CV_8UC4 && mat1.type() != CV_32SC1) || mat1.type() != dst.type()) + int top=5; + int bottom=5; + int left=6; + int right=6; + if (mat1.type() != dst.type()) { cout<<"Unsupported type"< impl -> double_support ==0) + if(src1.clCxt -> impl -> double_support ==0 && src1.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -1591,7 +1591,7 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal, size_t groupnum = src.clCxt->impl->maxComputeUnits; CV_Assert(groupnum != 0); int minloc = -1 , maxloc = -1; - int vlen = 8, dbsize = groupnum * vlen * 4 * sizeof(T) , status; + int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) , status; Context *clCxt = src.clCxt; cl_mem dstBuffer = openCLCreateBuffer(clCxt,CL_MEM_WRITE_ONLY,dbsize); *minVal = std::numeric_limits::max() , *maxVal = -std::numeric_limits::max(); @@ -1979,7 +1979,7 @@ void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const o void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst) { - if(src.clCxt -> impl -> double_support ==0) + if(src.clCxt -> impl -> double_support ==0 && src.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -1992,7 +1992,7 @@ void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst) void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { // dst.create(src1.size(),src1.type()); - if(src1.clCxt -> impl -> double_support ==0) + if(src1.clCxt -> impl -> double_support ==0 && src1.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -2008,7 +2008,7 @@ void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, co void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - if(src1.clCxt -> impl -> double_support ==0) + if(src1.clCxt -> impl -> double_support ==0 && src1.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -2023,7 +2023,7 @@ void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, co void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { // dst.create(src1.size(),src1.type()); - if(src1.clCxt -> impl -> double_support ==0) + if(src1.clCxt -> impl -> double_support ==0 && src1.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -2040,7 +2040,7 @@ void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, c void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - if(src1.clCxt -> impl -> double_support ==0) + if(src1.clCxt -> impl -> double_support ==0 && src1.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -2054,7 +2054,7 @@ void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, c void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask) { - if(src1.clCxt -> impl -> double_support ==0) + if(src1.clCxt -> impl -> double_support ==0 && src1.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -2073,7 +2073,7 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, c void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask) { - if(src1.clCxt -> impl -> double_support ==0) + if(src1.clCxt -> impl -> double_support ==0 && src1.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; @@ -2224,9 +2224,10 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, } else { - args.push_back( make_pair( sizeof(cl_float), (void *)&alpha )); - args.push_back( make_pair( sizeof(cl_float), (void *)&beta )); - args.push_back( make_pair( sizeof(cl_float), (void *)&gama )); + float alpha_f=alpha,beta_f=beta,gama_f=gama; + args.push_back( make_pair( sizeof(cl_float), (void *)&alpha_f )); + args.push_back( make_pair( sizeof(cl_float), (void *)&beta_f )); + args.push_back( make_pair( sizeof(cl_float), (void *)&gama_f )); } args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); @@ -2363,13 +2364,19 @@ void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernel args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - args.push_back( make_pair( sizeof(cl_double), (void *)&p )); + if(src1.clCxt -> impl -> double_support ==0) + { + float pf = p; + args.push_back( make_pair( sizeof(cl_float), (void *)&pf )); + } + else + args.push_back( make_pair( sizeof(cl_double), (void *)&p )); openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); } void cv::ocl::pow(const oclMat &x, double p, oclMat &y) { - if(x.clCxt -> impl -> double_support ==0) + if(x.clCxt -> impl -> double_support ==0 && x.type()==CV_64F) { cout << "Selected device do not support double" << endl; return; diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index bbce181..19351bf 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -539,8 +539,12 @@ Ptr cv::ocl::createMorphologyFilter_GPU(int op, int type, cons namespace { - void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point anchor, int iterations) + void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point anchor, int iterations,int borderType,const Scalar& borderValue) { + if((borderType != cv::BORDER_CONSTANT) || (borderValue!=morphologyDefaultBorderValue())) + { + CV_Error(CV_StsBadArg,"unsupported border type"); + } Mat kernel; Size ksize = _kernel.data ? _kernel.size() : Size(3, 3); @@ -576,7 +580,8 @@ namespace } } -void cv::ocl::erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations) +void cv::ocl::erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, + int borderType,const Scalar& borderValue) { bool allZero = true; for(int i = 0; i < kernel.rows * kernel.cols; ++i) @@ -586,46 +591,48 @@ void cv::ocl::erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point an { kernel.data[0] = 1; } - morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations); + morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations,borderType, borderValue); } -void cv::ocl::dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations) +void cv::ocl::dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations, + int borderType,const Scalar& borderValue) { - morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations); + morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations,borderType, borderValue); } -void cv::ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations) +void cv::ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations, + int borderType,const Scalar& borderValue) { oclMat temp; switch( op ) { case MORPH_ERODE: - erode( src, dst, kernel, anchor, iterations); + erode( src, dst, kernel, anchor, iterations,borderType, borderValue); break; case MORPH_DILATE: - dilate( src, dst, kernel, anchor, iterations); + dilate( src, dst, kernel, anchor, iterations,borderType, borderValue); break; case MORPH_OPEN: - erode( src, temp, kernel, anchor, iterations); - dilate( temp, dst, kernel, anchor, iterations); + erode( src, temp, kernel, anchor, iterations,borderType, borderValue); + dilate( temp, dst, kernel, anchor, iterations,borderType, borderValue); break; case CV_MOP_CLOSE: - dilate( src, temp, kernel, anchor, iterations); - erode( temp, dst, kernel, anchor, iterations); + dilate( src, temp, kernel, anchor, iterations,borderType, borderValue); + erode( temp, dst, kernel, anchor, iterations,borderType, borderValue); break; case CV_MOP_GRADIENT: - erode( src, temp, kernel, anchor, iterations); - dilate( src, dst, kernel, anchor, iterations); + erode( src, temp, kernel, anchor, iterations,borderType, borderValue); + dilate( src, dst, kernel, anchor, iterations,borderType, borderValue); subtract(dst, temp, dst); break; case CV_MOP_TOPHAT: - erode( src, dst, kernel, anchor, iterations); - dilate( dst, temp, kernel, anchor, iterations); + erode( src, dst, kernel, anchor, iterations,borderType, borderValue); + dilate( dst, temp, kernel, anchor, iterations,borderType, borderValue); subtract(src, temp, dst); break; case CV_MOP_BLACKHAT: - dilate( src, dst, kernel, anchor, iterations); - erode( dst, temp, kernel, anchor, iterations); + dilate( src, dst, kernel, anchor, iterations,borderType, borderValue); + erode( dst, temp, kernel, anchor, iterations,borderType, borderValue); subtract(temp, src, dst); break; default: @@ -1434,6 +1441,18 @@ Ptr cv::ocl::createSeparableLinearFilter_GPU(int srcType, int void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype) { + if((dst.cols!=dst.wholecols) || (dst.rows!=dst.wholerows))//has roi + { + if((bordertype & cv::BORDER_ISOLATED) != 0) + { + bordertype &= ~cv::BORDER_ISOLATED; + if((bordertype != cv::BORDER_CONSTANT) && + (bordertype != cv::BORDER_REPLICATE)) + { + CV_Error(CV_StsBadArg,"unsupported border type"); + } + } + } if( ddepth < 0 ) ddepth = src.depth(); //CV_Assert(ddepth == src.depth()); @@ -1557,7 +1576,18 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si src.copyTo(dst); return; } - + if((dst.cols!=dst.wholecols) || (dst.rows!=dst.wholerows))//has roi + { + if((bordertype & cv::BORDER_ISOLATED) != 0) + { + bordertype &= ~cv::BORDER_ISOLATED; + if((bordertype != cv::BORDER_CONSTANT) && + (bordertype != cv::BORDER_REPLICATE)) + { + CV_Error(CV_StsBadArg,"unsupported border type"); + } + } + } dst.create(src.size(), src.type()); if( bordertype != BORDER_CONSTANT ) { diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 9a8e05d..06721b0 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -117,7 +117,6 @@ namespace cv ////////////////////////////////////OpenCL kernel strings////////////////////////// extern const char *meanShift; - extern const char *img_proc; extern const char *imgproc_copymakeboder; extern const char *imgproc_median; extern const char *imgproc_threshold; @@ -131,7 +130,7 @@ namespace cv extern const char *imgproc_bilateral; extern const char *imgproc_calcHarris; extern const char *imgproc_calcMinEigenVal; - extern const char *imgproc_convolve; + extern const char *imgproc_convolve; ////////////////////////////////////OpenCL call wrappers//////////////////////////// template struct index_and_sizeof; @@ -415,7 +414,8 @@ namespace cv } else { - args.push_back( make_pair(sizeof(cl_float4),(void*)&borderValue)); + float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]}; + args.push_back( make_pair(sizeof(cl_float4),(void*)&borderFloat)); } } if(map1.channels() == 1) @@ -444,7 +444,8 @@ namespace cv } else { - args.push_back( make_pair(sizeof(cl_float4),(void*)&borderValue)); + float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]}; + args.push_back( make_pair(sizeof(cl_float4),(void*)&borderFloat)); } } openCLExecuteKernel(clCxt,&imgproc_remap,kernelName,globalThreads,localThreads,args,src.channels(),src.depth()); @@ -478,13 +479,13 @@ namespace cv if(src.type() == CV_8UC1) { size_t cols = (dst.cols + dst.offset % 4 + 3) / 4; - glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX; + glbSizeX = cols % blkSizeX == 0 && cols != 0? cols : (cols / blkSizeX + 1) * blkSizeX; } else { - glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX; + glbSizeX = dst.cols % blkSizeX == 0 && dst.cols !=0? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX; } - size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY; + size_t glbSizeY = dst.rows % blkSizeY == 0 && dst.rows != 0? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY; size_t globalThreads[3] = {glbSizeX, glbSizeY, 1}; size_t localThreads[3] = {blkSizeX, blkSizeY, 1}; @@ -545,7 +546,7 @@ namespace cv { if(dsize.width != (int)(src.cols * fx) || dsize.height != (int)(src.rows * fy)) { - std::cout << "invalid dsize and fx, fy!" << std::endl; + CV_Error(CV_StsUnmatchedSizes,"invalid dsize and fx, fy!"); } } if( dsize == Size() ) @@ -629,108 +630,239 @@ namespace cv //////////////////////////////////////////////////////////////////////// // copyMakeBorder - void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int left, int boardtype, void *nVal) + void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar) { - CV_Assert( (src.channels() == dst.channels()) ); - + //CV_Assert(src.channels() != 2); + CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0); + if((dst.cols!=dst.wholecols) || (dst.rows!=dst.wholerows))//has roi + { + if(((bordertype & cv::BORDER_ISOLATED) == 0) && + (bordertype != cv::BORDER_CONSTANT) && + (bordertype != cv::BORDER_REPLICATE)) + { + CV_Error(CV_StsBadArg,"unsupported border type"); + } + } + bordertype &= ~cv::BORDER_ISOLATED; + if((bordertype == cv::BORDER_REFLECT) || (bordertype == cv::BORDER_WRAP)) + { + CV_Assert((src.cols>=left) && (src.cols>=right) && (src.rows >= top) && (src.rows >= bottom)); + } + if(bordertype == cv::BORDER_REFLECT_101) + { + CV_Assert((src.cols>left) && (src.cols>right) && (src.rows > top) && (src.rows > bottom)); + } + dst.create(src.rows + top + bottom, src.cols + left + right, src.type()); int srcStep = src.step1() / src.channels(); int dstStep = dst.step1() / dst.channels(); - int srcOffset = src.offset / src.channels() / src.elemSize1(); - int dstOffset = dst.offset / dst.channels() / dst.elemSize1(); - - int D = src.depth(); - int V32 = *(int *)nVal; - char V8 = *(char *)nVal; - if(src.channels() == 4) - { - unsigned int v = 0x01020408; - char *pv = (char *)(&v); - uchar *pnVal = (uchar *)(nVal); - if(((*pv) & 0x01) != 0) - V32 = (pnVal[0] << 24) + (pnVal[1] << 16) + (pnVal[2] << 8) + (pnVal[3]); - else - V32 = (pnVal[3] << 24) + (pnVal[2] << 16) + (pnVal[1] << 8) + (pnVal[0]); - - srcStep = src.step / 4; - dstStep = dst.step / 4; - - D = 4; - } - - Context *clCxt = src.clCxt; - string kernelName = "copyConstBorder"; - if(boardtype == BORDER_REPLICATE) - kernelName = "copyReplicateBorder"; - else if(boardtype == BORDER_REFLECT_101) - kernelName = "copyReflectBorder"; - + int srcOffset = src.offset / src.elemSize(); + int dstOffset = dst.offset / dst.elemSize(); + int __bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101}; + const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"}; + int bordertype_index; + for(bordertype_index=0;bordertype_index > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset)); args.push_back( make_pair( sizeof(cl_int), (void *)&top)); args.push_back( make_pair( sizeof(cl_int), (void *)&left)); - if(D == 0) - args.push_back( make_pair( sizeof(uchar), (void *)&V8)); - else - args.push_back( make_pair( sizeof(int), (void *)&V32)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); - - size_t globalThreads[3] = {((dst.cols + 6) / 4 * dst.rows + 255) / 256 * 256, 1, 1}; - size_t localThreads[3] = {256, 1, 1}; - - openCLExecuteKernel(clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, 1, D); -/* uchar* cputemp=new uchar[32*dst.wholerows]; - //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; - openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE, - 0, 32*dst.wholerows, cputemp, 0, NULL, NULL)); - for(int i=0;i(scalar.val[0]); + val.uval.s[1] = saturate_cast(scalar.val[1]); + val.uval.s[2] = saturate_cast(scalar.val[2]); + val.uval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=uchar -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] )); + if(((dst.offset & 3) ==0) && ((dst.cols & 3) == 0)) + { + kernelName = "copymakeborder_C1_D0"; + globalThreads[0] = (dst.cols/4 + localThreads[0]-1) / localThreads[0] * localThreads[0]; + } + break; + case 4: + sprintf(compile_option, "-D GENTYPE=uchar4 -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval )); + break; + default: + CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); + } + break; + case CV_8S: + val.cval.s[0] = saturate_cast(scalar.val[0]); + val.cval.s[1] = saturate_cast(scalar.val[1]); + val.cval.s[2] = saturate_cast(scalar.val[2]); + val.cval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=char -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=char4 -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval )); + break; + default: + CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); + } + break; + case CV_16U: + val.usval.s[0] = saturate_cast(scalar.val[0]); + val.usval.s[1] = saturate_cast(scalar.val[1]); + val.usval.s[2] = saturate_cast(scalar.val[2]); + val.usval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=ushort -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=ushort4 -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval )); + break; + default: + CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); + } + break; + case CV_16S: + val.shval.s[0] = saturate_cast(scalar.val[0]); + val.shval.s[1] = saturate_cast(scalar.val[1]); + val.shval.s[2] = saturate_cast(scalar.val[2]); + val.shval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=short -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=short4 -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval )); + break; + default: + CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); + } + break; + case CV_32S: + val.ival.s[0] = saturate_cast(scalar.val[0]); + val.ival.s[1] = saturate_cast(scalar.val[1]); + val.ival.s[2] = saturate_cast(scalar.val[2]); + val.ival.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=int -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); + break; + case 2: + sprintf(compile_option, "-D GENTYPE=int2 -D %s",borderstr[bordertype_index]); + cl_int2 i2val; + i2val.s[0] = val.ival.s[0]; + i2val.s[1] = val.ival.s[1]; + args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=int4 -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); + break; + default: + CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); + } + break; + case CV_32F: + val.fval.s[0] = scalar.val[0]; + val.fval.s[1] = scalar.val[1]; + val.fval.s[2] = scalar.val[2]; + val.fval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=float -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=float4 -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval )); + break; + default: + CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); + } + break; + case CV_64F: + val.dval.s[0] = scalar.val[0]; + val.dval.s[1] = scalar.val[1]; + val.dval.s[2] = scalar.val[2]; + val.dval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=double -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=double4 -D %s",borderstr[bordertype_index]); + args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval )); + break; + default: + CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); + } + break; + default: + CV_Error(CV_StsUnsupportedFormat,"unknown depth"); } - cout<= 0 && bottom >= 0 && left >= 0 && right >= 0); - - dst.create(src.rows + top + bottom, src.cols + left + right, src.type()); - switch (src.type()) - { - case CV_8UC1: - { - uchar nVal = cvRound(value[0]); - copyMakeBorder( src, dst, top, left, boardtype, &nVal); - break; - } - case CV_8UC4: - { - uchar nVal[] = {(uchar)value[0], (uchar)value[1], (uchar)value[2], (uchar)value[3]}; - copyMakeBorder( src, dst, top, left, boardtype, nVal); - break; - } - case CV_32SC1: - { - int nVal = cvRound(value[0]); - copyMakeBorder( src, dst, top, left, boardtype, &nVal); - break; - } - default: - CV_Error(CV_StsUnsupportedFormat, "Unsupported source type"); - } + openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1,compile_option); + //uchar* cputemp=new uchar[32*dst.wholerows]; + ////int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; + //openCLSafeCall(clEnqueueReadBuffer(src.clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE, + // 0, 32*dst.wholerows, cputemp, 0, NULL, NULL)); + //for(int i=0;iimpl->clContext, CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st ); - openCLVerifyCall(st); - openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0)); + if(src.clCxt -> impl -> double_support != 0) + { + cl_int st; + coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st ); + openCLVerifyCall(st); + openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0)); + }else{ + cl_int st; + for(int m=0;m<2;m++) + for(int n=0;n<3;n++) + { + float_coeffs[m][n]=coeffs[m][n]; + } + coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st ); + openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0)); + + } //TODO: improve this kernel size_t blkSizeX = 16, blkSizeY = 16; size_t glbSizeX; @@ -853,31 +1000,46 @@ namespace cv void warpPerspective_gpu(const oclMat &src, oclMat &dst, double coeffs[3][3], int interpolation) { - CV_Assert( (src.channels() == dst.channels()) ); + CV_Assert( (src.channels() == dst.channels()) ); int srcStep = src.step1(); int dstStep = dst.step1(); + float float_coeffs[3][3]; + cl_mem coeffs_cm; Context *clCxt = src.clCxt; string s[3] = {"NN", "Linear", "Cubic"}; string kernelName = "warpPerspective" + s[interpolation]; - cl_int st; - cl_mem coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st ); - openCLVerifyCall(st); - openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0)); - + if(src.clCxt -> impl -> double_support != 0) + { + cl_int st; + coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st ); + openCLVerifyCall(st); + openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0)); + }else{ + cl_int st; + for(int m=0;m<3;m++) + for(int n=0;n<3;n++) + float_coeffs[m][n]=coeffs[m][n]; + + coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st ); + openCLVerifyCall(st); + openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0)); + } //TODO: improve this kernel size_t blkSizeX = 16, blkSizeY = 16; size_t glbSizeX; + size_t cols; if(src.type() == CV_8UC1 && interpolation == 0) { - size_t cols = (dst.cols + dst.offset % 4 + 3) / 4; + cols = (dst.cols + dst.offset % 4 + 3) / 4; glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX; } else /* */ { + cols = dst.cols; glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX; } size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY; @@ -897,6 +1059,7 @@ namespace cv args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset)); args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm)); + args.push_back(make_pair(sizeof(cl_int), (void *)&cols)); openCLExecuteKernel(clCxt, &imgproc_warpPerspective, kernelName, globalThreads, localThreads, args, src.channels(), src.depth()); openCLSafeCall(clReleaseMemObject(coeffs_cm)); @@ -1027,7 +1190,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step)); size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1}; - openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_cols", gt, lt, args, -1, -1); + openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, -1); args.clear(); args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data )); @@ -1037,7 +1200,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset)); size_t gt2[3] = {t_sum.cols * 32, 1, 1}, lt2[3] = {256, 1, 1}; - openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_rows", gt2, lt2, args, -1, -1); + openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, -1); //cout << "tested" << endl; } @@ -1047,37 +1210,26 @@ namespace cv { CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1); double scale = static_cast(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize; - oclMat temp; if (ksize < 0) scale *= 2.; if (src.depth() == CV_8U){ - src.convertTo(temp, (int)CV_32FC1); scale *= 255.; scale = 1. / scale; - if (ksize > 0) - { - Sobel(temp, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); - Sobel(temp, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); - } - else - { - Scharr(temp, Dx, CV_32F, 1, 0, scale, 0, borderType); - Scharr(temp, Dy, CV_32F, 0, 1, scale, 0, borderType); - } }else{ scale = 1. / scale; - if (ksize > 0) - { - Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); - Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); - } - else - { - Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType); - Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType); - } } + if (ksize > 0) + { + Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); + Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); + } + else + { + Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType); + Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType); + } + CV_Assert(Dx.offset == 0 && Dy.offset == 0); } void corner_ocl(const char *src_str, string kernelName, int block_size, float k, oclMat &Dx, oclMat &Dy, @@ -1142,8 +1294,9 @@ namespace cv { CV_Error(CV_GpuNotSupported,"select device don't support double"); } + CV_Assert(src.cols >= blockSize/2 && src.rows >= blockSize/2); oclMat Dx, Dy; - CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); + CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); extractCovData(src, Dx, Dy, blockSize, ksize, borderType); dst.create(src.size(), CV_32F); corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast(k), Dx, Dy, dst, borderType); @@ -1155,8 +1308,9 @@ namespace cv { CV_Error(CV_GpuNotSupported,"select device don't support double"); } + CV_Assert(src.cols >= blockSize/2 && src.rows >= blockSize/2); oclMat Dx, Dy; - CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); + CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); extractCovData(src, Dx, Dy, blockSize, ksize, borderType); dst.create(src.size(), CV_32F); corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, Dx, Dy, dst, borderType); @@ -1204,6 +1358,11 @@ namespace cv if( src.depth() != CV_8U || src.channels() != 4 ) CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" ); + if(src.clCxt->impl->double_support == 0) + { + CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation is exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); + } + dst.create( src.size(), CV_8UC4 ); if( !(criteria.type & TermCriteria::MAX_ITER) ) @@ -1267,6 +1426,11 @@ namespace cv if( src.depth() != CV_8U || src.channels() != 4 ) CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" ); + if(src.clCxt->impl->double_support == 0) + { + CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation is exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); + } + dstr.create( src.size(), CV_8UC4 ); dstsp.create( src.size(), CV_16SC2 ); @@ -1313,15 +1477,25 @@ namespace cv int hist_step = mat_sub_hist.step >> 2; int left_col = 0, right_col = 0; - left_col = dataWidth - (src_offset & mask); - left_col &= mask; - src_offset += left_col; - cols -= left_col; - right_col = cols & mask; - cols -= right_col; + if(cols >= dataWidth*2 -1) + { + left_col = dataWidth - (src_offset & mask); + left_col &= mask; + src_offset += left_col; + cols -= left_col; + right_col = cols & mask; + cols -= right_col; + } + else + { + left_col = cols; + right_col = 0; + cols = 0; + globalThreads[0] = 0; + } vector > args; - if(cols > 0) + if(globalThreads[0] != 0) { int tempcols = cols >> dataWidth_bits; int inc_x = globalThreads[0] % tempcols; @@ -1412,89 +1586,93 @@ namespace cv LUT(mat_src, lut, mat_dst); } //////////////////////////////////bilateralFilter//////////////////////////////////////////////////// +static void +oclbilateralFilter_8u( const oclMat& src, oclMat& dst, int d, + double sigma_color, double sigma_space, + int borderType ) +{ + int cn = src.channels(); + int i, j, k, maxk, radius; + Size size = src.size(); + + CV_Assert( (src.type() == CV_8UC1 || src.download_channels == 3) && + src.type() == dst.type() && src.size() == dst.size() && + src.data != dst.data ); + + if( sigma_color <= 0 ) + sigma_color = 1; + if( sigma_space <= 0 ) + sigma_space = 1; + + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + + if( d <= 0 ) + radius = cvRound(sigma_space*1.5); + else + radius = d/2; + radius = MAX(radius, 1); + d = radius*2 + 1; + + oclMat temp; + copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); + + vector _color_weight(cn*256); + vector _space_weight(d*d); + vector _space_ofs(d*d); + float* color_weight = &_color_weight[0]; + float* space_weight = &_space_weight[0]; + int* space_ofs = &_space_ofs[0]; + + // initialize color-related bilateral filter coefficients + for( i = 0; i < 256*cn; i++ ) + color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); + + // initialize space-related bilateral filter coefficients + for( i = -radius, maxk = 0; i <= radius; i++ ) + for( j = -radius; j <= radius; j++ ) + { + double r = std::sqrt((double)i*i + (double)j*j); + if( r > radius ) + continue; + space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); + space_ofs[maxk++] = (int)(i*temp.step + j*cn); + } + oclMat oclcolor_weight(1,cn*256,CV_32FC1,color_weight); + oclMat oclspace_weight(1,d*d,CV_32FC1,space_weight); + oclMat oclspace_ofs(1,d*d,CV_32SC1,space_ofs); + + string kernelName = "bilateral"; + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { (dst.cols+ localThreads[0]-1)/localThreads[0] * localThreads[0], + (dst.rows+ localThreads[1]-1)/localThreads[1]* localThreads[1], + 1}; + vector > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&temp.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&maxk )); + args.push_back( make_pair( sizeof(cl_int), (void *)&radius )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&temp.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&temp.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&temp.cols )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data )); + openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, -1, -1); +} void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType) { - double sigmacolor = -0.5 / (sigmaclr * sigmaclr); - double sigmaspace = -0.5 / (sigmaspc * sigmaspc); - dst.create(src.size(), src.type()); - Context *clCxt = src.clCxt; - int r = radius; - int d = 2 * r + 1; - - oclMat tmp; - Scalar valu(0, 0, 0, 0); - copyMakeBorder(src, tmp, r, r, r, r, borderType, valu); - - tmp.offset = (src.offset / src.step + r) * tmp.step + (src.offset % src.step + r); - int src_offset = tmp.offset; - int channels = tmp.channels(); - int rows = src.rows;//in pixel - int cols = src.cols;//in pixel - //int step = tmp.step; - int src_step = tmp.step;//in Byte - int dst_step = dst.step;//in Byte - int whole_rows = tmp.wholerows;//in pixel - int whole_cols = tmp.wholecols;//in pixel - int dst_offset = dst.offset;//in Byte - - double rs; - size_t size_space = d * d * sizeof(float); - float *sigSpcH = (float *)malloc(size_space); - for(int i = -r; i <= r; i++) - { - for(int j = -r; j <= r; j++) - { - rs = std::sqrt(double(i * i) + (double)j * j); - - sigSpcH[(i+r)*d+j+r] = rs > r ? 0 : (float)std::exp(rs * rs * sigmaspace); - } - } - - size_t size_color = 256 * channels * sizeof(float); - float *sigClrH = (float *)malloc(size_color); - for(int i = 0; i < 256 * channels; i++) - { - sigClrH[i] = (float)std::exp(i * i * sigmacolor); - } - string kernelName; - if(1 == channels) kernelName = "bilateral"; - if(4 == channels) kernelName = "bilateral4"; - - cl_int errcode_ret; - cl_kernel kernel = openCLGetKernelFromSource(clCxt, &imgproc_bilateral, kernelName); - - CV_Assert(src.channels() == dst.channels()); - - cl_mem sigClr = clCreateBuffer(clCxt->impl->clContext, CL_MEM_USE_HOST_PTR, size_color, sigClrH, &errcode_ret); - cl_mem sigSpc = clCreateBuffer(clCxt->impl->clContext, CL_MEM_USE_HOST_PTR, size_space, sigSpcH, &errcode_ret); - if(errcode_ret != CL_SUCCESS) printf("create buffer error\n"); - openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(void *), (void *)&dst.data)); - openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(void *), (void *)&tmp.data)); - openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(rows), (void *)&rows)); - openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cols), (void *)&cols)); - openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(channels), (void *)&channels)); - openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(radius), (void *)&radius)); - openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(whole_rows), (void *)&whole_rows)); - openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(whole_cols), (void *)&whole_cols)); - openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(src_step), (void *)&src_step)); - openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(dst_step), (void *)&dst_step)); - openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(src_offset), (void *)&src_offset)); - openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(dst_offset), (void *)&dst_offset)); - openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *)&sigClr)); - openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_mem), (void *)&sigSpc)); - - openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, sigClr, CL_TRUE, 0, size_color, sigClrH, 0, NULL, NULL)); - openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, sigSpc, CL_TRUE, 0, size_space, sigSpcH, 0, NULL, NULL)); - - size_t localSize[] = {16, 16}; - size_t globalSize[] = {(cols / 16 + 1) * 16, (rows / 16 + 1) * 16}; - openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL)); - - clFinish(clCxt->impl->clCmdQueue); - openCLSafeCall(clReleaseKernel(kernel)); - free(sigClrH); - free(sigSpcH); + dst.create( src.size(), src.type() ); + if( src.depth() == CV_8U ) + oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType ); + else + CV_Error( CV_StsUnsupportedFormat, + "Bilateral filtering is only implemented for 8uimages" ); } } diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index 008633b..6c3f94b 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -653,7 +653,7 @@ namespace cv #endif void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], size_t localThreads[3], vector< pair > &args, int channels, - int depth, char *build_options) + int depth, const char *build_options) { //construct kernel name //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number @@ -727,7 +727,7 @@ namespace cv } void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], size_t localThreads[3], - vector< pair > &args, int channels, int depth, char *build_options) + vector< pair > &args, int channels, int depth, const char *build_options) { #ifndef PRINT_KERNEL_RUN_TIME diff --git a/modules/ocl/src/kernels/arithm_addWeighted.cl b/modules/ocl/src/kernels/arithm_addWeighted.cl index 4340100..cffcb9a 100644 --- a/modules/ocl/src/kernels/arithm_addWeighted.cl +++ b/modules/ocl/src/kernels/arithm_addWeighted.cl @@ -74,8 +74,17 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); + uchar4 src1_data ,src2_data; + + src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; + src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; + src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; + src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; + + src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; + src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; + src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; + src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; diff --git a/modules/ocl/src/kernels/arithm_div.cl b/modules/ocl/src/kernels/arithm_div.cl index ae4f46a..4a2e4a4 100644 --- a/modules/ocl/src/kernels/arithm_div.cl +++ b/modules/ocl/src/kernels/arithm_div.cl @@ -48,12 +48,12 @@ typedef double F ; typedef double4 F4; #define convert_F4 convert_double4 -#define convert_F convert_double +#define convert_F double #else typedef float F; typedef float4 F4; #define convert_F4 convert_float4 -#define convert_F convert_float +#define convert_F float #endif uchar round2_uchar(F v){ @@ -229,7 +229,7 @@ __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, int data1 = *((__global int *)((__global char *)src1 + src1_index)); int data2 = *((__global int *)((__global char *)src2 + src2_index)); - F tmp = convert_F(data1) * scalar; + F tmp = (convert_F)(data1) * scalar; int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2)); *((__global int *)((__global char *)dst + dst_index)) =tmp_data; @@ -253,7 +253,7 @@ __kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offse float data1 = *((__global float *)((__global char *)src1 + src1_index)); float data2 = *((__global float *)((__global char *)src2 + src2_index)); - F tmp = convert_F(data1) * scalar; + F tmp = (convert_F)(data1) * scalar; float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2)); *((__global float *)((__global char *)dst + dst_index)) = tmp_data; diff --git a/modules/ocl/src/kernels/arithm_minMaxLoc.cl b/modules/ocl/src/kernels/arithm_minMaxLoc.cl index 6937630..ecdaedf 100644 --- a/modules/ocl/src/kernels/arithm_minMaxLoc.cl +++ b/modules/ocl/src/kernels/arithm_minMaxLoc.cl @@ -46,65 +46,65 @@ /**************************************PUBLICFUNC*************************************/ #if defined (DOUBLE_SUPPORT) #pragma OPENCL EXTENSION cl_khr_fp64:enable -#define RES_TYPE double8 -#define CONVERT_RES_TYPE convert_double8 +#define RES_TYPE double4 +#define CONVERT_RES_TYPE convert_double4 #else -#define RES_TYPE float8 -#define CONVERT_RES_TYPE convert_float8 +#define RES_TYPE float4 +#define CONVERT_RES_TYPE convert_float4 #endif #if defined (DEPTH_0) -#define VEC_TYPE uchar8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_uchar8 -#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c) +#define VEC_TYPE uchar4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_uchar4 +#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c) #define MIN_VAL 0 #define MAX_VAL 255 #endif #if defined (DEPTH_1) -#define VEC_TYPE char8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_char8 -#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c) +#define VEC_TYPE char4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_char4 +#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c) #define MIN_VAL -128 #define MAX_VAL 127 #endif #if defined (DEPTH_2) -#define VEC_TYPE ushort8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_ushort8 -#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c) +#define VEC_TYPE ushort4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_ushort4 +#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c) #define MIN_VAL 0 #define MAX_VAL 65535 #endif #if defined (DEPTH_3) -#define VEC_TYPE short8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_short8 -#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c) +#define VEC_TYPE short4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_short4 +#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c) #define MIN_VAL -32768 #define MAX_VAL 32767 #endif #if defined (DEPTH_4) -#define VEC_TYPE int8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_int8 +#define VEC_TYPE int4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_int4 #define CONDITION_FUNC(a,b,c) ((a) ? b : c) #define MIN_VAL INT_MIN #define MAX_VAL INT_MAX #endif #if defined (DEPTH_5) -#define VEC_TYPE float8 -#define VEC_TYPE_LOC float8 -#define CONVERT_TYPE convert_float8 +#define VEC_TYPE float4 +#define VEC_TYPE_LOC float4 +#define CONVERT_TYPE convert_float4 #define CONDITION_FUNC(a,b,c) ((a) ? b : c) #define MIN_VAL (-FLT_MAX) #define MAX_VAL FLT_MAX #endif #if defined (DEPTH_6) -#define VEC_TYPE double8 -#define VEC_TYPE_LOC double8 -#define CONVERT_TYPE convert_double8 +#define VEC_TYPE double4 +#define VEC_TYPE_LOC double4 +#define CONVERT_TYPE convert_double4 #define CONDITION_FUNC(a,b,c) ((a) ? b : c) #define MIN_VAL (-DBL_MAX) #define MAX_VAL DBL_MAX @@ -122,44 +122,22 @@ #if defined (REPEAT_S3) #define repeat_s(a) a.s0 = a.s3;a.s1 = a.s3;a.s2 = a.s3; #endif -#if defined (REPEAT_S4) -#define repeat_s(a) a.s0 = a.s4;a.s1 = a.s4;a.s2 = a.s4;a.s3 = a.s4; -#endif -#if defined (REPEAT_S5) -#define repeat_s(a) a.s0 = a.s5;a.s1 = a.s5;a.s2 = a.s5;a.s3 = a.s5;a.s4 = a.s5; -#endif -#if defined (REPEAT_S6) -#define repeat_s(a) a.s0 = a.s6;a.s1 = a.s6;a.s2 = a.s6;a.s3 = a.s6;a.s4 = a.s6;a.s5 = a.s6; -#endif -#if defined (REPEAT_S7) -#define repeat_s(a) a.s0 = a.s7;a.s1 = a.s7;a.s2 = a.s7;a.s3 = a.s7;a.s4 = a.s7;a.s5 = a.s7;a.s6 = a.s7; -#endif + #if defined (REPEAT_E0) #define repeat_e(a) a=a; #endif #if defined (REPEAT_E1) -#define repeat_e(a) a.s7 = a.s6; +#define repeat_e(a) a.s3 = a.s2; #endif #if defined (REPEAT_E2) -#define repeat_e(a) a.s7 = a.s5;a.s6 = a.s5; +#define repeat_e(a) a.s3 = a.s1;a.s2 = a.s1; #endif #if defined (REPEAT_E3) -#define repeat_e(a) a.s7 = a.s4;a.s6 = a.s4;a.s5 = a.s4; -#endif -#if defined (REPEAT_E4) -#define repeat_e(a) a.s7 = a.s3;a.s6 = a.s3;a.s5 = a.s3;a.s4 = a.s3; -#endif -#if defined (REPEAT_E5) -#define repeat_e(a) a.s7 = a.s2;a.s6 = a.s2;a.s5 = a.s2;a.s4 = a.s2;a.s3 = a.s2; -#endif -#if defined (REPEAT_E6) -#define repeat_e(a) a.s7 = a.s1;a.s6 = a.s1;a.s5 = a.s1;a.s4 = a.s1;a.s3 = a.s1;a.s2 = a.s1; -#endif -#if defined (REPEAT_E7) -#define repeat_e(a) a.s7 = a.s0;a.s6 = a.s0;a.s5 = a.s0;a.s4 = a.s0;a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0; +#define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0; #endif + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable @@ -179,8 +157,8 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem if(id < elemnum) { temp = src[idx]; - idx_c = idx << 3; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7); + idx_c = idx << 2; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); if(id % cols == 0 ) { repeat_s(temp); @@ -203,13 +181,13 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem minloc = negative; maxloc = negative; } - float8 aaa; + float4 aaa; for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8)) { idx = offset + id + (id / cols) * invalid_cols; temp = src[idx]; - idx_c = idx << 3; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7); + idx_c = idx << 2; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); if(id % cols == 0 ) { repeat_s(temp); @@ -224,8 +202,8 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem maxval = max(maxval,temp); minloc = CONDITION_FUNC(minval == temp, temploc , minloc); maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc); - aaa= convert_float8(maxval == temp); - maxloc = convert_int8(aaa) ? temploc : maxloc; + aaa= convert_float4(maxval == temp); + maxloc = convert_int4(aaa) ? temploc : maxloc; } if(lid > 127) { @@ -278,47 +256,25 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem #if defined (REPEAT_S3) #define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0; #endif -#if defined (REPEAT_S4) -#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0; -#endif -#if defined (REPEAT_S5) -#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0; -#endif -#if defined (REPEAT_S6) -#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0; -#endif -#if defined (REPEAT_S7) -#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0; -#endif #if defined (REPEAT_E0) #define repeat_me(a) a = a; #endif #if defined (REPEAT_E1) -#define repeat_me(a) a.s7 = 0; +#define repeat_me(a) a.s3 = 0; #endif #if defined (REPEAT_E2) -#define repeat_me(a) a.s7 = 0;a.s6 = 0; +#define repeat_me(a) a.s3 = 0;a.s2 = 0; #endif #if defined (REPEAT_E3) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0; -#endif -#if defined (REPEAT_E4) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0; -#endif -#if defined (REPEAT_E5) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0; -#endif -#if defined (REPEAT_E6) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0; -#endif -#if defined (REPEAT_E7) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0; +#define repeat_me(a) a.s3 = 0;a.s2 = 0;a.s1 = 0; #endif + /**************************************Array minMaxLoc mask**************************************/ +/* __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global VEC_TYPE *src, - int minvalid_cols,int moffset,__global uchar8 *mask,__global RES_TYPE *dst) + int minvalid_cols,int moffset,__global uchar4 *mask,__global RES_TYPE *dst) { unsigned int lid = get_local_id(0); unsigned int gid = get_group_id(0); @@ -333,8 +289,8 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int { temp = src[idx]; m_temp = CONVERT_TYPE(mask[midx]); - int idx_c = idx << 3; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7); + int idx_c = idx << 2; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); if(id % cols == 0 ) { repeat_ms(m_temp); @@ -363,8 +319,8 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int midx = moffset + id + (id / cols) * minvalid_cols; temp = src[idx]; m_temp = CONVERT_TYPE(mask[midx]); - int idx_c = idx << 3; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7); + int idx_c = idx << 2; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); if(id % cols == 0 ) { repeat_ms(m_temp); @@ -421,3 +377,4 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int } } +*/ \ No newline at end of file diff --git a/modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl b/modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl index 21cd4c1..0b9f796 100644 --- a/modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl +++ b/modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl @@ -46,125 +46,101 @@ /**************************************PUBLICFUNC*************************************/ #if defined (DOUBLE_SUPPORT) #pragma OPENCL EXTENSION cl_khr_fp64:enable -#define RES_TYPE double8 -#define CONVERT_RES_TYPE convert_double8 +#define RES_TYPE double4 +#define CONVERT_RES_TYPE convert_double4 #else -#define RES_TYPE float8 -#define CONVERT_RES_TYPE convert_float8 +#define RES_TYPE float4 +#define CONVERT_RES_TYPE convert_float4 #endif #if defined (DEPTH_0) #define TYPE uchar -#define VEC_TYPE uchar8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_uchar8 -#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c) +#define VEC_TYPE uchar4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_uchar4 +#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c) #define MIN_VAL 0 #define MAX_VAL 255 #endif #if defined (DEPTH_1) #define TYPE char -#define VEC_TYPE char8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_char8 -#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c) +#define VEC_TYPE char4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_char4 +#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c) #define MIN_VAL -128 #define MAX_VAL 127 #endif #if defined (DEPTH_2) #define TYPE ushort -#define VEC_TYPE ushort8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_ushort8 -#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c) +#define VEC_TYPE ushort4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_ushort4 +#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c) #define MIN_VAL 0 #define MAX_VAL 65535 #endif #if defined (DEPTH_3) #define TYPE short -#define VEC_TYPE short8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_short8 -#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c) +#define VEC_TYPE short4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_short4 +#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c) #define MIN_VAL -32768 #define MAX_VAL 32767 #endif #if defined (DEPTH_4) #define TYPE int -#define VEC_TYPE int8 -#define VEC_TYPE_LOC int8 -#define CONVERT_TYPE convert_int8 +#define VEC_TYPE int4 +#define VEC_TYPE_LOC int4 +#define CONVERT_TYPE convert_int4 #define CONDITION_FUNC(a,b,c) ((a) ? b : c) #define MIN_VAL INT_MIN #define MAX_VAL INT_MAX #endif #if defined (DEPTH_5) #define TYPE float -#define VEC_TYPE float8 -#define VEC_TYPE_LOC float8 -#define CONVERT_TYPE convert_float8 +#define VEC_TYPE float4 +#define VEC_TYPE_LOC float4 +#define CONVERT_TYPE convert_float4 #define CONDITION_FUNC(a,b,c) ((a) ? b : c) #define MIN_VAL (-FLT_MAX) #define MAX_VAL FLT_MAX #endif #if defined (DEPTH_6) #define TYPE double -#define VEC_TYPE double8 -#define VEC_TYPE_LOC double8 -#define CONVERT_TYPE convert_double8 +#define VEC_TYPE double4 +#define VEC_TYPE_LOC double4 +#define CONVERT_TYPE convert_double4 #define CONDITION_FUNC(a,b,c) ((a) ? b : c) #define MIN_VAL (-DBL_MAX) #define MAX_VAL DBL_MAX #endif #if defined (REPEAT_E0) -#define repeat_e(a) a = a; +#define repeat_e(a) a=a; #endif #if defined (REPEAT_E1) -#define repeat_e(a) a.s7 = a.s6; +#define repeat_e(a) a.s3 = a.s2; #endif #if defined (REPEAT_E2) -#define repeat_e(a) a.s7 = a.s5;a.s6 = a.s5; +#define repeat_e(a) a.s3 = a.s1;a.s2 = a.s1; #endif #if defined (REPEAT_E3) -#define repeat_e(a) a.s7 = a.s4;a.s6 = a.s4;a.s5 = a.s4; -#endif -#if defined (REPEAT_E4) -#define repeat_e(a) a.s7 = a.s3;a.s6 = a.s3;a.s5 = a.s3;a.s4 = a.s3; -#endif -#if defined (REPEAT_E5) -#define repeat_e(a) a.s7 = a.s2;a.s6 = a.s2;a.s5 = a.s2;a.s4 = a.s2;a.s3 = a.s2; -#endif -#if defined (REPEAT_E6) -#define repeat_e(a) a.s7 = a.s1;a.s6 = a.s1;a.s5 = a.s1;a.s4 = a.s1;a.s3 = a.s1;a.s2 = a.s1; -#endif -#if defined (REPEAT_E7) -#define repeat_e(a) a.s7 = a.s0;a.s6 = a.s0;a.s5 = a.s0;a.s4 = a.s0;a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0; +#define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0; #endif #if defined (REPEAT_E0) #define repeat_me(a) a = a; #endif #if defined (REPEAT_E1) -#define repeat_me(a) a.s7 = 0; +#define repeat_me(a) a.s3 = 0; #endif #if defined (REPEAT_E2) -#define repeat_me(a) a.s7 = 0;a.s6 = 0; +#define repeat_me(a) a.s3 = 0;a.s2 = 0; #endif #if defined (REPEAT_E3) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0; -#endif -#if defined (REPEAT_E4) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0; -#endif -#if defined (REPEAT_E5) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0; -#endif -#if defined (REPEAT_E6) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0; -#endif -#if defined (REPEAT_E7) -#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0; +#define repeat_me(a) a.s3 = 0;a.s2 = 0;a.s1 = 0; #endif /**************************************Array minMaxLoc mask**************************************/ @@ -182,10 +158,10 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0; if(id < elemnum) { - temp = vload8(idx, &src[offset]); - m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset])); - int idx_c = (idx << 3) + offset; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7); + temp = vload4(idx, &src[offset]); + m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); + int idx_c = (idx << 2) + offset; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); if(id % cols == cols - 1) { repeat_me(m_temp); @@ -207,10 +183,10 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int { idx = id + (id / cols) * invalid_cols; midx = id + (id / cols) * minvalid_cols; - temp = vload8(idx, &src[offset]); - m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset])); - int idx_c = (idx << 3) + offset; - temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7); + temp = vload4(idx, &src[offset]); + m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset])); + int idx_c = (idx << 2) + offset; + temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3); if(id % cols == cols - 1) { repeat_me(m_temp); diff --git a/modules/ocl/src/kernels/arithm_mul.cl b/modules/ocl/src/kernels/arithm_mul.cl index 4465651..be25cf2 100644 --- a/modules/ocl/src/kernels/arithm_mul.cl +++ b/modules/ocl/src/kernels/arithm_mul.cl @@ -92,8 +92,17 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); + uchar4 src1_data ,src2_data; + + src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; + src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; + src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; + src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; + + src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; + src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; + src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; + src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data); diff --git a/modules/ocl/src/kernels/arithm_pow.cl b/modules/ocl/src/kernels/arithm_pow.cl index a86a4fa..a3b81e0 100644 --- a/modules/ocl/src/kernels/arithm_pow.cl +++ b/modules/ocl/src/kernels/arithm_pow.cl @@ -45,13 +45,19 @@ #if defined (DOUBLE_SUPPORT) #pragma OPENCL EXTENSION cl_khr_fp64:enable +typedef double F; +typedef double4 F4; +#define convert_F4 convert_double4; +#else +typedef float F; +typedef float4 F4; +#define convert_F4 convert_float4; #endif /************************************** pow **************************************/ -#if defined (DOUBLE_SUPPORT) __kernel void arithm_pow_D5 (__global float *src1, int src1_step, int src1_offset, __global float *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1, - double p) + F p) { int x = get_global_id(0); @@ -69,14 +75,12 @@ __kernel void arithm_pow_D5 (__global float *src1, int src1_step, int src1_offse } } -#endif - #if defined (DOUBLE_SUPPORT) __kernel void arithm_pow_D6 (__global double *src1, int src1_step, int src1_offset, __global double *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1, - double p) + F p) { int x = get_global_id(0); @@ -94,4 +98,3 @@ __kernel void arithm_pow_D6 (__global double *src1, int src1_step, int src1_offs } #endif - diff --git a/modules/ocl/src/kernels/convertC3C4.cl b/modules/ocl/src/kernels/convertC3C4.cl index 24b5312..780aefc 100644 --- a/modules/ocl/src/kernels/convertC3C4.cl +++ b/modules/ocl/src/kernels/convertC3C4.cl @@ -123,7 +123,6 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY int4 outaddr = mul24(id>>2 , 3); outaddr.y++; outaddr.z+=2; - //printf("%d ",outaddr.z); if(outaddr.z <= pixel_end) { dst[outaddr.x] = pixel0; diff --git a/modules/ocl/src/kernels/filtering_boxFilter.cl b/modules/ocl/src/kernels/filtering_boxFilter.cl index d523dda..1d6770d 100644 --- a/modules/ocl/src/kernels/filtering_boxFilter.cl +++ b/modules/ocl/src/kernels/filtering_boxFilter.cl @@ -238,7 +238,9 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch int startY = (gY << 1) - anY + src_y_off; int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4; + //int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4; + + int end_addr = src_whole_cols-4; uint4 data[ksY+1]; __local uint4 temp[2][THREADS]; #ifdef BORDER_CONSTANT @@ -247,8 +249,13 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); - ss = convert_uint4(src[cur_addr]); + + //int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); + //ss = convert_uint4(src[cur_addr]); + + int cur_col = clamp(startX + col, 0, src_whole_cols); + ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]); + data[i] = con ? ss : 0; } #else @@ -327,8 +334,12 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); - ss = src[cur_addr]; + // int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); + // ss = src[cur_addr]; + + int cur_col = clamp(startX + col, 0, src_whole_cols); + ss = src[(startY+i)*(src_step>>2) + cur_col]; + data[i] = con ? ss : 0.f; } #else @@ -407,8 +418,12 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr); - ss = src[cur_addr]; + //int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr); + //ss = src[cur_addr]; + + int cur_col = clamp(startX + col, 0, src_whole_cols); + ss = src[(startY+i)*(src_step>>4) + cur_col]; + data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0); } #else diff --git a/modules/ocl/src/kernels/img_proc.cl b/modules/ocl/src/kernels/img_proc.cl deleted file mode 100644 index 5d2a70b..0000000 --- a/modules/ocl/src/kernels/img_proc.cl +++ /dev/null @@ -1,1331 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Shengen Yan,yanshengen@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif -//wrapAffine kernel -//support four data types: CV_8U, CV_16U, CV_32S, CV_32F, and three interpolation methods: NN, Linear, Cubic. - -#define INTER_BITS 5 -#define INTER_TAB_SIZE (1 << INTER_BITS) -#define AB_BITS max(10, (int)INTER_BITS) -#define AB_SCALE (1 << AB_BITS) -#define INTER_REMAP_COEF_BITS 15 -#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS) - -//this round operation is to approximate CPU's saturate_cast -int round2_int(double v) -{ - int v1=(int)v; - if(((v-v1)==0.5 || (v1-v)==0.5) && (v1%2)==0) - return v1; - else - return convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); -} - -inline void interpolateCubic( float x, float* coeffs ) -{ - const float A = -0.75f; - - coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A; - coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1; - coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1; - coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; -} - -__kernel void warpAffine_8u_NN(__global uchar * src, __global uchar * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - - short sx = (short)(X0 >> AB_BITS); - short sy = (short)(Y0 >> AB_BITS); - for(int c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = (sx >= 0 && sx < cols && sy >= 0 && sy < rows) ? src[sy*srcStep+sx*cn+c] : 0; -} - -__kernel void warpAffine_8u_Linear(__global uchar * src, __global uchar * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - int v[16]; - int i, j, c; - - for(i=0; i<2; i++) - for(j=0; j<2; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - short itab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - - for( i=0; i<2; i++ ) - { - for( j=0; j<2; j++) - { - float v = tab1y[i] * tab1x[j]; - itab[i*2+j] = convert_short_sat(round2_int( v * INTER_REMAP_COEF_SCALE )); - } - } - if( sx+1 < 0 || sx >= cols || sy+1 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - int sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*cn+c] * itab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_uchar_sat ( ((int)sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - -__kernel void warpAffine_8u_Cubic(__global uchar * src, __global uchar * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar v[64]; - int i, j, c; - - for(i=0; i<4; i++) - for(j=0; j<4; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - short itab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - int isum = 0; - for( i=0; i<4; i++ ) - { - for( j=0; j<4; j++) - { - double v = tab1y[i] * tab1x[j]; - isum += itab[i*4+j] = convert_short_sat( round2_int( v * INTER_REMAP_COEF_SCALE ) ); - } - } - if( isum != INTER_REMAP_COEF_SCALE ) - { - int k1, k2, ksize = 4; - int diff = isum - INTER_REMAP_COEF_SCALE; - int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2; - for( k1 = ksize2; k1 < ksize2+2; k1++ ) - for( k2 = ksize2; k2 < ksize2+2; k2++ ) - { - if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - if( diff < 0 ) - itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); - else - itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); - } - - if( sx+4 < 0 || sx >= cols || sy+4 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - int sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<16; i++ ) - { - sum += v[i*cn+c] * itab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_uchar_sat( (int)(sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - -__kernel void warpAffine_16u_NN(__global ushort * src, __global ushort * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - - short sx = (short)(X0 >> AB_BITS); - short sy = (short)(Y0 >> AB_BITS); - for(int c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = (sx >= 0 && sx < cols && sy >= 0 && sy < rows) ? src[sy*srcStep+sx*cn+c] : 0; -} - -__kernel void warpAffine_16u_Linear(__global ushort * src, __global ushort * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - ushort v[16]; - int i, j, c; - - for(i=0; i<2; i++) - for(j=0; j<2; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - - for( i=0; i<2; i++ ) - { - for( j=0; j<2; j++) - { - tab[i*2+j] = tab1y[i] * tab1x[j]; - } - } - if( sx+1 < 0 || sx >= cols || sy+1 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*cn+c] * tab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_ushort_sat( round2_int(sum) ) ; - } - } -} - -__kernel void warpAffine_16u_Cubic(__global ushort * src, __global ushort * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - ushort v[64]; - int i, j, c; - - for(i=0; i<4; i++) - for(j=0; j<4; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - for( i=0; i<4; i++ ) - { - for( j=0; j<4; j++) - { - tab[i*4+j] = tab1y[i] * tab1x[j]; - } - } - - int width = cols-3>0 ? cols-3 : 0; - int height = rows-3>0 ? rows-3 : 0; - if((unsigned)sx < width && (unsigned)sy < height ) - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*4*cn+c] * tab[i*4] + v[i*4*cn+c+1]*tab[i*4+1] - +v[i*4*cn+c+2] * tab[i*4+2] + v[i*4*cn+c+3]*tab[i*4+3]; - } - dst[dy*dstStep+dx*cn+c] = convert_ushort_sat( round2_int(sum )); - } - } - else if( sx+4 < 0 || sx >= cols || sy+4 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<16; i++ ) - { - sum += v[i*cn+c] * tab[i]; - } - dst[dy*dstStep+dx*cn+c] = convert_ushort_sat( round2_int(sum )); - } - } -} - - -__kernel void warpAffine_32s_NN(__global int * src, __global int * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - - short sx = (short)(X0 >> AB_BITS); - short sy = (short)(Y0 >> AB_BITS); - for(int c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = (sx >= 0 && sx < cols && sy >= 0 && sy < rows) ? src[sy*srcStep+sx*cn+c] : 0; -} - -__kernel void warpAffine_32s_Linear(__global int * src, __global int * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - int v[16]; - int i, j, c; - - for(i=0; i<2; i++) - for(j=0; j<2; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - - for( i=0; i<2; i++ ) - { - for( j=0; j<2; j++) - { - tab[i*2+j] = tab1y[i] * tab1x[j]; - } - } - if( sx+1 < 0 || sx >= cols || sy+1 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*cn+c] * tab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_int_sat( round2_int(sum) ) ; - } - } -} - -__kernel void warpAffine_32s_Cubic(__global int * src, __global int * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - int v[64]; - int i, j, c; - - for(i=0; i<4; i++) - for(j=0; j<4; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - for( i=0; i<4; i++ ) - { - for( j=0; j<4; j++) - { - tab[i*4+j] = tab1y[i] * tab1x[j]; - } - } - - if( sx+4 < 0 || sx >= cols || sy+4 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<16; i++ ) - { - sum += v[i*cn+c] * tab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_int_sat( round2_int(sum )); - } - } -} - - -__kernel void warpAffine_32f_NN(__global float * src, __global float * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - - short sx = (short)(X0 >> AB_BITS); - short sy = (short)(Y0 >> AB_BITS); - for(int c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = (sx >= 0 && sx < cols && sy >= 0 && sy < rows) ? src[sy*srcStep+sx*cn+c] : 0; -} - -__kernel void warpAffine_32f_Linear(__global float * src, __global float * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - float v[16]; - int i, j, c; - - for(i=0; i<2; i++) - for(j=0; j<2; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - - for( i=0; i<2; i++ ) - { - for( j=0; j<2; j++) - { - tab[i*2+j] = tab1y[i] * tab1x[j]; - } - } - if( sx+1 < 0 || sx >= cols || sy+1 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*cn+c] * tab[i] ; - } - dst[dy*dstStep+dx*cn+c] = sum ; - } - } -} - -__kernel void warpAffine_32f_Cubic(__global float * src, __global float * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - int X0 = round2_int(M[0] * dx * AB_SCALE); - int Y0 = round2_int(M[3] * dx * AB_SCALE); - X0 += round2_int((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += round2_int((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - float v[64]; - int i, j, c; - - for(i=0; i<4; i++) - for(j=0; j<4; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - for( i=0; i<4; i++ ) - { - for( j=0; j<4; j++) - { - tab[i*4+j] = tab1y[i] * tab1x[j]; - } - } - int width = cols-3>0 ? cols-3 : 0; - int height = rows-3>0 ? rows-3 : 0; - if((unsigned)sx < width && (unsigned)sy < height ) - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*4*cn+c] * tab[i*4] + v[i*4*cn+c+1]*tab[i*4+1] - +v[i*4*cn+c+2] * tab[i*4+2] + v[i*4*cn+c+3]*tab[i*4+3]; - } - dst[dy*dstStep+dx*cn+c] = sum; - } - } - else if( sx+4 < 0 || sx >= cols || sy+4 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<16; i++ ) - { - sum += v[i*cn+c] * tab[i]; - } - dst[dy*dstStep+dx*cn+c] = sum; - } - } -} - -__kernel void warpPerspective_8u_NN(__global uchar * src, __global uchar * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? 1./W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - for(int c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = (sx >= 0 && sx < cols && sy >= 0 && sy < rows) ? src[sy*srcStep+sx*cn+c] : 0; -} - -__kernel void warpPerspective_8u_Linear(__global uchar * src, __global uchar * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? INTER_TAB_SIZE/W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar v[16]; - int i, j, c; - - for(i=0; i<2; i++) - for(j=0; j<2; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - short itab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - - for( i=0; i<2; i++ ) - { - for( j=0; j<2; j++) - { - float v = tab1y[i] * tab1x[j]; - itab[i*2+j] = convert_short_sat(round2_int( v * INTER_REMAP_COEF_SCALE )); - } - } - if( sx+1 < 0 || sx >= cols || sy+1 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - int sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*cn+c] * itab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_uchar_sat ( round2_int(sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - -__kernel void warpPerspective_8u_Cubic(__global uchar * src, __global uchar * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? INTER_TAB_SIZE/W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar v[64]; - int i, j, c; - - for(i=0; i<4; i++) - for(j=0; j<4; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - short itab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - int isum = 0; - for( i=0; i<4; i++ ) - { - for( j=0; j<4; j++) - { - double v = tab1y[i] * tab1x[j]; - isum += itab[i*4+j] = convert_short_sat( round2_int( v * INTER_REMAP_COEF_SCALE ) ); - } - } - if( isum != INTER_REMAP_COEF_SCALE ) - { - int k1, k2, ksize = 4; - int diff = isum - INTER_REMAP_COEF_SCALE; - int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2; - for( k1 = ksize2; k1 < ksize2+2; k1++ ) - for( k2 = ksize2; k2 < ksize2+2; k2++ ) - { - if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - if( diff < 0 ) - itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); - else - itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); - } - - if( sx+4 < 0 || sx >= cols || sy+4 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - int sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<16; i++ ) - { - sum += v[i*cn+c] * itab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_uchar_sat( round2_int(sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - -__kernel void warpPerspective_16u_NN(__global ushort * src, __global ushort * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? 1./W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - for(int c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = (sx >= 0 && sx < cols && sy >= 0 && sy < rows) ? src[sy*srcStep+sx*cn+c] : 0; -} - -__kernel void warpPerspective_16u_Linear(__global ushort * src, __global ushort * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? INTER_TAB_SIZE/W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - ushort v[16]; - int i, j, c; - - for(i=0; i<2; i++) - for(j=0; j<2; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - - for( i=0; i<2; i++ ) - { - for( j=0; j<2; j++) - { - tab[i*2+j] = tab1y[i] * tab1x[j]; - } - } - if( sx+1 < 0 || sx >= cols || sy+1 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*cn+c] * tab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_ushort_sat( round2_int(sum) ) ; - } - } -} - -__kernel void warpPerspective_16u_Cubic(__global ushort * src, __global ushort * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? INTER_TAB_SIZE/W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - ushort v[64]; - int i, j, c; - - for(i=0; i<4; i++) - for(j=0; j<4; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - for( i=0; i<4; i++ ) - { - for( j=0; j<4; j++) - { - tab[i*4+j] = tab1y[i] * tab1x[j]; - } - } - - int width = cols-3>0 ? cols-3 : 0; - int height = rows-3>0 ? rows-3 : 0; - if((unsigned)sx < width && (unsigned)sy < height ) - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*4*cn+c] * tab[i*4] + v[i*4*cn+c+1]*tab[i*4+1] - +v[i*4*cn+c+2] * tab[i*4+2] + v[i*4*cn+c+3]*tab[i*4+3]; - } - dst[dy*dstStep+dx*cn+c] = convert_ushort_sat( round2_int(sum )); - } - } - else if( sx+4 < 0 || sx >= cols || sy+4 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<16; i++ ) - { - sum += v[i*cn+c] * tab[i]; - } - dst[dy*dstStep+dx*cn+c] = convert_ushort_sat( round2_int(sum )); - } - } -} - - -__kernel void warpPerspective_32s_NN(__global int * src, __global int * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? 1./W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - for(int c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = (sx >= 0 && sx < cols && sy >= 0 && sy < rows) ? src[sy*srcStep+sx*cn+c] : 0; -} - -__kernel void warpPerspective_32s_Linear(__global int * src, __global int * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? INTER_TAB_SIZE/W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - int v[16]; - int i, j, c; - - for(i=0; i<2; i++) - for(j=0; j<2; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - - for( i=0; i<2; i++ ) - { - for( j=0; j<2; j++) - { - tab[i*2+j] = tab1y[i] * tab1x[j]; - } - } - if( sx+1 < 0 || sx >= cols || sy+1 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*cn+c] * tab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_int_sat( round2_int(sum) ) ; - } - } -} - -__kernel void warpPerspective_32s_Cubic(__global int * src, __global int * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? INTER_TAB_SIZE/W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - int v[64]; - int i, j, c; - - for(i=0; i<4; i++) - for(j=0; j<4; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - for( i=0; i<4; i++ ) - { - for( j=0; j<4; j++) - { - tab[i*4+j] = tab1y[i] * tab1x[j]; - } - } - - if( sx+4 < 0 || sx >= cols || sy+4 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<16; i++ ) - { - sum += v[i*cn+c] * tab[i] ; - } - dst[dy*dstStep+dx*cn+c] = convert_int_sat( round2_int(sum )); - } - } -} - - -__kernel void warpPerspective_32f_NN(__global float * src, __global float * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? 1./W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - for(int c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = (sx >= 0 && sx < cols && sy >= 0 && sy < rows) ? src[sy*srcStep+sx*cn+c] : 0; -} - -__kernel void warpPerspective_32f_Linear(__global float * src, __global float * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? INTER_TAB_SIZE/W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - float v[16]; - int i, j, c; - - for(i=0; i<2; i++) - for(j=0; j<2; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - - for( i=0; i<2; i++ ) - { - for( j=0; j<2; j++) - { - tab[i*2+j] = tab1y[i] * tab1x[j]; - } - } - if( sx+1 < 0 || sx >= cols || sy+1 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*cn+c] * tab[i] ; - } - dst[dy*dstStep+dx*cn+c] = sum ; - } - } -} - -__kernel void warpPerspective_32f_Cubic(__global float * src, __global float * dst, int cols, int rows, int cn, - int srcStep, int dstStep, __global double * M, int interpolation) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - double X0 = M[0]*dx + M[1]*dy + M[2]; - double Y0 = M[3]*dx + M[4]*dy + M[5]; - double W = M[6]*dx + M[7]*dy + M[8]; - W = W ? INTER_TAB_SIZE/W : 0; - int X = round2_int(X0*W); - int Y = round2_int(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - float v[64]; - int i, j, c; - - for(i=0; i<4; i++) - for(j=0; j<4; j++) - for(c=0; c= 0 && sx+j < cols && sy+i >= 0 && sy+i < rows) ? src[(sy+i) * srcStep + (sx+j)*cn + c] : 0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - for( i=0; i<4; i++ ) - { - for( j=0; j<4; j++) - { - tab[i*4+j] = tab1y[i] * tab1x[j]; - } - } - - int width = cols-3>0 ? cols-3 : 0; - int height = rows-3>0 ? rows-3 : 0; - if((unsigned)sx < width && (unsigned)sy < height ) - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i*4*cn+c] * tab[i*4] + v[i*4*cn+c+1]*tab[i*4+1] - +v[i*4*cn+c+2] * tab[i*4+2] + v[i*4*cn+c+3]*tab[i*4+3]; - } - dst[dy*dstStep+dx*cn+c] = sum; - } - } - else if( sx+4 < 0 || sx >= cols || sy+4 < 0 || sy >= rows) - { - for(c = 0; c < cn; c++) - dst[dy*dstStep+dx*cn+c] = 0; - } - else - { - float sum; - for(c = 0; c < cn; c++) - { - sum = 0; - for ( i =0; i<16; i++ ) - { - sum += v[i*cn+c] * tab[i]; - } - dst[dy*dstStep+dx*cn+c] = sum; - } - } -} -#endif diff --git a/modules/ocl/src/kernels/imgproc_bilateral.cl b/modules/ocl/src/kernels/imgproc_bilateral.cl index 978d677..5bb9379 100644 --- a/modules/ocl/src/kernels/imgproc_bilateral.cl +++ b/modules/ocl/src/kernels/imgproc_bilateral.cl @@ -108,71 +108,38 @@ void bilateral4(__global uchar4 *dst, dst[index_dst] = convert_uchar4_rte(pd); } -__kernel -void bilateral(__global uchar *dst, - __global uchar *src, - int rows, - int cols, - int channels, - int radius, - int wholerows, - int wholecols, - int src_step, - int dst_step, - int src_offset, - int dst_offset, - __constant float *sigClr, - __constant float *sigSpc) -{ - uint lidx = get_local_id(0); - uint lidy = get_local_id(1); - - uint gdx = get_global_id(0); - uint gdy = get_global_id(1); - - uint gidx = gdx >=cols?cols-1:gdx; - uint gidy = gdy >=rows?rows-1:gdy; - - uchar p,q,tmp; - - float pf = 0,pq = 0,wt = 0,pd = 0; - - int r =radius; - int ij = 0; - int ct = 0; - - uint index_src = src_offset + gidy*src_step + gidx; - uint index_dst = dst_offset + gidy*dst_step + gidx; - - p = src[index_src]; - - uint gx,gy; - uint src_index,dst_index; - - for(int ii = -r;ii mul24(radius,radius)) continue; - - gx = gidx + jj; - gy = gidy + ii; - - - src_index = src_offset + gy * src_step + gx; - q = src[src_index]; - - ct = abs(p-q); - wt =sigClr[ct]*sigSpc[(ii+radius)*(2*radius+1)+jj+radius]; - - pf += q*wt; - - pq += wt; - } + int src_addr = mad24(gidy+radius,src_step,gidx+radius); + int dst_addr = mad24(gidy,src_step,gidx+dst_offset); + float sum = 0, wsum = 0; + + int val0 = (int)src[src_addr]; + for(int k = 0; k < maxk; k++ ) + { + int val = (int)src[src_addr + space_ofs[k]]; + float w = space_weight[k]*color_weight[abs(val - val0)]; + sum += (float)(val)*w; + wsum += w; + } + dst[dst_addr] = convert_uchar_rtz(sum/wsum+0.5f); } - pd = pf/pq; - dst[index_dst] = convert_uchar_rte(pd); - } diff --git a/modules/ocl/src/kernels/imgproc_calcHarris.cl b/modules/ocl/src/kernels/imgproc_calcHarris.cl index 6bc85cc..35720c2 100644 --- a/modules/ocl/src/kernels/imgproc_calcHarris.cl +++ b/modules/ocl/src/kernels/imgproc_calcHarris.cl @@ -65,8 +65,8 @@ #define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) #endif -#ifdef BORDER_REFLECT_101 -//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba +#ifdef BORDER_REFLECT101 +//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba #define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) #define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) @@ -95,6 +95,8 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl int col = get_local_id(0); const int gX = get_group_id(0); const int gY = get_group_id(1); + const int glx = get_global_id(0); + const int gly = get_global_id(1); int dx_x_off = (dx_offset % dx_step) >> 2; int dx_y_off = dx_offset / dx_step; @@ -118,10 +120,10 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl for(int i=0; i < ksY+1; i++) { dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; - dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+dx_col)]; + dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; dx_data[i] = dx_con ? dx_s : 0.0; dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; - dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+dy_col)]; + dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; dy_data[i] = dy_con ? dy_s : 0.0; data[0][i] = dx_data[i] * dx_data[i]; data[1][i] = dx_data[i] * dy_data[i]; @@ -144,7 +146,7 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols); dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col); - dy_data[i] = Dy[dx_selected_row * (dy_step>>2) + dy_selected_col]; + dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; data[0][i] = dx_data[i] * dx_data[i]; data[1][i] = dx_data[i] * dy_data[i]; @@ -176,7 +178,7 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl { col += anX; int posX = dst_startX - dst_x_off + col - anX; - int posY = (gY << 1); + int posY = (gly << 1); int till = (ksX + 1)%2; float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; for(int k=0; k<6; k++) diff --git a/modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl b/modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl index 2d4b43f..a7e884f 100644 --- a/modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl +++ b/modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl @@ -65,8 +65,8 @@ #define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) #endif -#ifdef BORDER_REFLECT_101 -//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba +#ifdef BORDER_REFLECT101 +//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba #define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) #define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) #define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) @@ -95,6 +95,8 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, int col = get_local_id(0); const int gX = get_group_id(0); const int gY = get_group_id(1); + const int glx = get_global_id(0); + const int gly = get_global_id(1); int dx_x_off = (dx_offset % dx_step) >> 2; int dx_y_off = dx_offset / dx_step; @@ -118,10 +120,10 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, for(int i=0; i < ksY+1; i++) { dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; - dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+dx_col)]; + dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; dx_data[i] = dx_con ? dx_s : 0.0; dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; - dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+dy_col)]; + dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; dy_data[i] = dy_con ? dy_s : 0.0; data[0][i] = dx_data[i] * dx_data[i]; data[1][i] = dx_data[i] * dy_data[i]; @@ -144,7 +146,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols); dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col); - dy_data[i] = Dy[dx_selected_row * (dy_step>>2) + dy_selected_col]; + dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; data[0][i] = dx_data[i] * dx_data[i]; data[1][i] = dx_data[i] * dy_data[i]; @@ -176,7 +178,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, { col += anX; int posX = dst_startX - dst_x_off + col - anX; - int posY = (gY << 1); + int posY = (gly << 1); int till = (ksX + 1)%2; float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; for(int k=0; k<6; k++) diff --git a/modules/ocl/src/kernels/imgproc_canny.cl b/modules/ocl/src/kernels/imgproc_canny.cl index 59835c3..663c6da 100644 --- a/modules/ocl/src/kernels/imgproc_canny.cl +++ b/modules/ocl/src/kernels/imgproc_canny.cl @@ -43,7 +43,6 @@ // //M*/ -#pragma OPENCL EXTENSION cl_amd_printf : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable @@ -651,7 +650,7 @@ __kernel } __constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; -__constant c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; +__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; #define stack_size 512 __kernel diff --git a/modules/ocl/src/kernels/imgproc_copymakeboder.cl b/modules/ocl/src/kernels/imgproc_copymakeboder.cl index 9270d9a..4777f49 100644 --- a/modules/ocl/src/kernels/imgproc_copymakeboder.cl +++ b/modules/ocl/src/kernels/imgproc_copymakeboder.cl @@ -35,212 +35,166 @@ // -#define get(a,b,c) (( b >= top & b < srcRows+top & a >= left & a < srcCols+left )? c : 8) -__kernel void copyConstBorder_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, - int srcCols, int srcRows, int dstCols, int dstRows, - int top, int left, uchar nVal, int srcStep, int dstStep) -{ - int idx = get_global_id(0); - int tpr = (dstCols + 3 + (dstOffset&3))>>2; - int dx = ((idx%(tpr))<<2) - (dstOffset&3); - int dy = idx/(tpr); - - __global uchar4 * d=(__global uchar4 *)(dst + dstOffset + dy*dstStep + dx); - int start=srcOffset + (dy-top)*srcStep + (dx-left); - uchar8 s=*((__global uchar8 *)(src + ((start>>2)<<2) )); - uchar4 v; - - uchar sv[9]={s.s0,s.s1,s.s2,s.s3,s.s4,s.s5,s.s6,s.s7,nVal}; - - int det=start&3; - v.x=sv[get(dx,dy,det)]; - v.y=sv[get(dx+1,dy,det+1)]; - v.z=sv[get(dx+2,dy,det+2)]; - v.w=sv[get(dx+3,dy,det+3)]; - - if(dy=0 && dx=0 && dx+1=0 && dx+2=0 && dx+3= (r_edge) ? (elem1) : (elem2) +#endif -#define get(a,b,c,d) (( b >= top & b < srcRows+top & a >= left & a < srcCols+left )? c : d) -__kernel void copyConstBorder_C1_D4(__global int * src, __global int * dst, int srcOffset, int dstOffset, - int srcCols, int srcRows, int dstCols, int dstRows, - int top, int left, int nVal, int srcStep, int dstStep) -{ - int idx = get_global_id(0); - int tpr = (dstCols + 3)>>2; - int dx = (idx%(tpr))<<2; - int dy = idx/(tpr); - - __global int4 * d=(__global int4 *)(dst+dy*dstStep+dx); - int4 s=*((__global int4 *)(src + srcOffset + (dy-top)*srcStep + (dx-left) )); - int4 v; - - v.x=get(dx,dy,s.x,nVal); - v.y=get(dx+1,dy,s.y,nVal); - v.z=get(dx+2,dy,s.z,nVal); - v.w=get(dx+3,dy,s.w,nVal); - - if(dy= (r_edge) ? (r_edge)-1 : (addr) +#endif -#define get(a,b,c) ( a < srcCols+left ? b : c) -__kernel void copyReplicateBorder_C1_D4(__global int * src, __global int * dst, int srcOffset, int dstOffset, - int srcCols, int srcRows, int dstCols, int dstRows, - int top, int left, int nVal, int srcStep, int dstStep) -{ - int idx = get_global_id(0); - int tpr = (dstCols + 3)>>2; - int dx = (idx%(tpr))<<2; - int dy = idx/(tpr); +#ifdef BORDER_REFLECT +//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? -(i)-1 : (addr) +#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr) +#endif - __global int4 * d=(__global int4 *)(dst + dstOffset + dy*dstStep + dx); - int c=clamp(dx-left,0,srcCols-1); - int4 s=*((__global int4 *)(src + srcOffset + clamp(dy-top,0,srcRows-1) * srcStep + c )); - int sa[4]={s.x,s.y,s.z,s.w}; - int4 v; - - v.x=get(dx,sa[max(0,(dx-left)-c)],sa[srcCols-1-c]); - v.y=get(dx+1,sa[max(0,(dx+1-left)-c)],sa[srcCols-1-c]); - v.z=get(dx+2,sa[max(0,(dx+2-left)-c)],sa[srcCols-1-c]); - v.w=get(dx+3,sa[max(0,(dx+3-left)-c)],sa[srcCols-1-c]); - - if(dy= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr) +#endif + +#ifdef BORDER_WRAP +//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? (i)+(r_edge) : (addr) +#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (i)-(r_edge) : (addr) +#endif -__kernel void copyReplicateBorder_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, - int srcCols, int srcRows, int dstCols, int dstRows, - int top, int left, uchar nVal, int srcStep, int dstStep) +__kernel void copymakeborder + (__global const GENTYPE *src, + __global GENTYPE *dst, + const int dst_cols, + const int dst_rows, + const int src_cols, + const int src_rows, + const int src_step_in_pixel, + const int src_offset_in_pixel, + const int dst_step_in_pixel, + const int dst_offset_in_pixel, + const int top, + const int left, + const GENTYPE val + ) { - int idx = get_global_id(0); - int tpr = (dstCols + 3 + (dstOffset&3))>>2; - int dx = ((idx%(tpr))<<2) - (dstOffset&3); - int dy = idx/(tpr); - - __global uchar4 * d=(__global uchar4 *)(dst + dstOffset + dy*dstStep + dx); - int c=clamp(dx-left,0,srcCols-1); - int start= srcOffset + clamp(dy-top,0,srcRows-1) * srcStep + c; - uchar8 s=*((__global uchar8 *)(src + ((start>>2)<<2) )); - uchar4 v; - - uchar sa[8]={s.s0,s.s1,s.s2,s.s3,s.s4,s.s5,s.s6,s.s7}; - - int det=start&3; - v.x=get(dx,sa[max(0,(dx-left)-c)+det],sa[srcCols-1-c+det]); - v.y=get(dx+1,sa[max(0,(dx+1-left)-c)+det],sa[srcCols-1-c+det]); - v.z=get(dx+2,sa[max(0,(dx+2-left)-c)+det],sa[srcCols-1-c+det]); - v.w=get(dx+3,sa[max(0,(dx+3-left)-c)+det],sa[srcCols-1-c+det]); - - if(dy= 0) && (src_x < src_cols) && (src_y >= 0) && (src_y < src_rows); + if(con) { - uchar4 res = *d; - res.x = (dx>=0 && dx=0 && dx+1=0 && dx+2=0 && dx+3=size?(size<<1)-2:rx<<1) - rx; -__kernel void copyReflectBorder_C1_D4(__global int * src, __global int * dst, int srcOffset, int dstOffset, - int srcCols, int srcRows, int dstCols, int dstRows, - int top, int left, int nVal, int srcStep, int dstStep) -{ - int idx = get_global_id(0); - int tpr = (dstCols + 3)>>2; - int dx = (idx%(tpr))<<2; - int dy = idx/(tpr); - - __global int4 * d=(__global int4 *)(dst + dstOffset + dy*dstStep + dx); - uint4 id; - edge(dx-left,srcCols,id.x); - edge(dx-left+1,srcCols,id.x); - edge(dx-left+2,srcCols,id.x); - edge(dx-left+3,srcCols,id.x); - - - - int start=min(id.x,id.w); - int4 s=*((__global int4 *)(src + srcOffset + clamp(dy-top,0,srcRows-1) * srcStep + start)); - int sa[4]={s.x,s.y,s.z,s.w}; - - int4 v=(int4)(sa[(id.x-start)],sa[(id.y-start)],sa[(id.z-start)],sa[(id.w-start)]); - - - if(dy>2; - int dx = ((idx%(tpr))<<2) - (dstOffset&3); - int dy = idx/(tpr); - - __global uchar4 * d=(__global uchar4 *)(dst + dstOffset + dy*dstStep + dx); - uint4 id; - edge(dx-left,srcCols,id.x); - edge(dx-left+1,srcCols,id.x); - edge(dx-left+2,srcCols,id.x); - edge(dx-left+3,srcCols,id.x); - - int start=min(id.x,id.w) + srcOffset; - uchar8 s=*((__global uchar8 *)(src + clamp(dy-top,0,srcRows-1) * srcStep + ((start>>2)<<2) )); - uchar sa[8]={s.s0,s.s1,s.s2,s.s3,s.s4,s.s5,s.s6,s.s7}; - - int det=start&3; - uchar4 v=(uchar4)(sa[(id.x-start)+det],sa[(id.y-start)+det],sa[(id.z-start)+det],sa[(id.w-start)+det]); - - if(dy= 0) && (src_x+3 < src_cols) && (src_y >= 0) && (src_y < src_rows); + if(con) { - uchar4 res = *d; - res.x = (dx>=0 && dx=0 && dx+1=0 && dx+2=0 && dx+3=0))||(src_x < src_cols) && (src_x+3 >= src_cols)) && (src_y >= 0) && (src_y < src_rows)) + { + int4 addr; + uchar4 tmp; + addr.x = ((src_x < 0) || (src_x>= src_cols)) ? 0 : src_addr; + addr.y = ((src_x+1 < 0) || (src_x+1>= src_cols)) ? 0 : (src_addr+1); + addr.z = ((src_x+2 < 0) || (src_x+2>= src_cols)) ? 0 : (src_addr+2); + addr.w = ((src_x+3 < 0) || (src_x+3>= src_cols)) ? 0 : (src_addr+3); + tmp.x = src[addr.x]; + tmp.y = src[addr.y]; + tmp.z = src[addr.z]; + tmp.w = src[addr.w]; + tmp.x = (src_x >=0)&&(src_x < src_cols) ? tmp.x : val; + tmp.y = (src_x+1 >=0)&&(src_x +1 < src_cols) ? tmp.y : val; + tmp.z = (src_x+2 >=0)&&(src_x +2 < src_cols) ? tmp.z : val; + tmp.w = (src_x+3 >=0)&&(src_x +3 < src_cols) ? tmp.w : val; + *(__global uchar4*)(dst+dst_addr) = tmp; + } + else if((xleft_col) ? (gidx+cols) : gidx); + gidx = ((gidx>=left_col) ? (gidx+cols) : gidx); int src_index = src_offset + mad24(gidy, src_step, gidx); + barrier(CLK_LOCAL_MEM_FENCE); int p = (int)src[src_index]; + p = gidy >= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p; atomic_inc(subhist + p); barrier(CLK_LOCAL_MEM_FENCE); diff --git a/modules/ocl/src/kernels/imgproc_integral_sum.cl b/modules/ocl/src/kernels/imgproc_integral_sum.cl index 46c5263..519cd6f 100644 --- a/modules/ocl/src/kernels/imgproc_integral_sum.cl +++ b/modules/ocl/src/kernels/imgproc_integral_sum.cl @@ -56,7 +56,7 @@ #define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) -kernel void integral_cols(__global uchar4 *src,__global int *sum , +kernel void integral_sum_cols(__global uchar4 *src,__global int *sum , int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) { unsigned int lid = get_local_id(0); @@ -136,7 +136,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum , } -kernel void integral_rows(__global int4 *srcsum,__global int *sum , +kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum , int rows,int cols,int src_step,int sum_step, int sum_offset) { diff --git a/modules/ocl/src/kernels/imgproc_resize.cl b/modules/ocl/src/kernels/imgproc_resize.cl index abfbfbe..fa13b21 100644 --- a/modules/ocl/src/kernels/imgproc_resize.cl +++ b/modules/ocl/src/kernels/imgproc_resize.cl @@ -138,17 +138,14 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri val2 = mul24(U1 , sdata3) + mul24(U , sdata4); val = mul24((int4)V1 , val1) + mul24((int4)V , val2); - //__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx); - //uchar4 dVal = *d; - //int4 con = ( DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows); val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS); - //*d = convert_uchar4(con != 0) ? convert_uchar4_sat(val) : dVal; pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel); pos4.y++; pos4.z+=2; + pos4.w+=3; uchar4 uval = convert_uchar4_sat(val); - int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows); + int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0); if(con) { *(__global uchar4*)(dst + pos4.x)=uval; @@ -167,6 +164,10 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri { dst[pos4.z]=uval.z; } + if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos4.w]=uval.w; + } } } @@ -325,8 +326,9 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src, pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel); pos.y++; pos.z+=2; + pos.w+=3; - int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows); + int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0); if(con) { *(__global uchar4*)(dst + pos.x)=val; @@ -345,6 +347,10 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src, { dst[pos.z]=val.z; } + if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos.w]=val.w; + } } } diff --git a/modules/ocl/src/kernels/imgproc_warpPerspective.cl b/modules/ocl/src/kernels/imgproc_warpPerspective.cl index b8835d0..af1ebeb 100644 --- a/modules/ocl/src/kernels/imgproc_warpPerspective.cl +++ b/modules/ocl/src/kernels/imgproc_warpPerspective.cl @@ -82,157 +82,166 @@ inline void interpolateCubic( float x, float* coeffs ) ***********************************************************************************************/ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M ) + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - dx = (dx<<2) - (dst_offset&3); - - F4 DX = (F4)(dx, dx+1, dx+2, dx+3); - F4 X0 = M[0]*DX + M[1]*dy + M[2]; - F4 Y0 = M[3]*DX + M[4]*dy + M[5]; - F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0; - W = (W!=zero) ? one/W : zero; - short4 X = convert_short4(rint(X0*W)); - short4 Y = convert_short4(rint(Y0*W)); - int4 sx = convert_int4(X); - int4 sy = convert_int4(Y); - - int4 DXD = (int4)(dx, dx+1, dx+2, dx+3); - __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); - uchar4 dval = *d; - int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows; - int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; - int4 spos = src_offset + sy * srcStep + sx; - uchar4 sval; - sval.s0 = scon.s0 ? src[spos.s0] : 0; - sval.s1 = scon.s1 ? src[spos.s1] : 0; - sval.s2 = scon.s2 ? src[spos.s2] : 0; - sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon != 0) ? sval : dval; - *d = dval; + if( dx < threadCols && dy < dst_rows) + { + dx = (dx<<2) - (dst_offset&3); + + F4 DX = (F4)(dx, dx+1, dx+2, dx+3); + F4 X0 = M[0]*DX + M[1]*dy + M[2]; + F4 Y0 = M[3]*DX + M[4]*dy + M[5]; + F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0; + W = (W!=zero) ? one/W : zero; + short4 X = convert_short4(rint(X0*W)); + short4 Y = convert_short4(rint(Y0*W)); + int4 sx = convert_int4(X); + int4 sy = convert_int4(Y); + + int4 DXD = (int4)(dx, dx+1, dx+2, dx+3); + __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); + uchar4 dval = *d; + int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows; + int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; + int4 spos = src_offset + sy * srcStep + sx; + uchar4 sval; + sval.s0 = scon.s0 ? src[spos.s0] : 0; + sval.s1 = scon.s1 ? src[spos.s1] : 0; + sval.s2 = scon.s2 ? src[spos.s2] : 0; + sval.s3 = scon.s3 ? src[spos.s3] : 0; + dval = convert_uchar4(dcon != 0) ? sval : dval; + *d = dval; + } } __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M ) + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - int sx = (short)(X >> INTER_BITS); - int sy = (short)(Y >> INTER_BITS); - int ay = (short)(Y & (INTER_TAB_SIZE-1)); - int ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar v[4]; - int i; + if( dx < threadCols && dy < dst_rows) + { + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + int sx = (short)(X >> INTER_BITS); + int sy = (short)(Y >> INTER_BITS); + int ay = (short)(Y & (INTER_TAB_SIZE-1)); + int ax = (short)(X & (INTER_TAB_SIZE-1)); + + uchar v[4]; + int i; #pragma unroll 4 - for(i=0; i<4; i++) - v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0; - - short itab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - + for(i=0; i<4; i++) + v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0; + + short itab[4]; + float tab1y[2], tab1x[2]; + tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; + tab1y[1] = 1.f/INTER_TAB_SIZE*ay; + tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; + tab1x[1] = 1.f/INTER_TAB_SIZE*ax; + #pragma unroll 4 - for(i=0; i<4; i++) - { - float v = tab1y[(i>>1)] * tab1x[(i&1)]; - itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE )); - } - if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - int sum = 0; - for ( i =0; i<4; i++ ) + for(i=0; i<4; i++) + { + float v = tab1y[(i>>1)] * tab1x[(i&1)]; + itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE )); + } + if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows) { - sum += v[i] * itab[i] ; + int sum = 0; + for ( i =0; i<4; i++ ) + { + sum += v[i] * itab[i] ; + } + dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; } - dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; } } __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M ) + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar v[16]; - int i, j; - -#pragma unroll 4 - for(i=0; i<4; i++) - for(j=0; j<4; j++) + if( dx < threadCols && dy < dst_rows) { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; - } + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS) - 1; + short sy = (short)(Y >> INTER_BITS) - 1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + uchar v[16]; + int i, j; + +#pragma unroll 4 + for(i=0; i<4; i++) + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; + } - short itab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; + short itab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - - int isum = 0; + ayy = 1.f/INTER_TAB_SIZE * ay; + axx = 1.f/INTER_TAB_SIZE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + + int isum = 0; #pragma unroll 16 - for( i=0; i<16; i++ ) - { - F v = tab1y[(i>>2)] * tab1x[(i&3)]; - isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) ); - } - if( isum != INTER_REMAP_COEF_SCALE ) - { - int k1, k2; - int diff = isum - INTER_REMAP_COEF_SCALE; - int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) - for( k2 = 2; k2 < 4; k2++ ) - { - if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); - } + for( i=0; i<16; i++ ) + { + F v = tab1y[(i>>2)] * tab1x[(i&3)]; + isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) ); + } + if( isum != INTER_REMAP_COEF_SCALE ) + { + int k1, k2; + int diff = isum - INTER_REMAP_COEF_SCALE; + int Mk1=2, Mk2=2, mk1=2, mk2=2; + for( k1 = 2; k1 < 4; k1++ ) + for( k2 = 2; k2 < 4; k2++ ) + { + if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) + mk1 = k1, mk2 = k2; + else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) + Mk1 = k1, Mk2 = k2; + } + diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); + } - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - int sum=0; - for ( i =0; i<16; i++ ) + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) { - sum += v[i] * itab[i] ; + int sum=0; + for ( i =0; i<16; i++ ) + { + sum += v[i] * itab[i] ; + } + dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; } - dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; } } @@ -241,149 +250,159 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M ) + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>2)+sy*(srcStep>>2)+sx] : (uchar4)0; + if( dx < threadCols && dy < dst_rows) + { + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? 1./W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + short sx = (short)X; + short sy = (short)Y; + + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>2)+sy*(srcStep>>2)+sx] : (uchar4)0; + } } __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M ) + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - src_offset = (src_offset>>2); - srcStep = (srcStep>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - - int4 v0, v1, v2, v3; + if( dx < threadCols && dy < dst_rows) + { + src_offset = (src_offset>>2); + srcStep = (srcStep>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS); + short sy = (short)(Y >> INTER_BITS); + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + + int4 v0, v1, v2, v3; - v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0; + v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0; + v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0; + v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0; + v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0; - int itab0, itab1, itab2, itab3; - float taby, tabx; - taby = 1.f/INTER_TAB_SIZE*ay; - tabx = 1.f/INTER_TAB_SIZE*ax; - - itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); - itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE )); - - int4 val; - val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; + int itab0, itab1, itab2, itab3; + float taby, tabx; + taby = 1.f/INTER_TAB_SIZE*ay; + tabx = 1.f/INTER_TAB_SIZE*ax; + + itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); + itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); + itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); + itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE )); - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + int4 val; + val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; + + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; + } } __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M ) + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - src_offset = (src_offset>>2); - srcStep = (srcStep>>2); - dst_offset = (dst_offset>>2); - dstStep = (dstStep>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar4 v[16]; - int i,j; -#pragma unroll 4 - for(i=0; i<4; i++) - for(j=0; j<4; j++) + if( dx < threadCols && dy < dst_rows) { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } - int itab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = INTER_SCALE * ay; - axx = INTER_SCALE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - int isum = 0; - + src_offset = (src_offset>>2); + srcStep = (srcStep>>2); + dst_offset = (dst_offset>>2); + dstStep = (dstStep>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS) - 1; + short sy = (short)(Y >> INTER_BITS) - 1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + uchar4 v[16]; + int i,j; +#pragma unroll 4 + for(i=0; i<4; i++) + for(j=0; j<4; j++) + { + v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; + } + int itab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; + + ayy = INTER_SCALE * ay; + axx = INTER_SCALE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); + int isum = 0; + #pragma unroll 16 - for( i=0; i<16; i++ ) - { - float tmp; - tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE; - itab[i] = rint(tmp); - isum += itab[i]; - } + for( i=0; i<16; i++ ) + { + float tmp; + tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE; + itab[i] = rint(tmp); + isum += itab[i]; + } - if( isum != INTER_REMAP_COEF_SCALE ) - { - int k1, k2; - int diff = isum - INTER_REMAP_COEF_SCALE; - int Mk1=2, Mk2=2, mk1=2, mk2=2; - - for( k1 = 2; k1 < 4; k1++ ) - for( k2 = 2; k2 < 4; k2++ ) - { - - if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; - } + if( isum != INTER_REMAP_COEF_SCALE ) + { + int k1, k2; + int diff = isum - INTER_REMAP_COEF_SCALE; + int Mk1=2, Mk2=2, mk1=2, mk2=2; - diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); - } + for( k1 = 2; k1 < 4; k1++ ) + for( k2 = 2; k2 < 4; k2++ ) + { + + if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) + mk1 = k1, mk2 = k2; + else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) + Mk1 = k1, Mk2 = k2; + } + + diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); + } - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - int4 sum=0; - for ( i =0; i<16; i++ ) + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) { - sum += convert_int4(v[i]) * itab[i]; + int4 sum=0; + for ( i =0; i<16; i++ ) + { + sum += convert_int4(v[i]) * itab[i]; + } + dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; } - dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; } } @@ -393,123 +412,132 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M ) + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx=0 && sy>2)+sy*srcStep+sx] : 0; + if( dx < threadCols && dy < dst_rows) + { + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? 1./W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + short sx = (short)X; + short sy = (short)Y; + + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx=0 && sy>2)+sy*srcStep+sx] : 0; + } } __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M ) + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - src_offset = (src_offset>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - float v0, v1, v2, v3; - - v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0; - - float tab[4]; - float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - taby[1] = 1.f/INTER_TAB_SIZE*ay; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tabx[1] = 1.f/INTER_TAB_SIZE*ax; - - tab[0] = taby[0] * tabx[0]; - tab[1] = taby[0] * tabx[1]; - tab[2] = taby[1] * tabx[0]; - tab[3] = taby[1] * tabx[1]; - - float sum = 0; - sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*dstStep+dx] = sum; + if( dx < threadCols && dy < dst_rows) + { + src_offset = (src_offset>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS); + short sy = (short)(Y >> INTER_BITS); + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + float v0, v1, v2, v3; + + v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0; + v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0; + v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0; + v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0; + + float tab[4]; + float taby[2], tabx[2]; + taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; + taby[1] = 1.f/INTER_TAB_SIZE*ay; + tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; + tabx[1] = 1.f/INTER_TAB_SIZE*ax; + + tab[0] = taby[0] * tabx[0]; + tab[1] = taby[0] * tabx[1]; + tab[2] = taby[1] * tabx[0]; + tab[3] = taby[1] * tabx[1]; + + float sum = 0; + sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>2)+dy*dstStep+dx] = sum; + } } __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M ) + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - src_offset = (src_offset>>2); - dst_offset = (dst_offset>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); + if( dx < threadCols && dy < dst_rows) + { + src_offset = (src_offset>>2); + dst_offset = (dst_offset>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx = (short)(X >> INTER_BITS) - 1; + short sy = (short)(Y >> INTER_BITS) - 1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); - float v[16]; - int i; + float v[16]; + int i; - for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + for(i=0; i<16; i++) + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; + float tab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); + ayy = 1.f/INTER_TAB_SIZE * ay; + axx = 1.f/INTER_TAB_SIZE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); #pragma unroll 4 - for( i=0; i<16; i++ ) - { - tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - float sum = 0; -#pragma unroll 4 - for ( i =0; i<16; i++ ) + for( i=0; i<16; i++ ) { - sum += v[i] * tab[i]; + tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; } - dst[dst_offset+dy*dstStep+dx] = sum; + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + float sum = 0; +#pragma unroll 4 + for ( i =0; i<16; i++ ) + { + sum += v[i] * tab[i]; + } + dst[dst_offset+dy*dstStep+dx] = sum; + } } } @@ -519,130 +547,139 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M ) + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W =(W != 0.0)? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : 0; + if( dx < threadCols && dy < dst_rows) + { + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W =(W != 0.0)? 1./W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + short sx = (short)X; + short sy = (short)Y; + + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : 0; + } } __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M ) + int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - - src_offset = (src_offset>>4); - dst_offset = (dst_offset>>4); - srcStep = (srcStep>>2); - dstStep = (dstStep>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx0 = (short)(X >> INTER_BITS); - short sy0 = (short)(Y >> INTER_BITS); - short ay0 = (short)(Y & (INTER_TAB_SIZE-1)); - short ax0 = (short)(X & (INTER_TAB_SIZE-1)); - - - float4 v0, v1, v2, v3; - - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; - - float tab[4]; - float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; - taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; - tabx[1] = 1.f/INTER_TAB_SIZE*ax0; - tab[0] = taby[0] * tabx[0]; - tab[1] = taby[0] * tabx[1]; - tab[2] = taby[1] * tabx[0]; - tab[3] = taby[1] * tabx[1]; - - float4 sum = 0; - sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[dst_offset+dy*dstStep+dx] = sum; + if( dx < threadCols && dy < dst_rows) + { + src_offset = (src_offset>>4); + dst_offset = (dst_offset>>4); + srcStep = (srcStep>>2); + dstStep = (dstStep>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); + + short sx0 = (short)(X >> INTER_BITS); + short sy0 = (short)(Y >> INTER_BITS); + short ay0 = (short)(Y & (INTER_TAB_SIZE-1)); + short ax0 = (short)(X & (INTER_TAB_SIZE-1)); + + + float4 v0, v1, v2, v3; + + v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; + v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; + v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; + v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; + + float tab[4]; + float taby[2], tabx[2]; + taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; + taby[1] = 1.f/INTER_TAB_SIZE*ay0; + tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; + tabx[1] = 1.f/INTER_TAB_SIZE*ax0; + + tab[0] = taby[0] * tabx[0]; + tab[1] = taby[0] * tabx[1]; + tab[2] = taby[1] * tabx[0]; + tab[3] = taby[1] * tabx[1]; + + float4 sum = 0; + sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; + if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + dst[dst_offset+dy*dstStep+dx] = sum; + } } __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M ) + int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) { int dx = get_global_id(0); int dy = get_global_id(1); - src_offset = (src_offset>>4); - dst_offset = (dst_offset>>4); - srcStep = (srcStep>>2); - dstStep = (dstStep>>2); + if( dx < threadCols && dy < dst_rows ) + { + src_offset = (src_offset>>4); + dst_offset = (dst_offset>>4); + srcStep = (srcStep>>2); + dstStep = (dstStep>>2); + + F X0 = M[0]*dx + M[1]*dy + M[2]; + F Y0 = M[3]*dx + M[4]*dy + M[5]; + F W = M[6]*dx + M[7]*dy + M[8]; + W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; + int X = rint(X0*W); + int Y = rint(Y0*W); - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - - float4 v[16]; - int i; + short sx = (short)(X >> INTER_BITS)-1; + short sy = (short)(Y >> INTER_BITS)-1; + short ay = (short)(Y & (INTER_TAB_SIZE-1)); + short ax = (short)(X & (INTER_TAB_SIZE-1)); + + + float4 v[16]; + int i; - for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; + for(i=0; i<16; i++) + v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; + float tab[16]; + float tab1y[4], tab1x[4]; + float axx, ayy; - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); + ayy = 1.f/INTER_TAB_SIZE * ay; + axx = 1.f/INTER_TAB_SIZE * ax; + interpolateCubic(ayy, tab1y); + interpolateCubic(axx, tab1x); #pragma unroll 4 - for( i=0; i<16; i++ ) - { - tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - float4 sum = 0; -#pragma unroll 4 - for ( i =0; i<16; i++ ) + for( i=0; i<16; i++ ) { - sum += v[i] * tab[i]; + tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; } - dst[dst_offset+dy*dstStep+dx] = sum; + + if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) + { + float4 sum = 0; +#pragma unroll 4 + for ( i =0; i<16; i++ ) + { + sum += v[i] * tab[i]; + } + dst[dst_offset+dy*dstStep+dx] = sum; - } + } + } } diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index 0603cc6..f52af24 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -499,7 +499,7 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern }val; switch(dst.depth()) { - case 0: + case CV_8U: val.uval.s[0] = saturate_cast(scalar.val[0]); val.uval.s[1] = saturate_cast(scalar.val[1]); val.uval.s[2] = saturate_cast(scalar.val[2]); @@ -518,7 +518,7 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); } break; - case 1: + case CV_8S: val.cval.s[0] = saturate_cast(scalar.val[0]); val.cval.s[1] = saturate_cast(scalar.val[1]); val.cval.s[2] = saturate_cast(scalar.val[2]); @@ -537,7 +537,7 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); } break; - case 2: + case CV_16U: val.usval.s[0] = saturate_cast(scalar.val[0]); val.usval.s[1] = saturate_cast(scalar.val[1]); val.usval.s[2] = saturate_cast(scalar.val[2]); @@ -556,7 +556,7 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); } break; - case 3: + case CV_16S: val.shval.s[0] = saturate_cast(scalar.val[0]); val.shval.s[1] = saturate_cast(scalar.val[1]); val.shval.s[2] = saturate_cast(scalar.val[2]); @@ -575,7 +575,7 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); } break; - case 4: + case CV_32S: val.ival.s[0] = saturate_cast(scalar.val[0]); val.ival.s[1] = saturate_cast(scalar.val[1]); val.ival.s[2] = saturate_cast(scalar.val[2]); @@ -601,7 +601,7 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); } break; - case 5: + case CV_32F: val.fval.s[0] = scalar.val[0]; val.fval.s[1] = scalar.val[1]; val.fval.s[2] = scalar.val[2]; @@ -620,7 +620,7 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern CV_Error(CV_StsUnsupportedFormat,"unsupported channels"); } break; - case 6: + case CV_64F: val.dval.s[0] = scalar.val[0]; val.dval.s[1] = scalar.val[1]; val.dval.s[2] = scalar.val[2]; diff --git a/modules/ocl/src/precomp.hpp b/modules/ocl/src/precomp.hpp index 0bde1e7..c919420 100644 --- a/modules/ocl/src/precomp.hpp +++ b/modules/ocl/src/precomp.hpp @@ -112,14 +112,14 @@ namespace cv size_t *globalThreads, size_t *localThreads); void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, vector< std::pair > &args, int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); - void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, + void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], size_t localThreads[3], - vector< pair > &args, int channels, int depth, char *build_options); + vector< pair > &args, int channels, int depth, const char *build_options); void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], size_t localThreads[3], vector< pair > &args, int channels, int depth); void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], size_t localThreads[3], vector< pair > &args, int channels, - int depth, char *build_options); + int depth, const char *build_options); cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value, const size_t size); diff --git a/modules/ocl/src/split_merge.cpp b/modules/ocl/src/split_merge.cpp index df41672..61ea73a 100644 --- a/modules/ocl/src/split_merge.cpp +++ b/modules/ocl/src/split_merge.cpp @@ -197,19 +197,29 @@ namespace cv args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[1].data)); args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[1].step)); args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[1].offset)); - if(n >= 3) + + if(channels == 4) { args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data)); args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step)); args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset)); + + // if channel == 3, then the matrix will convert to channel =4 + //if(n == 3) + // args.push_back( make_pair( sizeof(cl_int), (void *)&offset_cols)); + if(n == 3) - args.push_back( make_pair( sizeof(cl_int), (void *)&offset_cols)); - } - if(n >= 4) - { - args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[3].data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].step)); - args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].offset)); + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data)); + args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset)); + } + else if( n== 4) + { + args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[3].data)); + args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].offset)); + } } args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.rows)); @@ -268,9 +278,9 @@ namespace cv int cols = divUp(mat_src.cols, index); size_t localThreads[3] = { 64, 4, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(mat_src.rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(mat_src.rows, localThreads[1]) * localThreads[1], + 1 + }; vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data)); diff --git a/modules/ocl/test/main.cpp b/modules/ocl/test/main.cpp index 2fa8d26..f8c0f0b 100644 --- a/modules/ocl/test/main.cpp +++ b/modules/ocl/test/main.cpp @@ -88,7 +88,7 @@ int main(int argc, char **argv) std::cout << "no device found\n"; return -1; } - //setDevice(oclinfo[1]); + //setDevice(oclinfo[2]); return RUN_ALL_TESTS(); } diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp index da18e24..cbad59e 100644 --- a/modules/ocl/test/test_arithm.cpp +++ b/modules/ocl/test/test_arithm.cpp @@ -1065,23 +1065,13 @@ TEST_P(Sum, MAT) sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); //check results - EXPECT_DOUBLE_EQ(cpures[0], gpures[0]) << sss; - EXPECT_DOUBLE_EQ(cpures[1], gpures[1]) << sss; - EXPECT_DOUBLE_EQ(cpures[2], gpures[2]) << sss; - EXPECT_DOUBLE_EQ(cpures[3], gpures[3]) << sss; + EXPECT_NEAR(cpures[0], gpures[0], 0.1) << sss; + EXPECT_NEAR(cpures[1], gpures[1], 0.1) << sss; + EXPECT_NEAR(cpures[2], gpures[2], 0.1) << sss; + EXPECT_NEAR(cpures[3], gpures[3], 0.1) << sss; } } -//TEST_P(Sum, MASK) -//{ -// for(int j=0; j=radius) && (dsty >= radius) && (dstx+cldst_roi.cols+radius <=cldst_roi.wholecols) && (dsty+cldst_roi.rows+radius <= cldst_roi.wholerows)) + { + dst_roi.adjustROI(radius, radius, radius, radius); + cldst_roi.adjustROI(radius, radius, radius, radius); + } + else + { + continue; + } + #endif + cv::bilateralFilter(mat1_roi, dst_roi, d, sigmacolor, sigmaspace, bordertype[i]|cv::BORDER_ISOLATED); + cv::ocl::bilateralFilter(clmat1_roi, cldst_roi, d, sigmacolor, sigmaspace, bordertype[i]|cv::BORDER_ISOLATED); cv::Mat cpu_cldst; - cldst.download(cpu_cldst); - char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,dst1x=%d,dst1y=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, dst1x, dst1y, maskx, masky, src2x, src2y); + #ifndef RANDOMROI + cldst_roi.download(cpu_cldst); + #else + cldst.download(cpu_cldst); + #endif - EXPECT_MAT_NEAR(dst, cpu_cldst, 0.0, sss); + char sss[1024]; + sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,radius=%d,boredertype=%s", roicols, roirows, src1x, src1y, dstx, dsty, radius, borderstr[i]); + + #ifndef RANDOMROI + EXPECT_MAT_NEAR(dst_roi, cpu_cldst, 0.0, sss); + #else + //for(int i=0;i(i,j)<<" "<< (int)cpu_cldst.at(i,j)<<" "; + // } + // cout<get_rng(); + int top = rng.uniform(0, 10); + int bottom = rng.uniform(0, 10); + int left = rng.uniform(0, 10); + int right = rng.uniform(0, 10); + if (mat1.type() != dst.type()) { cout << "Unsupported type" << endl; EXPECT_DOUBLE_EQ(0.0, 0.0); @@ -537,15 +573,45 @@ TEST_P(CopyMakeBorder, Mat) for(int j = 0; j < LOOP_TIMES; j++) { random_roi(); - cv::copyMakeBorder(mat1_roi, dst_roi, 7, 5, 5, 7, bordertype[i], cv::Scalar(1.0)); - cv::ocl::copyMakeBorder(clmat1_roi, cldst_roi, 7, 5, 5, 7, bordertype[i], cv::Scalar(1.0)); + #ifdef RANDOMROI + if(((bordertype[i]!=cv::BORDER_CONSTANT) && (bordertype[i]!=cv::BORDER_REPLICATE))&&(mat1_roi.cols<=left) || (mat1_roi.cols<=right) || (mat1_roi.rows <= top) || (mat1_roi.rows <= bottom)) + { + continue; + } + if((dstx>=left) && (dsty >= top) && (dstx+cldst_roi.cols+right <=cldst_roi.wholecols) && (dsty+cldst_roi.rows+bottom <= cldst_roi.wholerows)) + { + dst_roi.adjustROI(top, bottom, left, right); + cldst_roi.adjustROI(top, bottom, left, right); + } + else + { + continue; + } + #endif + cv::copyMakeBorder(mat1_roi, dst_roi, top, bottom, left, right, bordertype[i]| cv::BORDER_ISOLATED, cv::Scalar(1.0)); + cv::ocl::copyMakeBorder(clmat1_roi, cldst_roi, top, bottom, left, right, bordertype[i]| cv::BORDER_ISOLATED, cv::Scalar(1.0)); cv::Mat cpu_cldst; - cldst.download(cpu_cldst); + #ifndef RANDOMROI + cldst_roi.download(cpu_cldst); + #else + cldst.download(cpu_cldst); + #endif char sss[1024]; - sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,dst1x=%d,dst1y=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, dst1x, dst1y, maskx, masky, src2x, src2y); - - EXPECT_MAT_NEAR(dst, cpu_cldst, 0.0, sss); + sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,dst1x=%d,dst1y=%d,top=%d,bottom=%d,left=%d,right=%d, bordertype=%s", roicols, roirows, src1x, src1y, dstx, dsty, dst1x, dst1y, top, bottom, left, right,borderstr[i]); + #ifndef RANDOMROI + EXPECT_MAT_NEAR(dst_roi, cpu_cldst, 0.0, sss); + #else + //for(int i=0;i(i,j)<<" "; + //} + //cout<get_rng(); src_roicols = rng.uniform(1, mat1.cols); src_roirows = rng.uniform(1, mat1.rows); - dst_roicols = rng.uniform(1, dst.cols); - dst_roirows = rng.uniform(1, dst.rows); + dst_roicols = (int)(src_roicols*fx); + dst_roirows = (int)(src_roirows*fy); src1x = rng.uniform(0, mat1.cols - src_roicols); src1y = rng.uniform(0, mat1.rows - src_roirows); dstx = rng.uniform(0, dst.cols - dst_roicols); @@ -1061,13 +1127,16 @@ PARAM_TEST_CASE(Resize, MatType, cv::Size, double, double, int) dstx = 0; dsty = 0; #endif - + dsize.width = dst_roicols; + dsize.height = dst_roirows; mat1_roi = mat1(Rect(src1x, src1y, src_roicols, src_roirows)); dst_roi = dst(Rect(dstx, dsty, dst_roicols, dst_roirows)); gdst_whole = dst; gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows)); + dsize.width = (int)(mat1_roi.size().width * fx); + dsize.height = (int)(mat1_roi.size().height * fy); gmat1 = mat1_roi; } @@ -1082,7 +1151,7 @@ TEST_P(Resize, Mat) // cv::resize(mat1_roi, dst_roi, dsize, fx, fy, interpolation); // cv::ocl::resize(gmat1, gdst, dsize, fx, fy, interpolation); - + if(dst_roicols<1||dst_roirows<1) continue; cv::resize(mat1_roi, dst_roi, dsize, fx, fy, interpolation); cv::ocl::resize(gmat1, gdst, dsize, fx, fy, interpolation); @@ -1592,15 +1661,15 @@ INSTANTIATE_TEST_CASE_P(ImgprocTestBase, equalizeHist, Combine( // NULL_TYPE, // NULL_TYPE, // Values(false))); // Values(false) is the reserved parameter -// -// -//INSTANTIATE_TEST_CASE_P(ImgprocTestBase, CopyMakeBorder, Combine( -// Values(CV_8UC1, CV_8UC3,CV_8UC4, CV_32SC1), -// NULL_TYPE, -// Values(CV_8UC1,CV_8UC3,CV_8UC4,CV_32SC1), -// NULL_TYPE, -// NULL_TYPE, -// Values(false))); // Values(false) is the reserved parameter + + +INSTANTIATE_TEST_CASE_P(ImgprocTestBase, CopyMakeBorder, Combine( + Values(CV_8UC1, CV_8UC4,CV_32SC1, CV_32SC4,CV_32FC1, CV_32FC4), + NULL_TYPE, + Values(CV_8UC1,CV_8UC4,CV_32SC1, CV_32SC4,CV_32FC1, CV_32FC4), + NULL_TYPE, + NULL_TYPE, + Values(false))); // Values(false) is the reserved parameter INSTANTIATE_TEST_CASE_P(ImgprocTestBase, cornerMinEigenVal, Combine( Values(CV_8UC1,CV_32FC1), @@ -1669,11 +1738,11 @@ INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine( Values(cv::TermCriteria(cv::TermCriteria::COUNT+cv::TermCriteria::EPS, 5, 1)) )); -//INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine( -// Values(CV_8UC1, CV_8UC3,CV_8UC4, CV_32FC1, CV_32FC4), -// Values(CV_32FC1, CV_16SC2, CV_32FC2),Values(-1,CV_32FC1), -// Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR), -// Values((int)cv::BORDER_CONSTANT))); +INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine( + Values(CV_8UC1, CV_8UC3,CV_8UC4, CV_32FC1, CV_32FC4), + Values(CV_32FC1, CV_16SC2, CV_32FC2),Values(-1,CV_32FC1), + Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR), + Values((int)cv::BORDER_CONSTANT))); INSTANTIATE_TEST_CASE_P(histTestBase, calcHist, Combine( diff --git a/modules/ocl/test/test_split_merge.cpp b/modules/ocl/test/test_split_merge.cpp index 91e65e4..e4a4f25 100644 --- a/modules/ocl/test/test_split_merge.cpp +++ b/modules/ocl/test/test_split_merge.cpp @@ -180,14 +180,26 @@ TEST_P(Merge, Accuracy) std::vector dev_src; dev_src.push_back(mat1_roi); - dev_src.push_back(mat2_roi); - dev_src.push_back(mat3_roi); - dev_src.push_back(mat4_roi); + + if(channels >= 2) + dev_src.push_back(mat2_roi); + + if(channels >= 3) + dev_src.push_back(mat3_roi); + + if(channels >= 4) + dev_src.push_back(mat4_roi); std::vector dev_gsrc; dev_gsrc.push_back(gmat1); + + if(channels >= 2) dev_gsrc.push_back(gmat2); + + if(channels >= 3) dev_gsrc.push_back(gmat3); + + if(channels >= 4) dev_gsrc.push_back(gmat4); cv::merge(dev_src, dst_roi); @@ -355,9 +367,16 @@ TEST_P(Split, Accuracy) char sss[1024]; sprintf(sss, "roicols=%d,roirows=%d,dst1x =%d,dsty=%d,dst2x =%d,dst2y=%d,dst3x =%d,dst3y=%d,dst4x =%d,dst4y=%d,srcx=%d,srcy=%d", roicols, roirows, dst1x , dst1y, dst2x , dst2y, dst3x , dst3y, dst4x , dst4y, srcx, srcy); + if(channels >= 1) EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.0, sss); + + if(channels >= 2) EXPECT_MAT_NEAR(dst2, cpu_dst2, 0.0, sss); + + if(channels >= 3) EXPECT_MAT_NEAR(dst3, cpu_dst3, 0.0, sss); + + if(channels >= 4) EXPECT_MAT_NEAR(dst4, cpu_dst4, 0.0, sss); } } diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp index 8e43980..e4742c4 100644 --- a/modules/ocl/test/utility.hpp +++ b/modules/ocl/test/utility.hpp @@ -167,7 +167,7 @@ void run_perf_test(); #define ALL_TYPES testing::ValuesIn(all_types()) #define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end)) -#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113)) +#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(1300, 1300)) #define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true)) @@ -212,11 +212,6 @@ void run_perf_test(); } catch( ... ) { std::cout << "||||| Exception catched! |||||\n"; return; } //////// Utility -#ifndef DIFFERENT_SIZES -#else -#undef DIFFERENT_SIZES -#endif -#define DIFFERENT_SIZES testing::Values(cv::Size(256, 256), cv::Size(3000, 3000)) #define IMAGE_CHANNELS testing::Values(Channels(1), Channels(3), Channels(4)) #ifndef IMPLEMENT_PARAM_CLASS -- 2.7.4