From: niko Date: Fri, 17 Aug 2012 07:47:02 +0000 (+0800) Subject: performance & bug fix for resize erode dilate sobel remap X-Git-Tag: accepted/2.0/20130307.220821~364^2~215^2~52 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2e685dcf0a74c185db1d7fdf5b460ecbba86dcc0;p=profile%2Fivi%2Fopencv.git performance & bug fix for resize erode dilate sobel remap --- diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt index a6496ae..994353b 100644 --- a/modules/ocl/CMakeLists.txt +++ b/modules/ocl/CMakeLists.txt @@ -4,7 +4,7 @@ if(NOT HAVE_OPENCL) endif() set(the_description "OpenCL-accelerated Computer Vision") -ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree) +ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_ts) ocv_module_include_directories() diff --git a/modules/ocl/perf/test_filters.cpp b/modules/ocl/perf/test_filters.cpp index ac9a865..1c113fb 100644 --- a/modules/ocl/perf/test_filters.cpp +++ b/modules/ocl/perf/test_filters.cpp @@ -325,7 +325,7 @@ PARAM_TEST_CASE(LaplacianTestBase, MatType, int) ksize = GET_PARAM(1); cv::RNG& rng = TS::ptr()->get_rng(); - cv::Size size = cv::Size(2560, 2560); + cv::Size size = cv::Size(MWIDTH, MHEIGHT); mat = randomMat(rng, size, type, 5, 16, false); dst = randomMat(rng, size, type, 5, 16, false); @@ -468,7 +468,7 @@ PARAM_TEST_CASE(ErodeDilateBase, MatType, bool) // iterations = GET_PARAM(1); cv::RNG& rng = TS::ptr()->get_rng(); - cv::Size size = cv::Size(2560, 2560); + cv::Size size = cv::Size(MWIDTH, MHEIGHT); mat1 = randomMat(rng, size, type, 5, 16, false); dst = randomMat(rng, size, type, 5, 16, false); @@ -679,7 +679,7 @@ PARAM_TEST_CASE(Sobel, MatType, int, int, int, int) dx = 2; dy=0; cv::RNG& rng = TS::ptr()->get_rng(); - cv::Size size = cv::Size(2560, 2560); + cv::Size size = cv::Size(MWIDTH, MHEIGHT); mat1 = randomMat(rng, size, type, 5, 16, false); dst = randomMat(rng, size, type, 5, 16, false); @@ -817,7 +817,7 @@ PARAM_TEST_CASE(Scharr, MatType, int, int, int) dx = 1; dy=0; cv::RNG& rng = TS::ptr()->get_rng(); - cv::Size size = cv::Size(2560, 2560); + cv::Size size = cv::Size(MWIDTH, MHEIGHT); mat1 = randomMat(rng, size, type, 5, 16, false); dst = randomMat(rng, size, type, 5, 16, false); @@ -956,7 +956,7 @@ PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int) bordertype = GET_PARAM(2); cv::RNG& rng = TS::ptr()->get_rng(); - cv::Size size = cv::Size(2560, 2560); + cv::Size size = cv::Size(MWIDTH, MHEIGHT); sigma1 = rng.uniform(0.1, 1.0); sigma2 = rng.uniform(0.1, 1.0); diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index ea7f312..c71c2a2 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -177,9 +177,7 @@ namespace cv extern const char *filter_sep_row; extern const char *filter_sep_col; extern const char *filtering_laplacian; - extern const char *filtering_erodeFilter; - extern const char *filtering_dilateFilter; - + extern const char *filtering_morph; } } @@ -334,28 +332,54 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c int srcStep = src.step1() / src.channels(); int dstStep = dst.step1() / dst.channels(); - int srcOffset = src.offset / src.channels() / src.elemSize1(); - int dstOffset = dst.offset / dst.channels() / dst.elemSize1(); - int minclos = -(srcOffset % srcStep); - int maxclos = src.wholecols + minclos - 1; - int minrows = -(srcOffset / srcStep); - int maxrows = src.wholerows + minrows - 1; - - //int D=src.depth(); + int srcOffset = src.offset / src.elemSize(); + int dstOffset = dst.offset / dst.elemSize(); + int srcOffset_x=srcOffset%srcStep; + int srcOffset_y=srcOffset/srcStep; Context *clCxt = src.clCxt; - - string kernelName = "erode"; - + string kernelName; + size_t localThreads[3] = {16, 16, 1}; + size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] * localThreads[0], (src.rows + localThreads[1]) / localThreads[1] * localThreads[1], 1}; + + if(src.type()==CV_8UC1) + { + kernelName = "morph_C1_D0"; + globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; + CV_Assert( localThreads[0]*localThreads[1]*8 >= (localThreads[0]*4+ksize.width-1)*(localThreads[1]+ksize.height-1) ); + } + else + { + kernelName = "morph"; + CV_Assert( localThreads[0]*localThreads[1]*2 >= (localThreads[0]+ksize.width-1)*(localThreads[1]+ksize.height-1) ); + } + char s[64]; + switch(src.type()) + { + case CV_8UC1: + sprintf(s, "-D VAL=255"); + break; + case CV_8UC3: + case CV_8UC4: + sprintf(s, "-D VAL=255 -D GENTYPE=uchar4"); + break; + case CV_32FC1: + sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float"); + break; + case CV_32FC3: + case CV_32FC4: + sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float4"); + break; + default: + CV_Error(-217,"unsupported type"); + } + char compile_option[128]; + sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s", anchor.x, anchor.y, localThreads[0], localThreads[1],s); vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&minclos)); - args.push_back( make_pair( sizeof(cl_int), (void *)&maxclos)); - args.push_back( make_pair( sizeof(cl_int), (void *)&minrows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&maxrows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); @@ -363,18 +387,8 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data)); args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholecols)); args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholerows)); - //args.push_back( make_pair( sizeof(cl_int),(void*)&ksize.width)); - //args.push_back( make_pair( sizeof(cl_int),(void*)&ksize.height)); - - size_t globalThreads[3] = {(src.cols + 15) / 16 * 16, (src.rows + 15) / 16 * 16, 1}; - if(src.channels() == 1) - globalThreads[0] = ((src.cols + 9) / 4 + 15) / 16 * 16; - size_t localThreads[3] = {16, 16, 1}; - - char compile_option[128]; - sprintf(compile_option, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d", anchor.x, anchor.y, ksize.width, ksize.height); - - openCLExecuteKernel(clCxt, &filtering_erodeFilter, kernelName, globalThreads, localThreads, args, src.channels(), src.depth(), compile_option); + args.push_back( make_pair( sizeof(cl_int),(void*)&dstOffset)); + openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } @@ -390,26 +404,54 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, int srcStep = src.step1() / src.channels(); int dstStep = dst.step1() / dst.channels(); - int srcOffset = src.offset / src.channels() / src.elemSize1(); - int dstOffset = dst.offset / dst.channels() / dst.elemSize1(); - int minclos = -(srcOffset % srcStep); - int maxclos = src.wholecols + minclos - 1; - int minrows = -(srcOffset / srcStep); - int maxrows = src.wholerows + minrows - 1; - + int srcOffset = src.offset / src.elemSize(); + int dstOffset = dst.offset / dst.elemSize(); + int srcOffset_x=srcOffset%srcStep; + int srcOffset_y=srcOffset/srcStep; Context *clCxt = src.clCxt; - - string kernelName = "dilate"; + string kernelName; + size_t localThreads[3] = {16, 16, 1}; + size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] * localThreads[0], (src.rows + localThreads[1]) / localThreads[1] * localThreads[1], 1}; + + if(src.type()==CV_8UC1) + { + kernelName = "morph_C1_D0"; + globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; + CV_Assert( localThreads[0]*localThreads[1]*8 >= (localThreads[0]*4+ksize.width-1)*(localThreads[1]+ksize.height-1) ); + } + else + { + kernelName = "morph"; + CV_Assert( localThreads[0]*localThreads[1]*2 >= (localThreads[0]+ksize.width-1)*(localThreads[1]+ksize.height-1) ); + } + char s[64]; + switch(src.type()) + { + case CV_8UC1: + sprintf(s, "-D VAL=0"); + break; + case CV_8UC3: + case CV_8UC4: + sprintf(s, "-D VAL=0 -D GENTYPE=uchar4"); + break; + case CV_32FC1: + sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float"); + break; + case CV_32FC3: + case CV_32FC4: + sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float4"); + break; + default: + CV_Error(-217,"unsupported type"); + } + char compile_option[128]; + sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s", anchor.x, anchor.y, localThreads[0], localThreads[1],s); vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&minclos)); - args.push_back( make_pair( sizeof(cl_int), (void *)&maxclos)); - args.push_back( make_pair( sizeof(cl_int), (void *)&minrows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&maxrows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_x)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset_y)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); @@ -417,15 +459,8 @@ void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_kernel.data)); args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholecols)); args.push_back( make_pair( sizeof(cl_int),(void*)&src.wholerows)); - - size_t globalThreads[3] = {(src.cols + 15) / 16 * 16, (src.rows + 15) / 16 * 16, 1}; - if(src.channels() == 1) - globalThreads[0] = ((src.cols + 9) / 4 + 15) / 16 * 16; - size_t localThreads[3] = {16, 16, 1}; - char compile_option[128]; - sprintf(compile_option, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d", anchor.x, anchor.y, ksize.width, ksize.height); - - openCLExecuteKernel(clCxt, &filtering_dilateFilter, kernelName, globalThreads, localThreads, args, src.channels(), src.depth(), compile_option); + args.push_back( make_pair( sizeof(cl_int),(void*)&dstOffset)); + openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } Ptr cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, Point anchor) @@ -739,7 +774,7 @@ namespace int src_type = src.type(); int cn = src.channels(); - dst.create(src_size, src_type); + //dst.create(src_size, src_type); dst = Scalar(0.0); //dstBuf.create(src_size, src_type); dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn)); @@ -1265,8 +1300,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker sprintf(btype, "BORDER_REFLECT_101"); break; } - char compile_option[128]; - sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", anchor, localThreads[0], localThreads[1], channels, btype); + char compile_option[256]; + size_t globalThreads[3]; globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; @@ -1277,21 +1312,46 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker { case 1: globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; + sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, localThreads[0], localThreads[1], channels, btype,"float","uchar","convert_uchar_sat"); break; case 2: globalThreads[0] = ((dst.cols + 1) / 2 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; + sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, localThreads[0], localThreads[1], channels, btype,"float2","uchar2","convert_uchar2_sat"); break; case 3: - globalThreads[0] = ((dst.cols * 3 + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - break; case 4: globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; + sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, localThreads[0], localThreads[1], channels, btype,"float4","uchar4","convert_uchar4_sat"); break; } } else { globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; + switch(dst.type()) + { + case CV_32SC1: + sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, localThreads[0], localThreads[1], channels, btype,"float","int","convert_int_sat"); + break; + case CV_32SC3: + case CV_32SC4: + sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, localThreads[0], localThreads[1], channels, btype,"float4","int4","convert_int4_sat"); + break; + case CV_32FC1: + sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, localThreads[0], localThreads[1], channels, btype,"float","float",""); + break; + case CV_32FC3: + case CV_32FC4: + sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, localThreads[0], localThreads[1], channels, btype,"float4","float4",""); + break; + } } //sanity checks @@ -1321,7 +1381,7 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); - openCLExecuteKernel(clCxt, &filter_sep_col, kernelName, globalThreads, localThreads, args, channels, dst.depth(), compile_option); + openCLExecuteKernel(clCxt, &filter_sep_col, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); } Ptr cv::ocl::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel, int anchor, int bordertype, double delta) @@ -1376,7 +1436,7 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat { if( ddepth < 0 ) ddepth = src.depth(); - CV_Assert(ddepth == src.depth()); + //CV_Assert(ddepth == src.depth()); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype); diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 7617c08..98ab98e 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -260,7 +260,7 @@ namespace cv CV_Assert((!map2.data || map2.size()== map1.size())); dst.create(map1.size(), src.type()); - + string kernelName; @@ -394,8 +394,15 @@ namespace cv args.push_back( make_pair(sizeof(cl_int),(void*)&map1.cols)); args.push_back( make_pair(sizeof(cl_int),(void*)&map1.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&cols)); - args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue)); - } + if(src.clCxt -> impl -> double_support != 0) + { + args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue)); + } + else + { + args.push_back( make_pair(sizeof(cl_float4),(void*)&borderValue)); + } + } openCLExecuteKernel(clCxt,&imgproc_remap,kernelName,globalThreads,localThreads,args,src.channels(),src.depth()); } diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index 61e7177..fab81fd 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -44,9 +44,9 @@ //M*/ #include "precomp.hpp" -#include "threadsafe.h" +#include "Threadsafe.h" #include -#include "binarycaching.hpp" +#include "binaryCaching.hpp" using namespace cv; using namespace cv::ocl; diff --git a/modules/ocl/src/kernels/filter_sep_col.cl b/modules/ocl/src/kernels/filter_sep_col.cl index f85906a..bfb8cac 100644 --- a/modules/ocl/src/kernels/filter_sep_col.cl +++ b/modules/ocl/src/kernels/filter_sep_col.cl @@ -90,9 +90,9 @@ Niko ***********************************************************************************/ -__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D0 - (__global const float * restrict src, - __global uchar * dst, +__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter + (__global const GENTYPE_SRC * restrict src, + __global GENTYPE_DST * dst, const int dst_cols, const int dst_rows, const int src_whole_cols, @@ -111,10 +111,10 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_ int start_addr = mad24(y,src_step_in_pixel,x); int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); int i; - float sum; - float temp[READ_TIMES_COL]; + GENTYPE_SRC sum; + GENTYPE_SRC temp[READ_TIMES_COL]; - __local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1]; + __local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1]; //read pixels from src for(i = 0;i 0)) ? current_addr : 0; - float4 v = src[current_addr]; - uchar now = mat_kernel[k++]; - float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(-FLT_MAX); - maxVal = max(maxVal , flag); - } - } - - if(mX < cols && mY < rows) - dst[mY * dstStep + mX + dstOffset] = (maxVal); -} - -__kernel void dilate_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset, - int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, - int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows) -{ - int mX = (get_global_id(0)<<2) - (dstOffset&3); - int mY = get_global_id(1); - int kX = mX - anX, kY = mY - anY; - int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols); - float4 maxVal = (float4)(-FLT_MAX); - int k=0; - for(int i=0;i 0)) ? start : 0; - int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0; - float8 sVal = (float8)(src[start>>2], src[start2>>2]); - - float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7}; - int det = start & 3; - float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]); - uchar now = mat_kernel[k++]; - float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal; - flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : -FLT_MAX; - flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : -FLT_MAX; - flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : -FLT_MAX; - flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : -FLT_MAX; - - maxVal = max(maxVal , flag); - } - } - if(mY < rows && mX < cols) - { - __global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset); - float4 dVal = *d; - maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x; - maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y; - maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z; - maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w; - - *d = (maxVal); - } -} - -__kernel void dilate_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset, - int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, - int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows) -{ - int mX = (get_global_id(0)<<2) - (dstOffset&3);; - int mY = get_global_id(1); - int kX = mX - anX, kY = mY - anY; - int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols); - uchar4 maxVal = (uchar4)(UCHAR_MIN); - int k=0; - for(int i=0;i 0)) ? start : 0; - int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0; - uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]); - - uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7}; - int det = start & 3; - uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]); - - uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : maxVal; - flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : UCHAR_MIN; - flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : UCHAR_MIN; - flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : UCHAR_MIN; - flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : UCHAR_MIN; - - maxVal = max(maxVal , flag); - } - } - if(mY < rows) - { - __global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset); - uchar4 dVal = *d; - - maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x; - maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y; - maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z; - maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w; - - *d = (maxVal); - } -} - -__kernel void dilate_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset, - int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, - int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows) -{ - int mX = get_global_id(0); - int mY = get_global_id(1); - int kX = mX - anX, kY = mY - anY; - int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols); - uchar4 maxVal = (uchar4)(UCHAR_MIN); - int k=0; - for(int i=0;i 0)) ? current_addr : 0; - uchar4 v = src[current_addr]; - uchar now = mat_kernel[k++]; - uchar4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal; - maxVal = max(maxVal , flag); - } - } - - if(mX < cols && mY < rows) - dst[mY * dstStep + mX + dstOffset] = (maxVal); -} - diff --git a/modules/ocl/src/kernels/filtering_erodeFilter.cl b/modules/ocl/src/kernels/filtering_erodeFilter.cl deleted file mode 100644 index 1714fb0..0000000 --- a/modules/ocl/src/kernels/filtering_erodeFilter.cl +++ /dev/null @@ -1,183 +0,0 @@ -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Niko Li, newlife20080214@gmail.com -// Zero Lin, zero.lin@amd.com -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -// - -__kernel void erode_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset, - int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, - int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows) -{ - int mX = get_global_id(0); - int mY = get_global_id(1); - int kX = mX - anX, kY = mY - anY; - int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols); - float4 minVal = (float4)(3.4e+38); - int k=0; - for(int i=0;i 0)) ? current_addr : 0; - float4 v = src[current_addr]; - uchar now = mat_kernel[k++]; - float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(3.4e+38); - minVal = min(minVal , flag); - } - } - - if(mX < cols && mY < rows) - dst[mY * dstStep + mX + dstOffset] = (minVal); -} - -__kernel void erode_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset, - int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, - int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows) -{ - int mX = (get_global_id(0)<<2) - (dstOffset&3); - int mY = get_global_id(1); - int kX = mX - anX, kY = mY - anY; - int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols); - float4 minVal = (float4)(3.4e+38); - int k=0; - for(int i=0;i 0)) ? start : 0; - int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0; - float8 sVal = (float8)(src[start>>2], src[start2>>2]); - - float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7}; - int det = start & 3; - float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]); - uchar now = mat_kernel[k++]; - float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(3.4e+38); - flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 3.4e+38; - flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 3.4e+38; - flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 3.4e+38; - flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 3.4e+38; - - minVal = min(minVal , flag); - } - } - - if(mY < rows && mX < cols) - { - __global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset); - float4 dVal = *d; - minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x; - minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y; - minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z; - minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w; - - *d = (minVal); - } -} - -__kernel void erode_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset, - int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, - int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows) -{ - int mX = (get_global_id(0)<<2) - (dstOffset&3); - int mY = get_global_id(1); - int kX = mX - anX, kY = mY - anY; - int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols); - uchar4 minVal = (uchar4)(0xff); - int k=0; - for(int i=0;i 0)) ? start : 0; - int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0; - uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]); - - uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7}; - int det = start & 3; - uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]); - - uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : (uchar4)(0xff); - flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 0xff; - flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 0xff; - flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 0xff; - flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 0xff; - - minVal = min(minVal , flag); - } - } - - if(mY < rows) - { - __global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset); - uchar4 dVal = *d; - - minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x; - minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y; - minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z; - minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w; - - *d = (minVal); - } -} - -__kernel void erode_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset, - int mincols, int maxcols, int minrows, int maxrows, int cols, int rows, - int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows) -{ - int mX = get_global_id(0); - int mY = get_global_id(1); - int kX = mX - anX, kY = mY - anY; - int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols); - uchar4 minVal = (uchar4)(0xff); - int k=0; - for(int i=0;i 0)) ? current_addr : 0; - uchar4 v = src[current_addr]; - uchar now = mat_kernel[k++]; - uchar4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (uchar4)(0xff); - minVal = min(minVal , flag); - } - } - - if(mX < cols && mY < rows) - dst[mY * dstStep + mX + dstOffset] = (minVal); -} - diff --git a/modules/ocl/src/kernels/filtering_morph.cl b/modules/ocl/src/kernels/filtering_morph.cl new file mode 100644 index 0000000..b008cb5 --- /dev/null +++ b/modules/ocl/src/kernels/filtering_morph.cl @@ -0,0 +1,204 @@ +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Niko Li, newlife20080214@gmail.com +// Zero Lin, zero.lin@amd.com +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +// + + +#ifdef ERODE +#define MORPH_OP(A,B) min((A),(B)) +#endif +#ifdef DILATE +#define MORPH_OP(A,B) max((A),(B)) +#endif +//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii +#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2) +#ifndef GENTYPE +__kernel void morph_C1_D0(__global const uchar * restrict src, + __global uchar *dst, + int src_offset_x, int src_offset_y, + int cols, int rows, + int src_step_in_pixel, int dst_step_in_pixel, + __constant uchar * mat_kernel, + int src_whole_cols, int src_whole_rows, + int dst_offset_in_pixel) +{ + int l_x = get_local_id(0); + int l_y = get_local_id(1); + int x = get_group_id(0)*4*LSIZE0; + int y = get_group_id(1)*LSIZE1; + int start_x = x+src_offset_x-RADIUSX & 0xfffffffc; + int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc; + int width = (end_x -start_x+4)>>2; + int offset = src_offset_x-RADIUSX & 3; + int start_y = y+src_offset_y-RADIUSY; + int point1 = mad24(l_y,LSIZE0,l_x); + int point2 = point1 + LSIZE0*LSIZE1; + int tl_x = (point1 % width)<<2; + int tl_y = point1 / width; + int tl_x2 = (point2 % width)<<2; + int tl_y2 = point2 / width; + int cur_x = start_x + tl_x; + int cur_y = start_y + tl_y; + int cur_x2 = start_x + tl_x2; + int cur_y2 = start_y + tl_y2; + int start_addr = mad24(cur_y,src_step_in_pixel,cur_x); + int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2); + uchar4 temp0,temp1; + __local uchar4 LDS_DAT[2*LSIZE1*LSIZE0]; + + int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); + //read pixels from src + start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0; + start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0; + temp0 = *(__global uchar4*)&src[start_addr]; + temp1 = *(__global uchar4*)&src[start_addr2]; + //judge if read out of boundary + temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x); + temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y); + temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z); + temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w); + temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0); + + temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x); + temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y); + temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z); + temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w); + temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1); + + LDS_DAT[point1] = temp0; + LDS_DAT[point2] = temp1; + barrier(CLK_LOCAL_MEM_FENCE); + uchar4 res = (uchar4)VAL; + for(int i=0;i<2*RADIUSY+1;i++) + for(int j=0;j<2*RADIUSX+1;j++) + { + res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)):res; + } + int gidx = get_global_id(0)<<2; + int gidy = get_global_id(1); + int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel); + if(gidx+3 0)) ? start_addr : 0; + start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0; + temp0 = src[start_addr]; + temp1 = src[start_addr2]; + //judge if read out of boundary + temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0); + temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0); + + temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1); + temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1); + + LDS_DAT[point1] = temp0; + LDS_DAT[point2] = temp1; + barrier(CLK_LOCAL_MEM_FENCE); + GENTYPE res = (GENTYPE)VAL; + for(int i=0;i<2*RADIUSY+1;i++) + for(int j=0;j<2*RADIUSX+1;j++) + { + res =mat_kernel[i*(2*RADIUSX+1)+j]? MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]):res; + } + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel); + if(gidx= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; + dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal; *d = dst_data; @@ -139,8 +139,8 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig uchar4 dVal = *d; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; - + + dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal; *d = dst_data; } @@ -167,7 +167,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig short8 map1_data; map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset; + int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset; uchar4 src_a, src_b, src_c, src_d; src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); @@ -187,7 +187,7 @@ __kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsig uchar16 dVal = *d; int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal; + dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal; *d = dst_data; } @@ -216,7 +216,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); int8 map1_dataZ = convert_int8_sat_rte(map1_data); - int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<2) + src_offset; + int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<((int4)(2))) + src_offset; uchar4 src_a, src_b, src_c, src_d; src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); @@ -236,7 +236,7 @@ __kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsig uchar16 dVal = *d; int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal; + dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal; *d = dst_data; @@ -269,7 +269,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const * map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset; + int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset; float4 src_data; src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); @@ -289,7 +289,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const * float4 dVal = *d; int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_float4(con) != 0) ? dst_data : dVal; + dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal; *d = dst_data; @@ -321,7 +321,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const * map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); int8 map1_dataZ = convert_int8_sat_rte(map1_data); - int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<2) + src_offset; + int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset; float4 src_data; src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); @@ -341,7 +341,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const * float4 dVal = *d; int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_float4(con) != 0) ? dst_data : dVal; + dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal; *d = dst_data; @@ -418,21 +418,21 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig float4 u = temp.even; float4 v = temp.odd; - float4 ud = 1.f - u; - float4 vd = 1.f - v; + float4 ud = (float4)(1.0) - u; + float4 vd = (float4)(1.0) - v; //float8 map1_dataU = map1_dataD + 1; int4 map1_dataDx = map1_dataD.even; int4 map1_dataDy = map1_dataD.odd; - int4 map1_dataDx1 = map1_dataDx + 1; - int4 map1_dataDy1 = map1_dataDy + 1; + int4 map1_dataDx1 = map1_dataDx + (int4)(1); + int4 map1_dataDy1 = map1_dataDy + (int4)(1); int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset; int4 src_StartD = src_StartU + src_step; /* //not using the vload - int4 src_StartU1 = src_StartU + 1; - int4 src_StartD1 = src_StartD + 1; + int4 src_StartU1 = src_StartU + (int4)(1); + int4 src_StartD1 = src_StartD + (int4)(1); uchar4 a, b, c, d; a.x = *(src_StartU.x + src); @@ -476,10 +476,10 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - a = (convert_uchar4(ac) == (uchar4)0)? a : val; - b = (convert_uchar4(bc) == (uchar4)0)? b : val; - c = (convert_uchar4(cc) == (uchar4)0)? c : val; - d = (convert_uchar4(dc) == (uchar4)0)? d : val; + a = (convert_uchar4(ac) == (uchar4)(0))? a : val; + b = (convert_uchar4(bc) == (uchar4)(0))? b : val; + c = (convert_uchar4(cc) == (uchar4)(0))? c : val; + d = (convert_uchar4(dc) == (uchar4)(0))? d : val; uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v ); @@ -487,7 +487,7 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig uchar4 dVal = *D; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; + dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal; *D = dst_data; } @@ -531,7 +531,7 @@ __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsig uchar4 dVal = *d; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; + dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal; *d = dst_data; @@ -567,17 +567,17 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig float4 u = temp.even; float4 v = temp.odd; - float4 ud = 1.f - u; - float4 vd = 1.f - v; + float4 ud = (float4)(1.0) - u; + float4 vd = (float4)(1.0) - v; //float8 map1_dataU = map1_dataD + 1; int4 map1_dataDx = map1_dataD.even; int4 map1_dataDy = map1_dataD.odd; - int4 map1_dataDx1 = map1_dataDx + 1; - int4 map1_dataDy1 = map1_dataDy + 1; + int4 map1_dataDx1 = map1_dataDx + (int4)(1); + int4 map1_dataDy1 = map1_dataDy + (int4)(1); - int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset; + int4 src_StartU = map1_dataDy * src_step + (convert_int4(map1_dataDx) << (int4)(2)) + src_offset; int4 src_StartD = src_StartU + src_step; uchar8 aU, bU, cU, dU, aD, bD, cD, dD; @@ -605,10 +605,10 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w)); uchar16 val = (uchar16)(nval, nval, nval, nval); - a = (convert_uchar16(acc) == (uchar16)0)? a : val; - b = (convert_uchar16(bcc) == (uchar16)0)? b : val; - c = (convert_uchar16(ccc) == (uchar16)0)? c : val; - d = (convert_uchar16(dcc) == (uchar16)0)? d : val; + a = (convert_uchar16(acc) == (uchar16)(0))? a : val; + b = (convert_uchar16(bcc) == (uchar16)(0))? b : val; + c = (convert_uchar16(ccc) == (uchar16)(0))? c : val; + d = (convert_uchar16(dcc) == (uchar16)(0))? d : val; float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w)); float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w)); @@ -621,7 +621,7 @@ __kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsig uchar16 dVal = *D; int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal; + dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal; *D = dst_data; } @@ -646,7 +646,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig short8 map1_data; map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset; + int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<(int4)(2)) + src_offset; uchar4 src_a, src_b, src_c, src_d; src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); @@ -666,7 +666,7 @@ __kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsig uchar16 dVal = *d; int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal; + dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal; *d = dst_data; @@ -700,21 +700,21 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * float4 u = temp.even; float4 v = temp.odd; - float4 ud = 1.f - u; - float4 vd = 1.f - v; + float4 ud = (float4)(1.0) - u; + float4 vd = (float4)(1.0) - v; //float8 map1_dataU = map1_dataD + 1; int4 map1_dataDx = map1_dataD.even; int4 map1_dataDy = map1_dataD.odd; - int4 map1_dataDx1 = map1_dataDx + 1; - int4 map1_dataDy1 = map1_dataDy + 1; + int4 map1_dataDx1 = map1_dataDx + (int4)(1); + int4 map1_dataDy1 = map1_dataDy + (int4)(1); - int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset; + int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset; int4 src_StartD = src_StartU + src_step; /* //not using the vload - int4 src_StartU1 = src_StartU + 1; - int4 src_StartD1 = src_StartD + 1; + int4 src_StartU1 = src_StartU + (int4)(1); + int4 src_StartD1 = src_StartD + (int4)(1); float4 a, b, c, d; a.x = *(src_StartU.x + src); @@ -754,14 +754,14 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * c = (float4)(aD.x, bD.x, cD.x, dD.x); d = (float4)(aD.y, bD.y, cD.y, dD.y); - int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); - int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); - int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); - int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - a = (convert_float4(ac) == 0)? a : val; - b = (convert_float4(bc) == 0)? b : val; - c = (convert_float4(cc) == 0)? c : val; - d = (convert_float4(dc) == 0)? d : val; + int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0)); + int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0)); + int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0)); + int4 dc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDy1 < (int4)(0)); + a = (convert_float4(ac) == (float4)(0))? a : val; + b = (convert_float4(bc) == (float4)(0))? b : val; + c = (convert_float4(cc) == (float4)(0))? c : val; + d = (convert_float4(dc) == (float4)(0))? d : val; float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; @@ -769,7 +769,7 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * float4 dVal = *D; int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows); - dst_data = (convert_float4(con) != 0) ? dst_data : dVal; + dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal; *D = dst_data; } @@ -798,7 +798,7 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const * map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset; + int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) << (int4)(2)) + src_offset; float4 src_data; src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); @@ -818,7 +818,7 @@ __kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const * float4 dVal = *d; int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); - dst_data = (convert_float4(con) != 0) ? dst_data : dVal; + dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal; *d = dst_data; @@ -918,7 +918,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only im __global uchar4* d = (__global uchar4 *)(dst + dstStart); uchar4 dVal = *d; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; + dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal; *d = dst_data; } diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index 2a2d1f7..53c2821 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -588,6 +588,13 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern sprintf(compile_option, "-D GENTYPE=int"); args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); break; + case 2: + sprintf(compile_option, "-D GENTYPE=int2"); + cl_int2 i2val; + i2val.s[0] = val.ival.s[0]; + i2val.s[1] = val.ival.s[1]; + args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val )); + break; case 4: sprintf(compile_option, "-D GENTYPE=int4"); args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); diff --git a/modules/ocl/src/threadsafe.cpp b/modules/ocl/src/threadsafe.cpp index 59b3d7b..9d952d3 100644 --- a/modules/ocl/src/threadsafe.cpp +++ b/modules/ocl/src/threadsafe.cpp @@ -44,7 +44,7 @@ //M*/ #include "precomp.hpp" -#include "threadsafe.h" +#include "Threadsafe.h" CriticalSection::CriticalSection() { diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp index 90ff0b4..ff2f441 100644 --- a/modules/ocl/test/test_imgproc.cpp +++ b/modules/ocl/test/test_imgproc.cpp @@ -958,7 +958,7 @@ TEST_P(Remap, Mat) if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1)) { cout << "LINEAR don't support the map1Type and map2Type" << endl; - return; + return; } int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};