From cc237b7aba0e70498ad3ebb7f4c579875e2cf20a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 15 Nov 2013 13:48:14 +0400 Subject: [PATCH] generalized ocl::resize for all data types (INTER_NEAREST mode) --- modules/ocl/src/imgproc.cpp | 115 +++++++------------ modules/ocl/src/opencl/imgproc_resize.cl | 185 +++++++------------------------ 2 files changed, 81 insertions(+), 219 deletions(-) diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index ebe94d9..141325b 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -282,96 +282,63 @@ namespace cv static void resize_gpu( const oclMat &src, oclMat &dst, double fx, double fy, int interpolation) { - CV_Assert( (src.channels() == dst.channels()) ); - Context *clCxt = src.clCxt; - float ifx = 1. / fx; - float ify = 1. / fy; - double ifx_d = 1. / fx; - double ify_d = 1. / fy; - int srcStep_in_pixel = src.step1() / src.oclchannels(); - int srcoffset_in_pixel = src.offset / src.elemSize(); - int dstStep_in_pixel = dst.step1() / dst.oclchannels(); - int dstoffset_in_pixel = dst.offset / dst.elemSize(); - - string kernelName; - if (interpolation == INTER_LINEAR) - kernelName = "resizeLN"; - else if (interpolation == INTER_NEAREST) - kernelName = "resizeNN"; + float ifx = 1.f / fx, ify = 1.f / fy; + int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize(); + int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize(); + int ocn = interpolation == INTER_LINEAR ? dst.oclchannels() : -1; + int depth = interpolation == INTER_LINEAR ? dst.depth() : -1; + + const char * const interMap[] = { "NN", "LN", "CUBIC", "AREA", "LAN4" }; + std::string kernelName = std::string("resize") + interMap[interpolation]; + + const char * const typeMap[] = { "uchar", "uchar", "ushort", "ushort", "int", "int", "double" }; + const char * const channelMap[] = { "" , "", "2", "4", "4" }; + std::string buildOption = format("-D %s -D T=%s%s", interMap[interpolation], typeMap[dst.depth()], channelMap[dst.oclchannels()]); //TODO: improve this kernel size_t blkSizeX = 16, blkSizeY = 16; size_t glbSizeX; - if (src.type() == CV_8UC1) + if (src.type() == CV_8UC1 && interpolation == INTER_LINEAR) { size_t cols = (dst.cols + dst.offset % 4 + 3) / 4; glbSizeX = cols % blkSizeX == 0 && cols != 0 ? cols : (cols / blkSizeX + 1) * blkSizeX; } else - glbSizeX = dst.cols % blkSizeX == 0 && dst.cols != 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX; + glbSizeX = dst.cols; - size_t glbSizeY = dst.rows % blkSizeY == 0 && dst.rows != 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY; - size_t globalThreads[3] = {glbSizeX, glbSizeY, 1}; - size_t localThreads[3] = {blkSizeX, blkSizeY, 1}; + size_t globalThreads[3] = { glbSizeX, dst.rows, 1 }; + size_t localThreads[3] = { blkSizeX, blkSizeY, 1 }; - vector< pair > args; - if (interpolation == INTER_NEAREST) - { - args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel)); - args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel)); - args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); - if (src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)) - { - args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d)); - args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d)); - } - else - { - args.push_back( make_pair(sizeof(cl_float), (void *)&ifx)); - args.push_back( make_pair(sizeof(cl_float), (void *)&ify)); - } - } - else - { - args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel)); - args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel)); - args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( make_pair(sizeof(cl_float), (void *)&ifx)); - args.push_back( make_pair(sizeof(cl_float), (void *)&ify)); - } + std::vector< std::pair > args; + args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset)); + args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&src_step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); + args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); + args.push_back( make_pair(sizeof(cl_float), (void *)&ifx)); + args.push_back( make_pair(sizeof(cl_float), (void *)&ify)); - openCLExecuteKernel(clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth()); + openCLExecuteKernel(src.clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args, + ocn, depth, buildOption.c_str()); } - void resize(const oclMat &src, oclMat &dst, Size dsize, - double fx, double fy, int interpolation) + void resize(const oclMat &src, oclMat &dst, Size dsize, double fx, double fy, int interpolation) { CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4 || src.type() == CV_32FC1 || src.type() == CV_32FC3 || src.type() == CV_32FC4); CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST); - CV_Assert( src.size().area() > 0 ); - CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) ); - - if (!(dsize == Size()) && (fx > 0 && fy > 0)) - if (dsize.width != (int)(src.cols * fx) || dsize.height != (int)(src.rows * fy)) - CV_Error(CV_StsUnmatchedSizes, "invalid dsize and fx, fy!"); + CV_Assert(dsize.area() > 0 || (fx > 0 && fy > 0)); - if ( dsize == Size() ) + if (dsize.area() == 0) + { dsize = Size(saturate_cast(src.cols * fx), saturate_cast(src.rows * fy)); + CV_Assert(dsize.area() > 0); + } else { fx = (double)dsize.width / src.cols; @@ -380,13 +347,7 @@ namespace cv dst.create(dsize, src.type()); - if ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ) - { - resize_gpu( src, dst, fx, fy, interpolation); - return; - } - - CV_Error(CV_StsUnsupportedFormat, "Non-supported interpolation method"); + resize_gpu( src, dst, fx, fy, interpolation); } //////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/opencl/imgproc_resize.cl b/modules/ocl/src/opencl/imgproc_resize.cl index 0d4cbed..4af9000 100644 --- a/modules/ocl/src/opencl/imgproc_resize.cl +++ b/modules/ocl/src/opencl/imgproc_resize.cl @@ -45,7 +45,7 @@ // resize kernel -// Currently, CV_8UC1 CV_8UC4 CV_32FC1 and CV_32FC4are supported. +// Currently, CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 are supported. // We shall support other types later if necessary. #ifdef DOUBLE_SUPPORT @@ -54,20 +54,18 @@ #elif defined (cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif -#define F double -#else -#define F float #endif - #define INTER_RESIZE_COEF_BITS 11 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1) #define CAST_SCALE (1.0f/(1<= (l) ? (x):((x)+1)) +#ifdef LN + __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restrict src, - int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, + int dst_offset, int src_offset,int dst_step, int src_step, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int gx = get_global_id(0); @@ -75,7 +73,7 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri float4 sx, u, xf; int4 x, DX; - gx = (gx<<2) - (dstoffset_in_pixel&3); + gx = (gx<<2) - (dst_offset&3); DX = (int4)(gx, gx+1, gx+2, gx+3); sx = (convert_float4(DX) + 0.5f) * ifx - 0.5f; xf = floor(sx); @@ -113,10 +111,10 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri int4 val1, val2, val; int4 sdata1, sdata2, sdata3, sdata4; - int4 pos1 = mad24((int4)y, (int4)srcstep_in_pixel, x+(int4)srcoffset_in_pixel); - int4 pos2 = mad24((int4)y, (int4)srcstep_in_pixel, x_+(int4)srcoffset_in_pixel); - int4 pos3 = mad24((int4)y_, (int4)srcstep_in_pixel, x+(int4)srcoffset_in_pixel); - int4 pos4 = mad24((int4)y_, (int4)srcstep_in_pixel, x_+(int4)srcoffset_in_pixel); + int4 pos1 = mad24((int4)y, (int4)src_step, x+(int4)src_offset); + int4 pos2 = mad24((int4)y, (int4)src_step, x_+(int4)src_offset); + int4 pos3 = mad24((int4)y_, (int4)src_step, x+(int4)src_offset); + int4 pos4 = mad24((int4)y_, (int4)src_step, x_+(int4)src_offset); sdata1.s0 = src[pos1.s0]; sdata1.s1 = src[pos1.s1]; @@ -144,12 +142,12 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS); - pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel); + pos4 = mad24(dy, dst_step, gx+dst_offset); pos4.y++; pos4.z+=2; pos4.w+=3; uchar4 uval = convert_uchar4_sat(val); - int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0); + int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dst_offset&3)==0); if(con) { *(__global uchar4*)(dst + pos4.x)=uval; @@ -176,7 +174,7 @@ __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restri } __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src, - int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, + int dst_offset, int src_offset,int dst_step, int src_step, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int dx = get_global_id(0); @@ -202,24 +200,24 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src, int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); int4 srcpos; - srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); - srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); - srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); - srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); + srcpos.x = mad24(y, src_step, x+src_offset); + srcpos.y = mad24(y, src_step, x_+src_offset); + srcpos.z = mad24(y_, src_step, x+src_offset); + srcpos.w = mad24(y_, src_step, x_+src_offset); int4 data0 = convert_int4(src[srcpos.x]); int4 data1 = convert_int4(src[srcpos.y]); int4 data2 = convert_int4(src[srcpos.z]); int4 data3 = convert_int4(src[srcpos.w]); int4 val = mul24((int4)mul24(U1, V1) , data0) + mul24((int4)mul24(U, V1) , data1) +mul24((int4)mul24(U1, V) , data2)+mul24((int4)mul24(U, V) , data3); - int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel); + int dstpos = mad24(dy, dst_step, dx+dst_offset); uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS); if(dx>=0 && dx=0 && dy=0 && dx=0 && dy=0 && dx=0 && dy= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dstoffset_in_pixel&3)==0); - if(con) - { - *(__global uchar4*)(dst + pos.x)=val; - } - else - { - if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows) - { - dst[pos.x]=val.x; - } - if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows) - { - dst[pos.y]=val.y; - } - if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows) - { - dst[pos.z]=val.z; - } - if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows) - { - dst[pos.w]=val.w; - } - } -} - -__kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src, - int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, - int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - F s1 = dx*ifx; - F s2 = dy*ify; - int sx = fmin((float)floor(s1), (float)src_cols-1); - int sy = fmin((float)floor(s2), (float)src_rows-1); - int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel); - int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel); - - if(dx>=0 && dx=0 && dy=0 && dx=0 && dy=0 && dx=0 && dy