From 2cc4cf3644d4794dd56c1df60623b098d508523a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 30 May 2014 20:12:22 +0400 Subject: [PATCH] optimized cv::warpAffine --- modules/imgproc/src/imgwarp.cpp | 21 +++-- modules/imgproc/src/opencl/warp_affine.cl | 152 +++++++++++++++++------------- 2 files changed, 100 insertions(+), 73 deletions(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index a0162d5..c5cda28 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -4166,11 +4166,12 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, int op_type) { CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE); + const ocl::Device & dev = ocl::Device::getDefault(); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - double doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + double doubleSupport = dev.doubleFPConfig() > 0; - int interpolation = flags & INTER_MAX; + int interpolation = flags & INTER_MAX, rowsPerWI = dev.isIntel() && interpolation <= INTER_LINEAR ? 4 : 1; if( interpolation == INTER_AREA ) interpolation = INTER_LINEAR; @@ -4192,30 +4193,30 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, String opts; if (interpolation == INTER_NEAREST) { - opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d", ocl::typeToStr(type), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", + opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d", + ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(CV_MAT_DEPTH(type)), - ocl::typeToStr(sctype), - cn); + ocl::typeToStr(sctype), cn, rowsPerWI); } else { char cvt[2][50]; - opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s -D cn=%d", + opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d" + " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d", interpolationMap[interpolation], ocl::typeToStr(type), ocl::typeToStr(CV_MAT_DEPTH(type)), ocl::typeToStr(sctype), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth, ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), ocl::convertTypeStr(wdepth, depth, cn, cvt[1]), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn); + doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI); } k.create(kernelName, program, opts); if (k.empty()) return false; - double borderBuf[] = {0, 0, 0, 0}; + double borderBuf[] = { 0, 0, 0, 0 }; scalarToRawData(borderValue, borderBuf, sctype); UMat src = _src.getUMat(), M0; @@ -4250,7 +4251,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0), ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype))); - size_t globalThreads[2] = { dst.cols, dst.rows }; + size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalThreads, NULL, false); } diff --git a/modules/imgproc/src/opencl/warp_affine.cl b/modules/imgproc/src/opencl/warp_affine.cl index 028e873..bb041d1 100644 --- a/modules/imgproc/src/opencl/warp_affine.cl +++ b/modules/imgproc/src/opencl/warp_affine.cl @@ -91,29 +91,32 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of __constant CT * M, ST scalar_) { int dx = get_global_id(0); - int dy = get_global_id(1); + int dy0 = get_global_id(1) * rowsPerWI; - if (dx < dst_cols && dy < dst_rows) + if (dx < dst_cols) { int round_delta = (AB_SCALE >> 1); - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - - short sx = convert_short_sat(X0 >> AB_BITS); - short sy = convert_short_sat(Y0 >> AB_BITS); + int X0_ = rint(M[0] * dx * AB_SCALE); + int Y0_ = rint(M[3] * dx * AB_SCALE); + int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset)); - int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize); - - if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) + for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step) { - int src_index = mad24(sy, src_step, src_offset + sx * pixsize); - storepix(loadpix(srcptr + src_index), dstptr + dst_index); + int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta; + int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta; + + short sx = convert_short_sat(X0 >> AB_BITS); + short sy = convert_short_sat(Y0 >> AB_BITS); + + if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) + { + int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset)); + storepix(loadpix(srcptr + src_index), dstptr + dst_index); + } + else + storepix(scalar, dstptr + dst_index); } - else - storepix(scalar, dstptr + dst_index); } } @@ -124,52 +127,63 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of __constant CT * M, ST scalar_) { int dx = get_global_id(0); - int dy = get_global_id(1); + int dy0 = get_global_id(1) * rowsPerWI; - if (dx < dst_cols && dy < dst_rows) + if (dx < dst_cols) { int round_delta = AB_SCALE/INTER_TAB_SIZE/2; int tmp = (dx << AB_BITS); - int X0 = rint(M[0] * tmp); - int Y0 = rint(M[3] * tmp); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - X0 = X0 >> (AB_BITS - INTER_BITS); - Y0 = Y0 >> (AB_BITS - INTER_BITS); - - short sx = convert_short_sat(X0 >> INTER_BITS); - short sy = convert_short_sat(Y0 >> INTER_BITS); - short ax = convert_short(X0 & (INTER_TAB_SIZE-1)); - short ay = convert_short(Y0 & (INTER_TAB_SIZE-1)); - - WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? - convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar; - WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? - convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar; - WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? - convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar; - WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? - convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar; + int X0_ = rint(M[0] * tmp); + int Y0_ = rint(M[3] * tmp); - float taby = 1.f/INTER_TAB_SIZE*ay; - float tabx = 1.f/INTER_TAB_SIZE*ax; - - int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize); + for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy) + { + int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta; + int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta; + X0 = X0 >> (AB_BITS - INTER_BITS); + Y0 = Y0 >> (AB_BITS - INTER_BITS); + + short sx = convert_short_sat(X0 >> INTER_BITS); + short sy = convert_short_sat(Y0 >> INTER_BITS); + short ax = convert_short(X0 & (INTER_TAB_SIZE-1)); + short ay = convert_short(Y0 & (INTER_TAB_SIZE-1)); + + WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar; + if (sx >= 0 && sx < src_cols) + { + if (sy >= 0 && sy < src_rows) + v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset)))); + if (sy+1 >= 0 && sy+1 < src_rows) + v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset)))); + } + if (sx+1 >= 0 && sx+1 < src_cols) + { + if (sy >= 0 && sy < src_rows) + v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset)))); + if (sy+1 >= 0 && sy+1 < src_rows) + v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset)))); + } + + float taby = 1.f/INTER_TAB_SIZE*ay; + float tabx = 1.f/INTER_TAB_SIZE*ax; + + int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset)); #if depth <= 4 - int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ); - int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ); - int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ); - int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE ); + int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ); + int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ); + int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ); + int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE ); - WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; - storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index); + WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3))); + storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index); #else - float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby; - WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby; - storepix(convertToT(val), dstptr + dst_index); + float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby; + WT val = fma(v0, tabx2 * taby2, fma(v1, tabx * taby2, fma(v2, tabx2 * taby, v3 * tabx * taby))); + storepix(convertToT(val), dstptr + dst_index); #endif + } } } @@ -179,9 +193,9 @@ inline void interpolateCubic( float x, float* coeffs ) { const float A = -0.75f; - coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A; - coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f; - coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f; + coeffs[0] = fma(fma(fma(A, (x + 1.f), - 5.0f*A), (x + 1.f), 8.0f*A), x + 1.f, - 4.0f*A); + coeffs[1] = fma(fma(A + 2.f, x, - (A + 3.f)), x*x, 1.f); + coeffs[2] = fma(fma(A + 2.f, 1.f - x, - (A + 3.f)), (1.f - x)*(1.f - x), 1.f); coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; } @@ -199,8 +213,9 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of int tmp = (dx << AB_BITS); int X0 = rint(M[0] * tmp); int Y0 = rint(M[3] * tmp); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; + + X0 += rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta; + Y0 += rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta; X0 = X0 >> (AB_BITS - INTER_BITS); Y0 = Y0 >> (AB_BITS - INTER_BITS); @@ -212,10 +227,21 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of WT v[16]; #pragma unroll for (int y = 0; y < 4; y++) - #pragma unroll - for (int x = 0; x < 4; x++) - v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ? - convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar; + { + if (sy+y >= 0 && sy+y < src_rows) + { + #pragma unroll + for (int x = 0; x < 4; x++) + v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ? + convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar; + } + else + { + #pragma unroll + for (int x = 0; x < 4; x++) + v[mad24(y, 4, x)] = scalar; + } + } float tab1y[4], tab1x[4]; @@ -224,7 +250,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of interpolateCubic(ayy, tab1y); interpolateCubic(axx, tab1x); - int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize); + int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset)); WT sum = (WT)(0); #if depth <= 4 @@ -236,12 +262,12 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of #pragma unroll for (int i = 0; i < 16; i++) - sum += v[i] * itab[i]; + sum = mad24(v[i], itab[i], sum); storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index); #else #pragma unroll for (int i = 0; i < 16; i++) - sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)]; + sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum); storepix(convertToT( sum ), dstptr + dst_index); #endif } -- 2.7.4