From: Ilya Lavrenov
Date: Thu, 12 Jun 2014 16:10:16 +0000 (+0400)
Subject: optimization of cv::warpAffine INTER_CUBIC
X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~312^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=93712e913a2af63e4954df206659618d8af5ebba;p=profile%2Fivi%2Fopencv.git

optimization of cv::warpAffine INTER_CUBIC
---

diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index c946afc..34cd2c8 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -4187,7 +4187,8 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
 
     int scalarcn = cn == 3 ? 4 : cn;
-    int wdepth = interpolation == INTER_NEAREST ? depth : std::max(CV_32S, depth);
+    bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR);
+    int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
     int sctype = CV_MAKETYPE(wdepth, scalarcn);
 
     ocl::Kernel k;
diff --git a/modules/imgproc/src/opencl/warp_affine.cl b/modules/imgproc/src/opencl/warp_affine.cl
index bb041d1..8ee34d0 100644
--- a/modules/imgproc/src/opencl/warp_affine.cl
+++ b/modules/imgproc/src/opencl/warp_affine.cl
@@ -61,6 +61,7 @@
 #define AB_SCALE (1 << AB_BITS)
 #define INTER_REMAP_COEF_BITS 15
 #define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
+#define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))
 
 #define noconvert
 
@@ -122,6 +123,14 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
 
 #elif defined INTER_LINEAR
 
+__constant float coeffs[64] =
+{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
+  0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
+  0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
+  0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
+  0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
+  0.062500f, 0.937500f, 0.031250f, 0.968750f };
+
 __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                          __constant CT * M, ST scalar_)
@@ -131,24 +140,21 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
 
     if (dx < dst_cols)
     {
-        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
-        int tmp = (dx << AB_BITS);
+        int tmp = dx << AB_BITS;
         int X0_ = rint(M[0] * tmp);
         int Y0_ = rint(M[3] * tmp);
 
         for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
         {
-            int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;
-            int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;
+            int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
+            int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
             X0 = X0 >> (AB_BITS - INTER_BITS);
             Y0 = Y0 >> (AB_BITS - INTER_BITS);
 
-            short sx = convert_short_sat(X0 >> INTER_BITS);
-            short sy = convert_short_sat(Y0 >> INTER_BITS);
-            short ax = convert_short(X0 & (INTER_TAB_SIZE-1));
-            short ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
+            short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
+            short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
 
+#if defined AMD_DEVICE || depth > 4
             WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
             if (sx >= 0 && sx < src_cols)
             {
@@ -180,15 +186,57 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
             storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
 #else
             float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
-            WT val = fma(v0, tabx2 * taby2, fma(v1, tabx * taby2, fma(v2, tabx2 * taby, v3 * tabx * taby)));
+            WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
             storepix(convertToT(val), dstptr + dst_index);
 #endif
+#else // INTEL_DEVICE
+            __constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);
+
+            int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
+            int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
+
+            WT sum = (WT)(0), xsum;
+            #pragma unroll
+            for (int y = 0; y < 2; y++)
+            {
+                src_index = mad24(y, src_step, src_index0);
+                if (sy + y >= 0 && sy + y < src_rows)
+                {
+                    xsum = (WT)(0);
+                    if (sx >= 0 && sx + 2 < src_cols)
+                    {
+#if depth == 0 && cn == 1
+                        uchar2 value = vload2(0, srcptr + src_index);
+                        xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
+#else
+                        #pragma unroll
+                        for (int x = 0; x < 2; x++)
+                            xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
+#endif
+                    }
+                    else
+                    {
+                        #pragma unroll
+                        for (int x = 0; x < 2; x++)
+                            xsum = fma(sx + x >= 0 && sx + x < src_cols ?
+                                       convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
+                    }
+                    sum = fma(xsum, coeffs_y[y], sum);
+                }
+                else
+                    sum = fma(scalar, coeffs_y[y], sum);
+            }
+
+            storepix(convertToT(sum), dstptr + dst_index);
+#endif
         }
     }
 }
 
 #elif defined INTER_CUBIC
 
+#ifdef AMD_DEVICE
+
 inline void interpolateCubic( float x, float* coeffs )
 {
     const float A = -0.75f;
@@ -199,6 +247,23 @@ inline void interpolateCubic( float x, float* coeffs )
     coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
 }
 
+#else
+
+__constant float coeffs[128] =
+    { 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
+      -0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
+      -0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
+      -0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
+      -0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
+      -0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
+      -0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
+      -0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
+      -0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
+      -0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
+      -0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };
+
+#endif
+
 __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                          __constant CT * M, ST scalar_)
@@ -208,22 +273,17 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
 
     if (dx < dst_cols && dy < dst_rows)
     {
-        int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
-
         int tmp = (dx << AB_BITS);
-        int X0 = rint(M[0] * tmp);
-        int Y0 = rint(M[3] * tmp);
+        int X0 = rint(M[0] * tmp) + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
+        int Y0 = rint(M[3] * tmp) + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
 
-        X0 += rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;
         X0 = X0 >> (AB_BITS - INTER_BITS);
         Y0 = Y0 >> (AB_BITS - INTER_BITS);
 
-        int sx = (short)(X0 >> INTER_BITS) - 1;
-        int sy = (short)(Y0 >> INTER_BITS) - 1;
-        int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
-        int ax = (short)(X0 & (INTER_TAB_SIZE-1));
+        int sx = (short)(X0 >> INTER_BITS) - 1, sy = (short)(Y0 >> INTER_BITS) - 1;
+        int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));
 
+#ifdef AMD_DEVICE
         WT v[16];
         #pragma unroll
         for (int y = 0; y < 4; y++)
@@ -270,6 +330,46 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
             sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
         storepix(convertToT( sum ), dstptr + dst_index);
 #endif
+#else // INTEL_DEVICE
+        __constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);
+
+        int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
+        int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
+
+        WT sum = (WT)(0), xsum;
+        #pragma unroll
+        for (int y = 0; y < 4; y++)
+        {
+            src_index = mad24(y, src_step, src_index0);
+            if (sy + y >= 0 && sy + y < src_rows)
+            {
+                xsum = (WT)(0);
+                if (sx >= 0 && sx + 4 < src_cols)
+                {
+#if depth == 0 && cn == 1
+                    uchar4 value = vload4(0, srcptr + src_index);
+                    xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
+#else
+                    #pragma unroll
+                    for (int x = 0; x < 4; x++)
+                        xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
+#endif
+                }
+                else
+                {
+                    #pragma unroll
+                    for (int x = 0; x < 4; x++)
+                        xsum = fma(sx + x >= 0 && sx + x < src_cols ?
+                                   convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
+                }
+                sum = fma(xsum, coeffs_y[y], sum);
+            }
+            else
+                sum = fma(scalar, coeffs_y[y], sum);
+        }
+
+        storepix(convertToT(sum), dstptr + dst_index);
+#endif
     }
 }
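Note on the new __constant tables: the 64-entry linear table holds the weight pairs (1 - x, x) and the 128-entry cubic table holds the four bicubic weights of interpolateCubic() with A = -0.75f, both tabulated at the fractional offsets x = i / INTER_TAB_SIZE, so the non-AMD path can index precomputed float coefficients instead of recomputing them per pixel. A minimal host-side C sketch that regenerates both tables, assuming OpenCV's default INTER_BITS == 5 (INTER_TAB_SIZE == 32, which matches the table sizes 32*2 = 64 and 32*4 = 128); the program is illustrative only, not part of the patch:

    /* Regenerate the coefficient tables used by the Intel branches above. */
    #include <stdio.h>

    #define INTER_BITS     5                 /* assumption: OpenCV default */
    #define INTER_TAB_SIZE (1 << INTER_BITS)

    /* Bicubic weight function as used by OpenCV (A = -0.75f, cf. interpolateCubic in warp_affine.cl). */
    static void interpolateCubic(float x, float *coeffs)
    {
        const float A = -0.75f;

        coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
        coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
        coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
        coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
    }

    int main(void)
    {
        for (int i = 0; i < INTER_TAB_SIZE; i++)
        {
            float x = (float)i / INTER_TAB_SIZE;
            float c[4];

            /* linear table entry i: weights of the left and right neighbour */
            printf("linear[%2d] = %ff, %ff\n", i, 1.f - x, x);

            /* cubic table entry i: weights of the four taps around the sample */
            interpolateCubic(x, c);
            printf("cubic [%2d] = %ff, %ff, %ff, %ff\n", i, c[0], c[1], c[2], c[3]);
        }
        return 0;
    }

Running this reproduces the constants above to the printed precision; the kernel then selects a row via the coeffs_x / coeffs_y pointers (ax << 1, ay << 1 for linear, ax << 2, ay << 2 for cubic). The AMD_DEVICE / INTEL_DEVICE selection presumably mirrors the new !dev.isAMD() check in ocl_warpTransform; the corresponding build-option change is not part of the hunks shown here.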