From b72d19675353b909ac5344e4d64495962b31fc13 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Wed, 23 Nov 2016 12:57:33 +0800
Subject: [PATCH] optimization for warpAffine and warpPerspective

Add new OpenCL kernels for warpAffine and warpPerspective. The average
performance improvement is about 30%. The new kernels require the
CV_8UC1 format and support nearest neighbor and bilinear interpolation.

Signed-off-by: Li Peng
---
 modules/imgproc/src/imgwarp.cpp              |  85 +++++++++++
 modules/imgproc/src/opencl/warp_transform.cl | 214 +++++++++++++++++++++++++++
 modules/imgproc/test/ocl/test_warp.cpp       | 106 +++++++++++++
 3 files changed, 405 insertions(+)
 create mode 100644 modules/imgproc/src/opencl/warp_transform.cl
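Usage sketch (illustrative note, not part of the patch): the cols4 path is
only taken when the default OpenCL device is an Intel one, the source and
destination are 8-bit single-channel (CV_8UC1), the destination is a UMat
whose width is a multiple of 4, the border mode is BORDER_CONSTANT, the
interpolation is nearest or bilinear, and the source rows/cols fit in
SHRT_MAX; otherwise the existing ocl_warpTransform() or the CPU path runs.
A minimal host-side call that should satisfy those checks (the sizes and
transform below are arbitrary examples, not taken from the patch):

    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>

    int main()
    {
        cv::UMat src(480, 640, CV_8UC1), dst;      // 8-bit, single channel
        cv::randu(src, 0, 255);
        cv::Mat M = cv::getRotationMatrix2D(cv::Point2f(320.f, 240.f), 30.0, 1.0);
        // dsize width (640) is a multiple of 4, so ocl_warpTransform_cols4()
        // is eligible on an Intel OpenCL device.
        cv::warpAffine(src, dst, M, src.size(), cv::INTER_LINEAR,
                       cv::BORDER_CONSTANT, cv::Scalar::all(0));
        return 0;
    }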
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 0d0fb72..943a515 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -5647,6 +5647,81 @@ private:
 
 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
 
+static bool ocl_warpTransform_cols4(InputArray _src, OutputArray _dst, InputArray _M0,
+                                    Size dsize, int flags, int borderType, const Scalar& borderValue,
+                                    int op_type)
+{
+    CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
+    const ocl::Device & dev = ocl::Device::getDefault();
+    int type = _src.type(), dtype = _dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    int interpolation = flags & INTER_MAX;
+    if( interpolation == INTER_AREA )
+        interpolation = INTER_LINEAR;
+
+    if ( !dev.isIntel() || !(type == CV_8UC1) ||
+         !(dtype == CV_8UC1) || !(_dst.cols() % 4 == 0) ||
+         !(borderType == cv::BORDER_CONSTANT &&
+          (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR)))
+        return false;
+
+    const char * const warp_op[2] = { "Affine", "Perspective" };
+    const char * const interpolationMap[3] = { "nearest", "linear", NULL };
+    ocl::ProgramSource program = ocl::imgproc::warp_transform_oclsrc;
+    String kernelName = format("warp%s_%s_8u", warp_op[op_type], interpolationMap[interpolation]);
+
+    bool is32f = (interpolation == INTER_LINEAR);
+    int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
+    int sctype = CV_MAKETYPE(wdepth, cn);
+
+    ocl::Kernel k;
+    String opts = format("-D ST=%s", ocl::typeToStr(sctype));
+
+    k.create(kernelName.c_str(), program, opts);
+    if (k.empty())
+        return false;
+
+    float borderBuf[] = { 0, 0, 0, 0 };
+    scalarToRawData(borderValue, borderBuf, sctype);
+
+    UMat src = _src.getUMat(), M0;
+    _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
+    UMat dst = _dst.getUMat();
+
+    float M[9];
+    int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
+    Mat matM(matRows, 3, CV_32F, M), M1 = _M0.getMat();
+    CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) && M1.rows == matRows && M1.cols == 3 );
+    M1.convertTo(matM, matM.type());
+
+    if( !(flags & WARP_INVERSE_MAP) )
+    {
+        if (op_type == OCL_OP_PERSPECTIVE)
+            invert(matM, matM);
+        else
+        {
+            float D = M[0]*M[4] - M[1]*M[3];
+            D = D != 0 ? 1.f/D : 0;
+            float A11 = M[4]*D, A22=M[0]*D;
+            M[0] = A11; M[1] *= -D;
+            M[3] *= -D; M[4] = A22;
+            float b1 = -M[0]*M[2] - M[1]*M[5];
+            float b2 = -M[3]*M[2] - M[4]*M[5];
+            M[2] = b1; M[5] = b2;
+        }
+    }
+    matM.convertTo(M0, CV_32F);
+
+    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
+           ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
+
+    size_t globalThreads[2];
+    globalThreads[0] = (size_t)(dst.cols / 4);
+    globalThreads[1] = (size_t)dst.rows;
+
+    return k.run(2, globalThreads, NULL, false);
+}
+
 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
                               Size dsize, int flags, int borderType, const Scalar& borderValue,
                               int op_type)
@@ -5786,6 +5861,11 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
 {
     CV_INSTRUMENT_REGION()
 
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
+               _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
+               ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType,
+                                       borderValue, OCL_OP_AFFINE))
+
     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
                                  borderValue, OCL_OP_AFFINE))
@@ -6312,6 +6392,11 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
 
     CV_Assert( _src.total() > 0 );
 
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
+               _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
+               ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType, borderValue,
+                                       OCL_OP_PERSPECTIVE))
+
     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
                                  OCL_OP_PERSPECTIVE))
diff --git a/modules/imgproc/src/opencl/warp_transform.cl b/modules/imgproc/src/opencl/warp_transform.cl
new file mode 100644
index 0000000..bfe35f6
--- /dev/null
+++ b/modules/imgproc/src/opencl/warp_transform.cl
@@ -0,0 +1,214 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+__constant short4 vec_offset = (short4)(0, 1, 2, 3);
+
+#define GET_VAL(x, y) ((x) < 0 || (x) >= src_cols || (y) < 0 || (y) >= src_rows) ? \
+                      scalar : src[src_offset + y * src_step + x]
+
+__kernel void warpAffine_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                                    __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                    __constant float * M, ST scalar_)
+{
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    uchar scalar = convert_uchar_sat_rte(scalar_);
+
+    if (x >= dst_cols || y >= dst_rows) return;
+
+    /* { M0, M1, M2 }
+     * { M3, M4, M5 }
+     */
+
+    short4 new_x, new_y;
+    new_x = convert_short4_sat_rte(M[0] * convert_float4(vec_offset + (short4)(x)) +
+                                   M[1] * convert_float4((short4)y) + M[2]);
+
+    new_y = convert_short4_sat_rte(M[3] * convert_float4(vec_offset + (short4)(x)) +
+                                   M[4] * convert_float4((short4)y) + M[5]);
+
+    uchar4 pix = (uchar4)scalar;
+
+    pix.s0 = GET_VAL(new_x.s0, new_y.s0);
+    pix.s1 = GET_VAL(new_x.s1, new_y.s1);
+    pix.s2 = GET_VAL(new_x.s2, new_y.s2);
+    pix.s3 = GET_VAL(new_x.s3, new_y.s3);
+
+    int dst_index = x + y * dst_step + dst_offset;
+
+    vstore4(pix, 0, dst + dst_index);
+}
+
+uchar4 read_pixels(__global const uchar * src, short tx, short ty,
+                   int src_offset, int src_step, int src_cols, int src_rows,
+                   uchar scalar)
+{
+    uchar2 pt, pb;
+    short bx, by;
+
+    bx = tx + 1;
+    by = ty + 1;
+
+    if (tx >= 0 && (tx + 1) < src_cols && ty >= 0 && ty < src_rows)
+    {
+        pt = vload2(0, src + src_offset + ty * src_step + tx);
+    }
+    else
+    {
+        pt.s0 = GET_VAL(tx, ty);
+        pt.s1 = GET_VAL(bx, ty);
+    }
+
+    if (tx >= 0 && (tx + 1) < src_cols && by >= 0 && by < src_rows)
+    {
+        pb = vload2(0, src + src_offset + by * src_step + tx);
+    }
+    else
+    {
+        pb.s0 = GET_VAL(tx, by);
+        pb.s1 = GET_VAL(bx, by);
+    }
+
+    return (uchar4)(pt, pb);
+}
+
+__kernel void warpAffine_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                                   __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                   __constant float * M, ST scalar_)
+{
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    uchar scalar = convert_uchar_sat_rte(scalar_);
+
+    if (x >= dst_cols || y >= dst_rows) return;
+
+    /* { M0, M1, M2 }
+     * { M3, M4, M5 }
+     */
+
+    float4 nx, ny;
+    nx = M[0] * convert_float4((vec_offset + (short4)x)) + M[1] * convert_float4((short4)y) + M[2];
+    ny = M[3] * convert_float4((vec_offset + (short4)x)) + M[4] * convert_float4((short4)y) + M[5];
+
+    float4 s, t;
+    s = round((nx - floor(nx)) * 32.0f) / 32.0f;
+    t = round((ny - floor(ny)) * 32.0f) / 32.0f;
+
+    short4 tx, ty;
+    tx = convert_short4_sat_rtn(nx);
+    ty = convert_short4_sat_rtn(ny);
+
+    uchar4 pix[4];
+    pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar);
+
+    float4 tl, tr, bl, br;
+    tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0));
+    tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1));
+    bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2));
+    br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3));
+
+    float4 pixel;
+    pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t;
+
+    int dst_index = x + y * dst_step + dst_offset;
+    vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index);
+}
+
+__kernel void warpPerspective_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                         __constant float * M, ST scalar_)
+{
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    uchar scalar = convert_uchar_sat_rte(scalar_);
+
+    if (x >= dst_cols || y >= dst_rows) return;
+
+    /* { M0, M1, M2 }
+     * { M3, M4, M5 }
+     * { M6, M7, M8 }
+     */
+
+    float4 nx, ny, nz;
+    nx = M[0] * convert_float4(vec_offset + (short4)(x)) +
+         M[1] * convert_float4((short4)y) + M[2];
+
+    ny = M[3] * convert_float4(vec_offset + (short4)(x)) +
+         M[4] * convert_float4((short4)y) + M[5];
+
+    nz = M[6] * convert_float4(vec_offset + (short4)(x)) +
+         M[7] * convert_float4((short4)y) + M[8];
+
+    short4 new_x, new_y;
+    float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);
+    new_x = convert_short4_sat_rte(nx * fz);
+    new_y = convert_short4_sat_rte(ny * fz);
+
+    uchar4 pix = (uchar4)scalar;
+
+    pix.s0 = GET_VAL(new_x.s0, new_y.s0);
+    pix.s1 = GET_VAL(new_x.s1, new_y.s1);
+    pix.s2 = GET_VAL(new_x.s2, new_y.s2);
+    pix.s3 = GET_VAL(new_x.s3, new_y.s3);
+
+    int dst_index = x + y * dst_step + dst_offset;
+
+    vstore4(pix, 0, dst + dst_index);
+}
+
+__kernel void warpPerspective_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                                        __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                        __constant float * M, ST scalar_)
+{
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    uchar scalar = convert_uchar_sat_rte(scalar_);
+
+    if (x >= dst_cols || y >= dst_rows) return;
+
+    /* { M0, M1, M2 }
+     * { M3, M4, M5 }
+     * { M6, M7, M8 }
+     */
+
+    float4 nx, ny, nz;
+    nx = M[0] * convert_float4(vec_offset + (short4)(x)) + M[1] * convert_float4((short4)y) + M[2];
+
+    ny = M[3] * convert_float4(vec_offset + (short4)(x)) + M[4] * convert_float4((short4)y) + M[5];
+
+    nz = M[6] * convert_float4(vec_offset + (short4)(x)) + M[7] * convert_float4((short4)y) + M[8];
+
+    float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);
+
+    nx = nx * fz;
+    ny = ny * fz;
+
+    float4 s, t;
+    s = round((nx - floor(nx)) * 32.0f) / (float4)32.0f;
+    t = round((ny - floor(ny)) * 32.0f) / (float4)32.0f;
+
+    short4 tx, ty;
+    tx = convert_short4_sat_rtn(nx);
+    ty = convert_short4_sat_rtn(ny);
+
+    uchar4 pix[4];
+    pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar);
+
+    float4 tl, tr, bl, br;
+    tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0));
+    tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1));
+    bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2));
+    br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3));
+
+    float4 pixel;
+    pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t;
+
+    int dst_index = x + y * dst_step + dst_offset;
+    vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index);
+}
diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp
index da70f73..8534658 100644
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -113,6 +113,57 @@ PARAM_TEST_CASE(WarpTestBase, MatType, Interpolation, bool, bool)
     }
 };
 
+PARAM_TEST_CASE(WarpTest_cols4_Base, MatType, Interpolation, bool, bool)
+{
+    int type, interpolation;
+    Size dsize;
+    bool useRoi, mapInverse;
+    int depth;
+
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
+
+    virtual void SetUp()
+    {
+        type = GET_PARAM(0);
+        interpolation = GET_PARAM(1);
+        mapInverse = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+        depth = CV_MAT_DEPTH(type);
+
+        if (mapInverse)
+            interpolation |= WARP_INVERSE_MAP;
+    }
+
+    void random_roi()
+    {
+        dsize = randomSize(1, MAX_VALUE);
+        dsize.width = ((dsize.width >> 2) + 1) * 4;
+
+        Size roiSize = randomSize(1, MAX_VALUE);
+        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
+
+        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, dsize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
+    }
+
+    void Near(double threshold = 0.0)
+    {
+        if (depth < CV_32F)
+            EXPECT_MAT_N_DIFF(dst_roi, udst_roi, cvRound(dst_roi.total()*threshold));
+        else
+            OCL_EXPECT_MATS_NEAR_RELATIVE(dst, threshold);
+    }
+};
+
+/////warpAffine
+
+typedef WarpTestBase WarpAffine;
+
 /////warpAffine
 
 typedef WarpTestBase WarpAffine;
@@ -134,6 +185,25 @@ OCL_TEST_P(WarpAffine, Mat)
     }
 }
 
+typedef WarpTest_cols4_Base WarpAffine_cols4;
+
+OCL_TEST_P(WarpAffine_cols4, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        double eps = depth < CV_32F ? 0.04 : 0.06;
+        random_roi();
+
+        Mat M = getRotationMatrix2D(Point2f(src_roi.cols / 2.0f, src_roi.rows / 2.0f),
+                                    rng.uniform(-180.f, 180.f), rng.uniform(0.4f, 2.0f));
+
+        OCL_OFF(cv::warpAffine(src_roi, dst_roi, M, dsize, interpolation));
+        OCL_ON(cv::warpAffine(usrc_roi, udst_roi, M, dsize, interpolation));
+
+        Near(eps);
+    }
+}
+
 //// warpPerspective
 
 typedef WarpTestBase WarpPerspective;
@@ -161,6 +231,30 @@ OCL_TEST_P(WarpPerspective, Mat)
     }
 }
 
+typedef WarpTest_cols4_Base WarpPerspective_cols4;
+
+OCL_TEST_P(WarpPerspective_cols4, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        double eps = depth < CV_32F ? 0.03 : 0.06;
+        random_roi();
+
+        float cols = static_cast<float>(src_roi.cols), rows = static_cast<float>(src_roi.rows);
+        float cols2 = cols / 2.0f, rows2 = rows / 2.0f;
+        Point2f sp[] = { Point2f(0.0f, 0.0f), Point2f(cols, 0.0f), Point2f(0.0f, rows), Point2f(cols, rows) };
+        Point2f dp[] = { Point2f(rng.uniform(0.0f, cols2), rng.uniform(0.0f, rows2)),
+                         Point2f(rng.uniform(cols2, cols), rng.uniform(0.0f, rows2)),
+                         Point2f(rng.uniform(0.0f, cols2), rng.uniform(rows2, rows)),
+                         Point2f(rng.uniform(cols2, cols), rng.uniform(rows2, rows)) };
+        Mat M = getPerspectiveTransform(sp, dp);
+
+        OCL_OFF(cv::warpPerspective(src_roi, dst_roi, M, dsize, interpolation));
+        OCL_ON(cv::warpPerspective(usrc_roi, udst_roi, M, dsize, interpolation));
+
+        Near(eps);
+    }
+}
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 //// resize
@@ -341,12 +435,24 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpAffine, Combine(
     Bool(),
     Bool()));
 
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpAffine_cols4, Combine(
+    Values((MatType)CV_8UC1),
+    Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR),
+    Bool(),
+    Bool()));
+
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpPerspective, Combine(
     Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
     Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR, (Interpolation)INTER_CUBIC),
     Bool(),
     Bool()));
 
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpPerspective_cols4, Combine(
+    Values((MatType)CV_8UC1),
+    Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR),
+    Bool(),
+    Bool()));
+
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Resize, Combine(
     Values(CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, CV_32FC4),
     Values(0.5, 1.5, 2.0, 0.2),
-- 
2.7.4
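Note on the kernel design (editorial sketch, not part of the patch): each work
item writes four horizontally adjacent output pixels with vstore4, which is why
the host side launches dst.cols / 4 x dst.rows work items and bails out when
the destination width is not a multiple of 4. In the *_linear_8u kernels the
fractional parts of the back-mapped source coordinates are snapped to multiples
of 1/32 before blending, the same 32-step granularity (INTER_BITS == 5) used by
OpenCV's fixed-point warp paths, so the results stay close to the CPU reference
the tests compare against. A scalar C++ sketch of that blend, with hypothetical
helper and variable names:

    #include <cmath>

    // tl, tr, bl, br: the four neighbours around the back-mapped source
    // coordinate (nx, ny), as fetched by read_pixels() in the kernels.
    static float bilinear_1_32(float nx, float ny,
                               float tl, float tr, float bl, float br)
    {
        float s = std::round((nx - std::floor(nx)) * 32.0f) / 32.0f;  // e.g. 0.37 -> 12/32 = 0.375
        float t = std::round((ny - std::floor(ny)) * 32.0f) / 32.0f;
        return tl * (1 - s) * (1 - t) + tr * s * (1 - t)
             + bl * (1 - s) * t       + br * s * t;
    }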