From b72d19675353b909ac5344e4d64495962b31fc13 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Wed, 23 Nov 2016 12:57:33 +0800
Subject: [PATCH] optimization for warpAffine and warpPerspective

Add new OpenCL kernels for warpAffine and warpPerspective. The average
performance improvement is about 30%. The new kernels require the
CV_8UC1 format and support nearest neighbor and bilinear interpolation.

Signed-off-by: Li Peng
---
 modules/imgproc/src/imgwarp.cpp              |  85 +++++++++++
 modules/imgproc/src/opencl/warp_transform.cl | 214 +++++++++++++++++++++++++++
 modules/imgproc/test/ocl/test_warp.cpp       | 106 +++++++++++++
 3 files changed, 405 insertions(+)
 create mode 100644 modules/imgproc/src/opencl/warp_transform.cl
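Usage sketch (illustrative note, not part of the patch): the cols4 path is
only taken when the default OpenCL device is an Intel one, the source and
destination are 8-bit single-channel (CV_8UC1), the destination is a UMat
whose width is a multiple of 4, the border mode is BORDER_CONSTANT, the
interpolation is nearest or bilinear, and the source rows/cols fit in
SHRT_MAX; otherwise the existing ocl_warpTransform() or the CPU path runs.
A minimal host-side call that should satisfy those checks (the sizes and
transform below are arbitrary examples, not taken from the patch):

    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>

    int main()
    {
        cv::UMat src(480, 640, CV_8UC1), dst;      // 8-bit, single channel
        cv::randu(src, 0, 255);
        cv::Mat M = cv::getRotationMatrix2D(cv::Point2f(320.f, 240.f), 30.0, 1.0);
        // dsize width (640) is a multiple of 4, so ocl_warpTransform_cols4()
        // is eligible on an Intel OpenCL device.
        cv::warpAffine(src, dst, M, src.size(), cv::INTER_LINEAR,
                       cv::BORDER_CONSTANT, cv::Scalar::all(0));
        return 0;
    }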
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 0d0fb72..943a515 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -5647,6 +5647,81 @@ private:
 
 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
 
+static bool ocl_warpTransform_cols4(InputArray _src, OutputArray _dst, InputArray _M0,
+                                    Size dsize, int flags, int borderType, const Scalar& borderValue,
+                                    int op_type)
+{
+    CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
+    const ocl::Device & dev = ocl::Device::getDefault();
+    int type = _src.type(), dtype = _dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    int interpolation = flags & INTER_MAX;
+    if( interpolation == INTER_AREA )
+        interpolation = INTER_LINEAR;
+
+    if ( !dev.isIntel() || !(type == CV_8UC1) ||
+         !(dtype == CV_8UC1) || !(_dst.cols() % 4 == 0) ||
+         !(borderType == cv::BORDER_CONSTANT &&
+          (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR)))
+        return false;
+
+    const char * const warp_op[2] = { "Affine", "Perspective" };
+    const char * const interpolationMap[3] = { "nearest", "linear", NULL };
+    ocl::ProgramSource program = ocl::imgproc::warp_transform_oclsrc;
+    String kernelName = format("warp%s_%s_8u", warp_op[op_type], interpolationMap[interpolation]);
+
+    bool is32f = (interpolation == INTER_LINEAR);
+    int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
+    int sctype = CV_MAKETYPE(wdepth, cn);
+
+    ocl::Kernel k;
+    String opts = format("-D ST=%s", ocl::typeToStr(sctype));
+
+    k.create(kernelName.c_str(), program, opts);
+    if (k.empty())
+        return false;
+
+    float borderBuf[] = { 0, 0, 0, 0 };
+    scalarToRawData(borderValue, borderBuf, sctype);
+
+    UMat src = _src.getUMat(), M0;
+    _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
+    UMat dst = _dst.getUMat();
+
+    float M[9];
+    int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
+    Mat matM(matRows, 3, CV_32F, M), M1 = _M0.getMat();
+    CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) && M1.rows == matRows && M1.cols == 3 );
+    M1.convertTo(matM, matM.type());
+
+    if( !(flags & WARP_INVERSE_MAP) )
+    {
+        if (op_type == OCL_OP_PERSPECTIVE)
+            invert(matM, matM);
+        else
+        {
+            float D = M[0]*M[4] - M[1]*M[3];
+            D = D != 0 ? 1.f/D : 0;
+            float A11 = M[4]*D, A22=M[0]*D;
+            M[0] = A11; M[1] *= -D;
+            M[3] *= -D; M[4] = A22;
+            float b1 = -M[0]*M[2] - M[1]*M[5];
+            float b2 = -M[3]*M[2] - M[4]*M[5];
+            M[2] = b1; M[5] = b2;
+        }
+    }
+    matM.convertTo(M0, CV_32F);
+
+    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
+           ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
+
+    size_t globalThreads[2];
+    globalThreads[0] = (size_t)(dst.cols / 4);
+    globalThreads[1] = (size_t)dst.rows;
+
+    return k.run(2, globalThreads, NULL, false);
+}
+
 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
                               Size dsize, int flags, int borderType, const Scalar& borderValue,
                               int op_type)
@@ -5786,6 +5861,11 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
 {
     CV_INSTRUMENT_REGION()
 
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
+               _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
+               ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType,
+                                       borderValue, OCL_OP_AFFINE))
+
     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
                                  borderValue, OCL_OP_AFFINE))
@@ -6312,6 +6392,11 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
 
     CV_Assert( _src.total() > 0 );
 
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
+               _src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
+               ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType, borderValue,
+                                       OCL_OP_PERSPECTIVE))
+
     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
                                  OCL_OP_PERSPECTIVE))
diff --git a/modules/imgproc/src/opencl/warp_transform.cl b/modules/imgproc/src/opencl/warp_transform.cl
new file mode 100644
index 0000000..bfe35f6
--- /dev/null
+++ b/modules/imgproc/src/opencl/warp_transform.cl
@@ -0,0 +1,214 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+__constant short4 vec_offset = (short4)(0, 1, 2, 3);
+
+#define GET_VAL(x, y) ((x) < 0 || (x) >= src_cols || (y) < 0 || (y) >= src_rows) ? \
+                      scalar : src[src_offset + y * src_step + x]
+
+__kernel void warpAffine_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                                    __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                    __constant float * M, ST scalar_)
+{
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    uchar scalar = convert_uchar_sat_rte(scalar_);
+
+    if (x >= dst_cols || y >= dst_rows) return;
+
+    /* { M0, M1, M2 }
+     * { M3, M4, M5 }
+     */
+
+    short4 new_x, new_y;
+    new_x = convert_short4_sat_rte(M[0] * convert_float4(vec_offset + (short4)(x)) +
+                                   M[1] * convert_float4((short4)y) + M[2]);
+
+    new_y = convert_short4_sat_rte(M[3] * convert_float4(vec_offset + (short4)(x)) +
+                                   M[4] * convert_float4((short4)y) + M[5]);
+
+    uchar4 pix = (uchar4)scalar;
+
+    pix.s0 = GET_VAL(new_x.s0, new_y.s0);
+    pix.s1 = GET_VAL(new_x.s1, new_y.s1);
+    pix.s2 = GET_VAL(new_x.s2, new_y.s2);
+    pix.s3 = GET_VAL(new_x.s3, new_y.s3);
+
+    int dst_index = x + y * dst_step + dst_offset;
+
+    vstore4(pix, 0, dst + dst_index);
+}
+
+uchar4 read_pixels(__global const uchar * src, short tx, short ty,
+                   int src_offset, int src_step, int src_cols, int src_rows,
+                   uchar scalar)
+{
+    uchar2 pt, pb;
+    short bx, by;
+
+    bx = tx + 1;
+    by = ty + 1;
+
+    if (tx >= 0 && (tx + 1) < src_cols && ty >= 0 && ty < src_rows)
+    {
+        pt = vload2(0, src + src_offset + ty * src_step + tx);
+    }
+    else
+    {
+        pt.s0 = GET_VAL(tx, ty);
+        pt.s1 = GET_VAL(bx, ty);
+    }
+
+    if (tx >= 0 && (tx + 1) < src_cols && by >= 0 && by < src_rows)
+    {
+        pb = vload2(0, src + src_offset + by * src_step + tx);
+    }
+    else
+    {
+        pb.s0 = GET_VAL(tx, by);
+        pb.s1 = GET_VAL(bx, by);
+    }
+
+    return (uchar4)(pt, pb);
+}
+
+__kernel void warpAffine_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                                   __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                   __constant float * M, ST scalar_)
+{
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    uchar scalar = convert_uchar_sat_rte(scalar_);
+
+    if (x >= dst_cols || y >= dst_rows) return;
+
+    /* { M0, M1, M2 }
+     * { M3, M4, M5 }
+     */
+
+    float4 nx, ny;
+    nx = M[0] * convert_float4((vec_offset + (short4)x)) + M[1] * convert_float4((short4)y) + M[2];
+    ny = M[3] * convert_float4((vec_offset + (short4)x)) + M[4] * convert_float4((short4)y) + M[5];
+
+    float4 s, t;
+    s = round((nx - floor(nx)) * 32.0f) / 32.0f;
+    t = round((ny - floor(ny)) * 32.0f) / 32.0f;
+
+    short4 tx, ty;
+    tx = convert_short4_sat_rtn(nx);
+    ty = convert_short4_sat_rtn(ny);
+
+    uchar4 pix[4];
+    pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar);
+
+    float4 tl, tr, bl, br;
+    tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0));
+    tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1));
+    bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2));
+    br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3));
+
+    float4 pixel;
+    pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t;
+
+    int dst_index = x + y * dst_step + dst_offset;
+    vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index);
+}
+
+__kernel void warpPerspective_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                         __constant float * M, ST scalar_)
+{
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    uchar scalar = convert_uchar_sat_rte(scalar_);
+
+    if (x >= dst_cols || y >= dst_rows) return;
+
+    /* { M0, M1, M2 }
+     * { M3, M4, M5 }
+     * { M6, M7, M8 }
+     */
+
+    float4 nx, ny, nz;
+    nx = M[0] * convert_float4(vec_offset + (short4)(x)) +
+         M[1] * convert_float4((short4)y) + M[2];
+
+    ny = M[3] * convert_float4(vec_offset + (short4)(x)) +
+         M[4] * convert_float4((short4)y) + M[5];
+
+    nz = M[6] * convert_float4(vec_offset + (short4)(x)) +
+         M[7] * convert_float4((short4)y) + M[8];
+
+    short4 new_x, new_y;
+    float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);
+    new_x = convert_short4_sat_rte(nx * fz);
+    new_y = convert_short4_sat_rte(ny * fz);
+
+    uchar4 pix = (uchar4)scalar;
+
+    pix.s0 = GET_VAL(new_x.s0, new_y.s0);
+    pix.s1 = GET_VAL(new_x.s1, new_y.s1);
+    pix.s2 = GET_VAL(new_x.s2, new_y.s2);
+    pix.s3 = GET_VAL(new_x.s3, new_y.s3);
+
+    int dst_index = x + y * dst_step + dst_offset;
+
+    vstore4(pix, 0, dst + dst_index);
+}
+
+__kernel void warpPerspective_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                                        __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                        __constant float * M, ST scalar_)
+{
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    uchar scalar = convert_uchar_sat_rte(scalar_);
+
+    if (x >= dst_cols || y >= dst_rows) return;
+
+    /* { M0, M1, M2 }
+     * { M3, M4, M5 }
+     * { M6, M7, M8 }
+     */
+
+    float4 nx, ny, nz;
+    nx = M[0] * convert_float4(vec_offset + (short4)(x)) + M[1] * convert_float4((short4)y) + M[2];
+
+    ny = M[3] * convert_float4(vec_offset + (short4)(x)) + M[4] * convert_float4((short4)y) + M[5];
+
+    nz = M[6] * convert_float4(vec_offset + (short4)(x)) + M[7] * convert_float4((short4)y) + M[8];
+
+    float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f);
+
+    nx = nx * fz;
+    ny = ny * fz;
+
+    float4 s, t;
+    s = round((nx - floor(nx)) * 32.0f) / (float4)32.0f;
+    t = round((ny - floor(ny)) * 32.0f) / (float4)32.0f;
+
+    short4 tx, ty;
+    tx = convert_short4_sat_rtn(nx);
+    ty = convert_short4_sat_rtn(ny);
+
+    uchar4 pix[4];
+    pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar);
+    pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar);
+
+    float4 tl, tr, bl, br;
+    tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0));
+    tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1));
+    bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2));
+    br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3));
+
+    float4 pixel;
+    pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t;
+
+    int dst_index = x + y * dst_step + dst_offset;
+    vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index);
+}
diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp
index da70f73..8534658 100644
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -113,6 +113,57 @@ PARAM_TEST_CASE(WarpTestBase, MatType, Interpolation, bool, bool)
     }
 };
 
+PARAM_TEST_CASE(WarpTest_cols4_Base, MatType, Interpolation, bool, bool)
+{
+    int type, interpolation;
+    Size dsize;
+    bool useRoi, mapInverse;
+    int depth;
+
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
+
+    virtual void SetUp()
+    {
+        type = GET_PARAM(0);
+        interpolation = GET_PARAM(1);
+        mapInverse = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+        depth = CV_MAT_DEPTH(type);
+
+        if (mapInverse)
+            interpolation |= WARP_INVERSE_MAP;
+    }
+
+    void random_roi()
+    {
+        dsize = randomSize(1, MAX_VALUE);
+        dsize.width = ((dsize.width >> 2) + 1) * 4;
+
+        Size roiSize = randomSize(1, MAX_VALUE);
+        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
+
+        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, dsize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
+    }
+
+    void Near(double threshold = 0.0)
+    {
+        if (depth < CV_32F)
+            EXPECT_MAT_N_DIFF(dst_roi, udst_roi, cvRound(dst_roi.total()*threshold));
+        else
+            OCL_EXPECT_MATS_NEAR_RELATIVE(dst, threshold);
+    }
+};
+
+/////warpAffine
+
+typedef WarpTestBase WarpAffine;
+
 /////warpAffine
 
 typedef WarpTestBase WarpAffine;
@@ -134,6 +185,25 @@ OCL_TEST_P(WarpAffine, Mat)
     }
 }
 
+typedef WarpTest_cols4_Base WarpAffine_cols4;
+
+OCL_TEST_P(WarpAffine_cols4, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        double eps = depth < CV_32F ? 0.04 : 0.06;
+        random_roi();
+
+        Mat M = getRotationMatrix2D(Point2f(src_roi.cols / 2.0f, src_roi.rows / 2.0f),
+                                    rng.uniform(-180.f, 180.f), rng.uniform(0.4f, 2.0f));
+
+        OCL_OFF(cv::warpAffine(src_roi, dst_roi, M, dsize, interpolation));
+        OCL_ON(cv::warpAffine(usrc_roi, udst_roi, M, dsize, interpolation));
+
+        Near(eps);
+    }
+}
+
 //// warpPerspective
 
 typedef WarpTestBase WarpPerspective;
@@ -161,6 +231,30 @@ OCL_TEST_P(WarpPerspective, Mat)
     }
 }
 
+typedef WarpTest_cols4_Base WarpPerspective_cols4;
+
+OCL_TEST_P(WarpPerspective_cols4, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        double eps = depth < CV_32F ? 0.03 : 0.06;
+        random_roi();
+
+        float cols = static_cast<float>(src_roi.cols), rows = static_cast<float>(src_roi.rows);
+        float cols2 = cols / 2.0f, rows2 = rows / 2.0f;
+        Point2f sp[] = { Point2f(0.0f, 0.0f), Point2f(cols, 0.0f), Point2f(0.0f, rows), Point2f(cols, rows) };
+        Point2f dp[] = { Point2f(rng.uniform(0.0f, cols2), rng.uniform(0.0f, rows2)),
+                         Point2f(rng.uniform(cols2, cols), rng.uniform(0.0f, rows2)),
+                         Point2f(rng.uniform(0.0f, cols2), rng.uniform(rows2, rows)),
+                         Point2f(rng.uniform(cols2, cols), rng.uniform(rows2, rows)) };
+        Mat M = getPerspectiveTransform(sp, dp);
+
+        OCL_OFF(cv::warpPerspective(src_roi, dst_roi, M, dsize, interpolation));
+        OCL_ON(cv::warpPerspective(usrc_roi, udst_roi, M, dsize, interpolation));
+
+        Near(eps);
+    }
+}
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 //// resize
@@ -341,12 +435,24 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpAffine, Combine(
     Bool(),
     Bool()));
 
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpAffine_cols4, Combine(
+    Values((MatType)CV_8UC1),
+    Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR),
+    Bool(),
+    Bool()));
+
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpPerspective, Combine(
     Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
     Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR, (Interpolation)INTER_CUBIC),
     Bool(),
     Bool()));
 
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpPerspective_cols4, Combine(
+    Values((MatType)CV_8UC1),
+    Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR),
+    Bool(),
+    Bool()));
+
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Resize, Combine(
     Values(CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, CV_32FC4),
     Values(0.5, 1.5, 2.0, 0.2),
-- 
2.7.4
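Note on the kernel design (editorial sketch, not part of the patch): each work
item writes four horizontally adjacent output pixels with vstore4, which is why
the host side launches dst.cols / 4 x dst.rows work items and bails out when
the destination width is not a multiple of 4. In the *_linear_8u kernels the
fractional parts of the back-mapped source coordinates are snapped to multiples
of 1/32 before blending, the same 32-step granularity (INTER_BITS == 5) used by
OpenCV's fixed-point warp paths, so the results stay close to the CPU reference
the tests compare against. A scalar C++ sketch of that blend, with hypothetical
helper and variable names:

    #include <cmath>

    // tl, tr, bl, br: the four neighbours around the back-mapped source
    // coordinate (nx, ny), as fetched by read_pixels() in the kernels.
    static float bilinear_1_32(float nx, float ny,
                               float tl, float tr, float bl, float br)
    {
        float s = std::round((nx - std::floor(nx)) * 32.0f) / 32.0f;  // e.g. 0.37 -> 12/32 = 0.375
        float t = std::round((ny - std::floor(ny)) * 32.0f) / 32.0f;
        return tl * (1 - s) * (1 - t) + tr * s * (1 - t)
             + bl * (1 - s) * t       + br * s * t;
    }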