From: Aaron Kunze <aaron.kunze@intel.com>
Date: Mon, 14 Apr 2014 23:09:17 +0000 (-0700)
Subject: Optimizes OpenCL resize and optical flow to use image extension.
X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~463^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ebbac3c66d6ad6f9bb1eb3932e6aad5a1c42ae98;p=profile%2Fivi%2Fopencv.git

Optimizes OpenCL resize and optical flow to use image extension.
---

diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index fdb6f9a..63cb926 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -151,6 +151,10 @@ public:
 
     bool imageSupport() const;
 
+    bool imageFromBufferSupport() const;
+    uint imagePitchAlignment() const;
+    uint imageBaseAddressAlignment() const;
+
     size_t image2DMaxWidth() const;
     size_t image2DMaxHeight() const;
 
@@ -602,12 +606,25 @@ class CV_EXPORTS Image2D
 {
 public:
     Image2D();
-    explicit Image2D(const UMat &src);
+
+    // src:     The UMat from which to get image properties and data
+    // norm:    Flag to enable the use of normalized channel data types
+    // alias:   Flag indicating that the image should alias the src UMat.
+    //          If true, changes to the image or src will be reflected in
+    //          both objects.
+    explicit Image2D(const UMat &src, bool norm = false, bool alias = false);
     Image2D(const Image2D & i);
     ~Image2D();
 
     Image2D & operator = (const Image2D & i);
 
+    // Indicates if creating an aliased image should succeed.  Depends on the
+    // underlying platform and the dimensions of the UMat.
+    static bool canCreateAlias(const UMat &u);
+
+    // Indicates if the image format is supported.
+    static bool isFormatSupported(int depth, int cn, bool norm);
+
     void* ptr() const;
 protected:
     struct Impl;
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 712bbd9..c5a6b22 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -882,7 +882,6 @@ OCL_FUNC_P(cl_mem, clCreateImage2D,
     cl_int *errcode_ret),
     (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
 
-/*
 OCL_FUNC(cl_int, clGetSupportedImageFormats,
  (cl_context context,
  cl_mem_flags flags,
@@ -892,6 +891,7 @@ OCL_FUNC(cl_int, clGetSupportedImageFormats,
  cl_uint * num_image_formats),
  (context, flags, image_type, num_entries, image_formats, num_image_formats))
 
+/*
 OCL_FUNC(cl_int, clGetMemObjectInfo,
  (cl_mem memobj,
  cl_mem_info param_name,
@@ -1912,6 +1912,38 @@ bool Device::hostUnifiedMemory() const
 bool Device::imageSupport() const
 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
 
+bool Device::imageFromBufferSupport() const
+{
+    bool ret = false;
+    if (p)
+    {
+        size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
+        if (pos != String::npos)
+        {
+            ret = true;
+        }
+    }
+    return ret;
+}
+
+uint Device::imagePitchAlignment() const
+{
+#ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
+    return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
+#else
+    return 0;
+#endif
+}
+
+uint Device::imageBaseAddressAlignment() const
+{
+#ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
+    return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
+#else
+    return 0;
+#endif
+}
+
 size_t Device::image2DMaxWidth() const
 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
 
@@ -2705,9 +2737,15 @@ struct Kernel::Impl
             haveTempDstUMats = true;
     }
 
+    void addImage(const Image2D& image)
+    {
+        images.push_back(image);
+    }
+
     void finit()
     {
         cleanupUMats();
+        images.clear();
         if(e) { clReleaseEvent(e); e = 0; }
         release();
     }
@@ -2725,6 +2763,7 @@ struct Kernel::Impl
     enum { MAX_ARRS = 16 };
     UMatData* u[MAX_ARRS];
     int nu;
+    std::list<Image2D> images;
     bool haveTempDstUMats;
 };
 
@@ -2838,6 +2877,7 @@ int Kernel::set(int i, const void* value, size_t sz)
 
 int Kernel::set(int i, const Image2D& image2D)
 {
+    p->addImage(image2D);
     cl_mem h = (cl_mem)image2D.ptr();
     return set(i, &h, sizeof(h));
 }
@@ -4408,11 +4448,11 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
 
 struct Image2D::Impl
 {
-    Impl(const UMat &src)
+    Impl(const UMat &src, bool norm, bool alias)
     {
         handle = 0;
         refcount = 1;
-        init(src);
+        init(src, norm, alias);
     }
 
     ~Impl()
@@ -4421,24 +4461,55 @@ struct Image2D::Impl
             clReleaseMemObject(handle);
     }
 
-    void init(const UMat &src)
+    static cl_image_format getImageFormat(int depth, int cn, bool norm)
     {
-        CV_Assert(ocl::Device::getDefault().imageSupport());
-
         cl_image_format format;
-        int err, depth = src.depth(), cn = src.channels();
-        CV_Assert(cn <= 4);
-
         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
+        static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
+                                                CL_SNORM_INT16, -1, -1, -1, -1 };
         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
 
-        int channelType = channelTypes[depth], channelOrder = channelOrders[cn];
-        if (channelType < 0 || channelOrder < 0)
-            CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
-
+        int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
+        int channelOrder = channelOrders[cn];
         format.image_channel_data_type = (cl_channel_type)channelType;
         format.image_channel_order = (cl_channel_order)channelOrder;
+        return format;
+    }
+
+    static bool isFormatSupported(cl_image_format format)
+    {
+        cl_context context = (cl_context)Context::getDefault().ptr();
+        // Figure out how many formats are supported by this context.
+        cl_uint numFormats = 0;
+        cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
+                                                CL_MEM_OBJECT_IMAGE2D, numFormats,
+                                                NULL, &numFormats);
+        AutoBuffer<cl_image_format> formats(numFormats);
+        err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
+                                         CL_MEM_OBJECT_IMAGE2D, numFormats,
+                                         formats, NULL);
+        CV_OclDbgAssert(err == CL_SUCCESS);
+        for (cl_uint i = 0; i < numFormats; ++i)
+        {
+            if (!memcmp(&formats[i], &format, sizeof(format)))
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    void init(const UMat &src, bool norm, bool alias)
+    {
+        CV_Assert(ocl::Device::getDefault().imageSupport());
+
+        int err, depth = src.depth(), cn = src.channels();
+        CV_Assert(cn <= 4);
+        cl_image_format format = getImageFormat(depth, cn, norm);
+
+        if (!isFormatSupported(format))
+            CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
 
         cl_context context = (cl_context)Context::getDefault().ptr();
         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
@@ -4448,6 +4519,7 @@ struct Image2D::Impl
         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
         const Device & d = ocl::Device::getDefault();
         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
+        CV_Assert(!alias || canCreateAlias(src));
         if (1 < major || (1 == major && 2 <= minor))
         {
             cl_image_desc desc;
@@ -4456,9 +4528,9 @@ struct Image2D::Impl
             desc.image_height     = src.rows;
             desc.image_depth      = 0;
             desc.image_array_size = 1;
-            desc.image_row_pitch  = 0;
+            desc.image_row_pitch  = alias ? src.step[0] : 0;
             desc.image_slice_pitch = 0;
-            desc.buffer           = NULL;
+            desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
             desc.num_mip_levels   = 0;
             desc.num_samples      = 0;
             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
@@ -4467,6 +4539,7 @@ struct Image2D::Impl
 #endif
         {
             CV_SUPPRESS_DEPRECATED_START
+            CV_Assert(!alias);  // This is an OpenCL 1.2 extension
             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
             CV_SUPPRESS_DEPRECATED_END
         }
@@ -4476,7 +4549,7 @@ struct Image2D::Impl
         size_t region[] = { src.cols, src.rows, 1 };
 
         cl_mem devData;
-        if (!src.isContinuous())
+        if (!alias && !src.isContinuous())
         {
             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
             CV_OclDbgAssert(err == CL_SUCCESS);
@@ -4487,14 +4560,19 @@ struct Image2D::Impl
             CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
         }
         else
+        {
             devData = (cl_mem)src.handle(ACCESS_READ);
+        }
         CV_Assert(devData != NULL);
 
-        CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
-        if (!src.isContinuous())
+        if (!alias)
         {
-            CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
-            CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
+            CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
+            if (!src.isContinuous())
+            {
+                CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
+                CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
+            }
         }
     }
 
@@ -4508,9 +4586,37 @@ Image2D::Image2D()
     p = NULL;
 }
 
-Image2D::Image2D(const UMat &src)
+Image2D::Image2D(const UMat &src, bool norm, bool alias)
 {
-    p = new Impl(src);
+    p = new Impl(src, norm, alias);
+}
+
+bool Image2D::canCreateAlias(const UMat &m)
+{
+    bool ret = false;
+    const Device & d = ocl::Device::getDefault();
+    if (d.imageFromBufferSupport())
+    {
+        // This is the required pitch alignment in pixels
+        uint pitchAlign = d.imagePitchAlignment();
+        if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
+        {
+            // We don't currently handle the case where the buffer was created
+            // with CL_MEM_USE_HOST_PTR
+            if (!m.u->tempUMat())
+            {
+                ret = true;
+            }
+        }
+    }
+    return ret;
+}
+
+bool Image2D::isFormatSupported(int depth, int cn, bool norm)
+{
+    cl_image_format format = Impl::getImageFormat(depth, cn, norm);
+
+    return Impl::isFormatSupported(format);
 }
 
 Image2D::Image2D(const Image2D & i)
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 45a66bd..03a9d79 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -2037,15 +2037,6 @@ static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int *
     ofs_tab[dx] = k;
 }
 
-static void ocl_computeResizeAreaFastTabs(int * dmap_tab, int * smap_tab, int scale, int dcols, int scol)
-{
-    for (int i = 0; i < dcols; ++i)
-        dmap_tab[i] = scale * i;
-
-    for (int i = 0, size = dcols * scale; i < size; ++i)
-        smap_tab[i] = std::min(scol - 1, i);
-}
-
 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
                         double fx, double fy, int interpolation)
 {
@@ -2075,7 +2066,39 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
     ocl::Kernel k;
     size_t globalsize[] = { dst.cols, dst.rows };
 
-    if (interpolation == INTER_LINEAR)
+    ocl::Image2D srcImage;
+
+    // See if this could be done with a sampler.  We stick with integer
+    // datatypes because the observed error is low.
+    bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
+                       ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
+                       ocl::Image2D::isFormatSupported(depth, cn, true));
+    if (useSampler)
+    {
+        int wdepth = std::max(depth, CV_32S);
+        char buf[2][32];
+        cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
+                        "-D convertToDT=%s -D cn=%d",
+                        depth, ocl::typeToStr(type), ocl::typeToStr(depth),
+                        ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+                        cn);
+        k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
+
+        if(k.empty())
+        {
+            useSampler = false;
+        }
+        else
+        {
+            // Convert the input into an OpenCL image type, using normalized channel data types
+            // and aliasing the UMat.
+            srcImage = ocl::Image2D(src, true, true);
+            k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
+                   (float)inv_fx, (float)inv_fy);
+        }
+    }
+
+    if (interpolation == INTER_LINEAR && !useSampler)
     {
         char buf[2][32];
 
@@ -2180,25 +2203,14 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         {
             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
             buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
-                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
-                                               ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
-                                               ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
-                                  iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
+                                                " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
+                                                ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
+                                                ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
+                                    iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
 
             k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
             if (k.empty())
                 return false;
-
-            int smap_tab_size = dst.cols * iscale_x + dst.rows * iscale_y;
-            AutoBuffer<int> dmap_tab(dst.cols + dst.rows), smap_tab(smap_tab_size);
-            int * dxmap_tab = dmap_tab, * dymap_tab = dxmap_tab + dst.cols;
-            int * sxmap_tab = smap_tab, * symap_tab = smap_tab + dst.cols * iscale_y;
-
-            ocl_computeResizeAreaFastTabs(dxmap_tab, sxmap_tab, iscale_x, dst.cols, src.cols);
-            ocl_computeResizeAreaFastTabs(dymap_tab, symap_tab, iscale_y, dst.rows, src.rows);
-
-            Mat(1, dst.cols + dst.rows, CV_32SC1, (void *)dmap_tab).copyTo(dmap);
-            Mat(1, smap_tab_size, CV_32SC1, (void *)smap_tab).copyTo(smap);
         }
         else
         {
@@ -2228,7 +2240,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
 
         if (is_area_fast)
-            k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(dmap), ocl::KernelArg::PtrReadOnly(smap));
+            k.args(srcarg, dstarg);
         else
             k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
                    ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl
index d656bf6..55ef069 100644
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@@ -67,7 +67,64 @@
 #define TSIZE (int)sizeof(T1)*cn
 #endif
 
-#ifdef INTER_LINEAR_INTEGER
+#if defined USE_SAMPLER
+
+#if cn == 1
+#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).x
+#elif cn == 2
+#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xy
+#elif cn == 3
+#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xyz
+#elif cn == 4
+#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z)
+#endif
+
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+#define INTERMEDIATE_TYPE CAT(float, cn)
+#define float1 float
+
+#if depth == 0
+#define RESULT_SCALE    255.0f
+#elif depth == 1
+#define RESULT_SCALE    127.0f
+#elif depth == 2
+#define RESULT_SCALE    65535.0f
+#elif depth == 3
+#define RESULT_SCALE    32767.0f
+#else
+#define RESULT_SCALE    1.0f
+#endif
+
+__kernel void resizeSampler(__read_only image2d_t srcImage,
+                            __global uchar* dstptr, int dststep, int dstoffset,
+                            int dstrows, int dstcols,
+                            float ifx, float ify)
+{
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                              CLK_ADDRESS_CLAMP_TO_EDGE |
+                              CLK_FILTER_LINEAR;
+
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    float sx = ((dx+0.5f) * ifx), sy = ((dy+0.5f) * ify);
+
+    INTERMEDIATE_TYPE intermediate = READ_IMAGE(srcImage, sampler, (float2)(sx, sy));
+
+#if depth <= 4
+    T uval = convertToDT(round(intermediate * RESULT_SCALE));
+#else
+    T uval = convertToDT(intermediate * RESULT_SCALE);
+#endif
+
+    if(dx < dstcols && dy < dstrows)
+    {
+        storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*TSIZE));
+    }
+}
+
+#elif defined INTER_LINEAR_INTEGER
 
 __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
@@ -185,8 +242,7 @@ __kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offs
 #ifdef INTER_AREA_FAST
 
 __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
-                              __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                              __global const int * dmap_tab, __global const int * smap_tab)
+                              __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
@@ -195,21 +251,21 @@ __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_
     {
         int dst_index = mad24(dy, dst_step, dst_offset);
 
-        __global const int * xmap_tab = dmap_tab;
-        __global const int * ymap_tab = dmap_tab + dst_cols;
-        __global const int * sxmap_tab = smap_tab;
-        __global const int * symap_tab = smap_tab + XSCALE * dst_cols;
-
-        int sx = xmap_tab[dx], sy = ymap_tab[dy];
+        int sx = XSCALE * dx;
+        int sy = YSCALE * dy;
         WTV sum = (WTV)(0);
 
         #pragma unroll
-        for (int y = 0; y < YSCALE; ++y)
+        for (int py = 0; py < YSCALE; ++py)
         {
-            int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
+            int y = min(sy + py, src_rows - 1);
+            int src_index = mad24(y, src_step, src_offset);
             #pragma unroll
-            for (int x = 0; x < XSCALE; ++x)
-                sum += convertToWTV(loadpix(src + mad24(sxmap_tab[sx + x], TSIZE, src_index)));
+            for (int px = 0; px < XSCALE; ++px)
+            {
+                int x = min(sx + px, src_cols - 1);
+                sum += convertToWTV(loadpix(src + src_index + x*TSIZE));
+            }
         }
 
         storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp
index f9ccef8..d59cf75 100644
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -158,9 +158,10 @@ OCL_TEST_P(WarpPerspective, Mat)
 /////////////////////////////////////////////////////////////////////////////////////////////////
 //// resize
 
-PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool)
+PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool, int)
 {
     int type, interpolation;
+    int widthMultiple;
     double fx, fy;
     bool useRoi;
 
@@ -174,6 +175,7 @@ PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool)
         fy = GET_PARAM(2);
         interpolation = GET_PARAM(3);
         useRoi = GET_PARAM(4);
+        widthMultiple = GET_PARAM(5);
     }
 
     void random_roi()
@@ -181,6 +183,9 @@ PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool)
         CV_Assert(fx > 0 && fy > 0);
 
         Size srcRoiSize = randomSize(1, MAX_VALUE), dstRoiSize;
+        // Make sure the width is a multiple of the requested value, and no more
+        srcRoiSize.width &= ~((widthMultiple * 2) - 1);
+        srcRoiSize.width += widthMultiple;
         dstRoiSize.width = cvRound(srcRoiSize.width * fx);
         dstRoiSize.height = cvRound(srcRoiSize.height * fy);
 
@@ -334,14 +339,16 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Resize, Combine(
                             Values(0.5, 1.5, 2.0, 0.2),
                             Values(0.5, 1.5, 2.0, 0.2),
                             Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR),
-                            Bool()));
+                            Bool(),
+                            Values(1, 16)));
 
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpResizeArea, Resize, Combine(
                             Values((MatType)CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
                             Values(0.7, 0.4, 0.5),
                             Values(0.3, 0.6, 0.5),
                             Values((Interpolation)INTER_AREA),
-                            Bool()));
+                            Bool(),
+                            Values(1, 16)));
 
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_LINEAR, Combine(
                             Values(CV_8U, CV_16U, CV_32F),
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 5b653c9..4f0f313 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -890,6 +890,26 @@ namespace cv
             std::vector<UMat> prevPyr; prevPyr.resize(maxLevel + 1);
             std::vector<UMat> nextPyr; nextPyr.resize(maxLevel + 1);
 
+            // allocate buffers with aligned pitch to be able to use cl_khr_image2d_from_buffer extention
+            // This is the required pitch alignment in pixels
+            int pitchAlign = (int)ocl::Device::getDefault().imagePitchAlignment();
+            if (pitchAlign>0)
+            {
+                prevPyr[0] = UMat(prevImg.rows,(prevImg.cols+pitchAlign-1)&(-pitchAlign),prevImg.type()).colRange(0,prevImg.cols);
+                nextPyr[0] = UMat(nextImg.rows,(nextImg.cols+pitchAlign-1)&(-pitchAlign),nextImg.type()).colRange(0,nextImg.cols);
+                for (int level = 1; level <= maxLevel; ++level)
+                {
+                    int cols,rows;
+                    // allocate buffers with aligned pitch to be able to use image on buffer extention
+                    cols = (prevPyr[level - 1].cols+1)/2;
+                    rows = (prevPyr[level - 1].rows+1)/2;
+                    prevPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),prevPyr[level-1].type()).colRange(0,cols);
+                    cols = (nextPyr[level - 1].cols+1)/2;
+                    rows = (nextPyr[level - 1].rows+1)/2;
+                    nextPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),nextPyr[level-1].type()).colRange(0,cols);
+                }
+            }
+
             prevImg.convertTo(prevPyr[0], CV_32F);
             nextImg.convertTo(nextPyr[0], CV_32F);
 
@@ -969,8 +989,10 @@ namespace cv
             if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options))
                 return false;
 
-            ocl::Image2D imageI(I);
-            ocl::Image2D imageJ(J);
+            CV_Assert(I.depth() == CV_32F && J.depth() == CV_32F);
+            ocl::Image2D imageI(I, false, ocl::Image2D::canCreateAlias(I));
+            ocl::Image2D imageJ(J, false, ocl::Image2D::canCreateAlias(J));
+
             int idxArg = 0;
             idxArg = kernel.set(idxArg, imageI); //image2d_t I
             idxArg = kernel.set(idxArg, imageJ); //image2d_t J
@@ -1070,7 +1092,9 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                            TermCriteria criteria,
                            int flags, double minEigThreshold )
 {
-    bool use_opencl = ocl::useOpenCL() && (_prevImg.isUMat() || _nextImg.isUMat());
+    bool use_opencl = ocl::useOpenCL() &&
+                      (_prevImg.isUMat() || _nextImg.isUMat()) &&
+                      ocl::Image2D::isFormatSupported(CV_32F, 1, false);
     if ( use_opencl && ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags/*, minEigThreshold*/))
         return;
 
diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp
index fad73ef..914f09c 100644
--- a/modules/video/src/tvl1flow.cpp
+++ b/modules/video/src/tvl1flow.cpp
@@ -352,7 +352,9 @@ OpticalFlowDual_TVL1::OpticalFlowDual_TVL1()
 
 void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow)
 {
-    CV_OCL_RUN(_flow.isUMat(), calc_ocl(_I0, _I1, _flow))
+    CV_OCL_RUN(_flow.isUMat() &&
+               ocl::Image2D::isFormatSupported(CV_32F, 1, false),
+               calc_ocl(_I0, _I1, _flow))
 
     Mat I0 = _I0.getMat();
     Mat I1 = _I1.getMat();