From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Wed, 22 Jan 2014 15:13:29 +0000 (+0400)
Subject: ported superres to T-API
X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~658^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6ad4823f0c421fd3458817d347b8af10867a587a;p=profile%2Fivi%2Fopencv.git

ported superres to T-API
---

diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index dcbac6b..d9f06cb 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -217,7 +217,7 @@ public:
     virtual void createSameSize(const _InputArray& arr, int mtype) const;
     virtual void release() const;
     virtual void clear() const;
-    virtual void setTo(const _InputArray& value) const;
+    virtual void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const;
 };
 
 
diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
index 55abd7c..cec5846 100644
--- a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
+++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
@@ -5,6 +5,8 @@
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 
+//#define CV_OPENCL_RUN_VERBOSE
+
 #ifdef HAVE_OPENCL
 
 #ifdef CV_OPENCL_RUN_VERBOSE
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index af2ca7d..510b178 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2560,7 +2560,7 @@ cuda::CudaMem& _OutputArray::getCudaMemRef() const
     return *(cuda::CudaMem*)obj;
 }
 
-void _OutputArray::setTo(const _InputArray& arr) const
+void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
 {
     int k = kind();
 
@@ -2569,10 +2569,16 @@ void _OutputArray::setTo(const _InputArray& arr) const
     else if( k == MAT || k == MATX || k == STD_VECTOR )
     {
         Mat m = getMat();
-        m.setTo(arr);
+        m.setTo(arr, mask);
     }
     else if( k == UMAT )
-        ((UMat*)obj)->setTo(arr);
+        ((UMat*)obj)->setTo(arr, mask);
+    else if( k == GPU_MAT )
+    {
+        Mat value = arr.getMat();
+        CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::GPU_MAT) );
+        ((cuda::GpuMat*)obj)->setTo(Scalar(Vec<double, 4>((double *)value.data)), mask);
+    }
     else
         CV_Error(Error::StsNotImplemented, "");
 }
diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt
index 1182a3c..7514833 100644
--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -5,5 +5,4 @@ endif()
 set(the_description "Super Resolution")
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef)
 ocv_define_module(superres opencv_imgproc opencv_video
-                  OPTIONAL opencv_highgui opencv_ocl
-                  opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc opencv_cudaoptflow opencv_cudacodec)
+                  OPTIONAL opencv_highgui opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc opencv_cudaoptflow opencv_cudacodec)
diff --git a/modules/superres/include/opencv2/superres.hpp b/modules/superres/include/opencv2/superres.hpp
index 26de781..3d96e0f 100644
--- a/modules/superres/include/opencv2/superres.hpp
+++ b/modules/superres/include/opencv2/superres.hpp
@@ -83,6 +83,8 @@ namespace cv
             virtual void initImpl(Ptr<FrameSource>& frameSource) = 0;
             virtual void processImpl(Ptr<FrameSource>& frameSource, OutputArray output) = 0;
 
+            bool isUmat_;
+
         private:
             Ptr<FrameSource> frameSource_;
             bool firstCall_;
diff --git a/modules/superres/perf/perf_superres.cpp b/modules/superres/perf/perf_superres.cpp
index 810460b..e8b3ef7 100644
--- a/modules/superres/perf/perf_superres.cpp
+++ b/modules/superres/perf/perf_superres.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
 using namespace std;
 using namespace std::tr1;
@@ -91,37 +92,26 @@ namespace
     class ZeroOpticalFlow : public DenseOpticalFlowExt
     {
     public:
-        void calc(InputArray frame0, InputArray, OutputArray flow1, OutputArray flow2)
+        virtual void calc(InputArray frame0, InputArray, OutputArray flow1, OutputArray flow2)
         {
             cv::Size size = frame0.size();
 
             if (!flow2.needed())
             {
                 flow1.create(size, CV_32FC2);
-
-                if (flow1.kind() == cv::_InputArray::GPU_MAT)
-                    flow1.getGpuMatRef().setTo(cv::Scalar::all(0));
-                else
-                    flow1.getMatRef().setTo(cv::Scalar::all(0));
+                flow1.setTo(cv::Scalar::all(0));
             }
             else
             {
                 flow1.create(size, CV_32FC1);
                 flow2.create(size, CV_32FC1);
 
-                if (flow1.kind() == cv::_InputArray::GPU_MAT)
-                    flow1.getGpuMatRef().setTo(cv::Scalar::all(0));
-                else
-                    flow1.getMatRef().setTo(cv::Scalar::all(0));
-
-                if (flow2.kind() == cv::_InputArray::GPU_MAT)
-                    flow2.getGpuMatRef().setTo(cv::Scalar::all(0));
-                else
-                    flow2.getMatRef().setTo(cv::Scalar::all(0));
+                flow1.setTo(cv::Scalar::all(0));
+                flow2.setTo(cv::Scalar::all(0));
             }
         }
 
-        void collectGarbage()
+        virtual void collectGarbage()
         {
         }
     };
@@ -181,3 +171,48 @@ PERF_TEST_P(Size_MatType, SuperResolution_BTVL1,
         CPU_SANITY_CHECK(dst);
     }
 }
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+typedef Size_MatType SuperResolution_BTVL1;
+
+OCL_PERF_TEST_P(SuperResolution_BTVL1 ,BTVL1,
+            Combine(Values(szSmall64, szSmall128),
+                    Values(MatType(CV_8UC1), MatType(CV_8UC3))))
+{
+    Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+
+    Mat frame(size, type);
+    UMat dst(1, 1, 0);
+    declare.in(frame, WARMUP_RNG);
+
+    const int scale = 2;
+    const int iterations = 50;
+    const int temporalAreaRadius = 1;
+
+    Ptr<DenseOpticalFlowExt> opticalFlow(new ZeroOpticalFlow);
+    Ptr<SuperResolution> superRes = createSuperResolution_BTVL1();
+
+    superRes->set("scale", scale);
+    superRes->set("iterations", iterations);
+    superRes->set("temporalAreaRadius", temporalAreaRadius);
+    superRes->set("opticalFlow", opticalFlow);
+
+    superRes->setInput(makePtr<OneFrameSource_CPU>(frame));
+
+    // skip first frame
+    superRes->nextFrame(dst);
+
+    OCL_TEST_CYCLE_N(10) superRes->nextFrame(dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/superres/perf/perf_superres_ocl.cpp b/modules/superres/perf/perf_superres_ocl.cpp
deleted file mode 100644
index 04a3f7e..0000000
--- a/modules/superres/perf/perf_superres_ocl.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-#ifdef HAVE_OPENCV_OCL
-
-#include "opencv2/ocl.hpp"
-using namespace std;
-using namespace testing;
-using namespace perf;
-using namespace cv;
-using namespace cv::superres;
-
-namespace
-{
-    class OneFrameSource_OCL : public FrameSource
-    {
-    public:
-        explicit OneFrameSource_OCL(const ocl::oclMat& frame) : frame_(frame) {}
-
-        void nextFrame(OutputArray frame)
-        {
-            ocl::getOclMatRef(frame) = frame_;
-        }
-        void reset()
-        {
-        }
-
-    private:
-        ocl::oclMat frame_;
-    };
-
-
-    class ZeroOpticalFlowOCL : public DenseOpticalFlowExt
-    {
-    public:
-        void calc(InputArray frame0, InputArray, OutputArray flow1, OutputArray flow2)
-        {
-            ocl::oclMat& frame0_ = ocl::getOclMatRef(frame0);
-            ocl::oclMat& flow1_ = ocl::getOclMatRef(flow1);
-            ocl::oclMat& flow2_ = ocl::getOclMatRef(flow2);
-
-            cv::Size size = frame0_.size();
-
-            if(!flow2.needed())
-            {
-                flow1_.create(size, CV_32FC2);
-                flow1_.setTo(Scalar::all(0));
-            }
-            else
-            {
-                flow1_.create(size, CV_32FC1);
-                flow2_.create(size, CV_32FC1);
-
-                flow1_.setTo(Scalar::all(0));
-                flow2_.setTo(Scalar::all(0));
-            }
-        }
-
-        void collectGarbage()
-        {
-        }
-    };
-}
-
-PERF_TEST_P(Size_MatType, SuperResolution_BTVL1_OCL,
-    Combine(Values(szSmall64, szSmall128),
-    Values(MatType(CV_8UC1), MatType(CV_8UC3))))
-{
-    declare.time(5 * 60);
-
-    const Size size = std::tr1::get<0>(GetParam());
-    const int type = std::tr1::get<1>(GetParam());
-
-    Mat frame(size, type);
-    declare.in(frame, WARMUP_RNG);
-
-    ocl::oclMat frame_ocl;
-    frame_ocl.upload(frame);
-
-
-    const int scale = 2;
-    const int iterations = 50;
-    const int temporalAreaRadius = 1;
-    Ptr<DenseOpticalFlowExt> opticalFlowOcl(new ZeroOpticalFlowOCL);
-
-    Ptr<SuperResolution> superRes_ocl = createSuperResolution_BTVL1_OCL();
-
-    superRes_ocl->set("scale", scale);
-    superRes_ocl->set("iterations", iterations);
-    superRes_ocl->set("temporalAreaRadius", temporalAreaRadius);
-    superRes_ocl->set("opticalFlow", opticalFlowOcl);
-
-    superRes_ocl->setInput(makePtr<OneFrameSource_OCL>(frame_ocl));
-
-    ocl::oclMat dst_ocl;
-    superRes_ocl->nextFrame(dst_ocl);
-
-    TEST_CYCLE_N(10) superRes_ocl->nextFrame(dst_ocl);
-    frame_ocl.release();
-    CPU_SANITY_CHECK(dst_ocl);
-}
-#endif
diff --git a/modules/superres/src/btv_l1.cpp b/modules/superres/src/btv_l1.cpp
index bafce91..1e4aa48 100644
--- a/modules/superres/src/btv_l1.cpp
+++ b/modules/superres/src/btv_l1.cpp
@@ -44,6 +44,7 @@
 // Dennis Mitzel, Thomas Pock, Thomas Schoenemann, Daniel Cremers. Video Super Resolution using Duality Based TV-L1 Optical Flow.
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 using namespace cv;
 using namespace cv::superres;
@@ -51,10 +52,17 @@ using namespace cv::superres::detail;
 
 namespace
 {
-    void calcRelativeMotions(const std::vector<Mat>& forwardMotions, const std::vector<Mat>& backwardMotions,
-                             std::vector<Mat>& relForwardMotions, std::vector<Mat>& relBackwardMotions,
-                             int baseIdx, Size size)
+#ifdef HAVE_OPENCL
+
+    bool ocl_calcRelativeMotions(InputArrayOfArrays _forwardMotions, InputArrayOfArrays _backwardMotions,
+                                 OutputArrayOfArrays _relForwardMotions, OutputArrayOfArrays _relBackwardMotions,
+                                 int baseIdx, const Size & size)
     {
+        std::vector<UMat> & forwardMotions = *(std::vector<UMat> *)_forwardMotions.getObj(),
+                & backwardMotions = *(std::vector<UMat> *)_backwardMotions.getObj(),
+                & relForwardMotions = *(std::vector<UMat> *)_relForwardMotions.getObj(),
+                & relBackwardMotions = *(std::vector<UMat> *)_relBackwardMotions.getObj();
+
         const int count = static_cast<int>(forwardMotions.size());
 
         relForwardMotions.resize(count);
@@ -68,20 +76,84 @@ namespace
         for (int i = baseIdx - 1; i >= 0; --i)
         {
             add(relForwardMotions[i + 1], forwardMotions[i], relForwardMotions[i]);
-
             add(relBackwardMotions[i + 1], backwardMotions[i + 1], relBackwardMotions[i]);
         }
 
         for (int i = baseIdx + 1; i < count; ++i)
         {
             add(relForwardMotions[i - 1], backwardMotions[i], relForwardMotions[i]);
+            add(relBackwardMotions[i - 1], forwardMotions[i - 1], relBackwardMotions[i]);
+        }
+
+        return true;
+    }
+
+#endif
+
+    void calcRelativeMotions(InputArrayOfArrays _forwardMotions, InputArrayOfArrays _backwardMotions,
+                             OutputArrayOfArrays _relForwardMotions, OutputArrayOfArrays _relBackwardMotions,
+                             int baseIdx, const Size & size)
+    {
+        CV_OCL_RUN(_forwardMotions.isUMatVector() && _backwardMotions.isUMatVector() &&
+                   _relForwardMotions.isUMatVector() && _relBackwardMotions.isUMatVector(),
+                   ocl_calcRelativeMotions(_forwardMotions, _backwardMotions, _relForwardMotions,
+                                           _relBackwardMotions, baseIdx, size))
+
+        std::vector<Mat> & forwardMotions = *(std::vector<Mat> *)_forwardMotions.getObj(),
+                & backwardMotions = *(std::vector<Mat> *)_backwardMotions.getObj(),
+                & relForwardMotions = *(std::vector<Mat> *)_relForwardMotions.getObj(),
+                & relBackwardMotions = *(std::vector<Mat> *)_relBackwardMotions.getObj();
+
+        const int count = static_cast<int>(forwardMotions.size());
+
+        relForwardMotions.resize(count);
+        relForwardMotions[baseIdx].create(size, CV_32FC2);
+        relForwardMotions[baseIdx].setTo(Scalar::all(0));
+
+        relBackwardMotions.resize(count);
+        relBackwardMotions[baseIdx].create(size, CV_32FC2);
+        relBackwardMotions[baseIdx].setTo(Scalar::all(0));
+
+        for (int i = baseIdx - 1; i >= 0; --i)
+        {
+            add(relForwardMotions[i + 1], forwardMotions[i], relForwardMotions[i]);
+            add(relBackwardMotions[i + 1], backwardMotions[i + 1], relBackwardMotions[i]);
+        }
 
+        for (int i = baseIdx + 1; i < count; ++i)
+        {
+            add(relForwardMotions[i - 1], backwardMotions[i], relForwardMotions[i]);
             add(relBackwardMotions[i - 1], forwardMotions[i - 1], relBackwardMotions[i]);
         }
     }
+#ifdef HAVE_OPENCL
 
-    void upscaleMotions(const std::vector<Mat>& lowResMotions, std::vector<Mat>& highResMotions, int scale)
+    bool ocl_upscaleMotions(InputArrayOfArrays _lowResMotions, OutputArrayOfArrays _highResMotions, int scale) // UMat variant of upscaleMotions(); always returns true
     {
+        std::vector<UMat> & lowResMotions = *(std::vector<UMat> *)_lowResMotions.getObj(), // unchecked casts: upscaleMotions() only dispatches here when both arguments are UMat vectors
+                & highResMotions = *(std::vector<UMat> *)_highResMotions.getObj();
+
+        highResMotions.resize(lowResMotions.size());
+
+        for (size_t i = 0; i < lowResMotions.size(); ++i)
+        {
+            resize(lowResMotions[i], highResMotions[i], Size(), scale, scale, INTER_LINEAR); // TODO (NOTE(review): left unexplained; presumably about the interpolation mode - confirm against the Mat path)
+            multiply(highResMotions[i], Scalar::all(scale), highResMotions[i]); // in place: the resized field's values are multiplied by scale as well
+        }
+
+        return true;
+    }
+
+#endif
+
+    void upscaleMotions(InputArrayOfArrays _lowResMotions, OutputArrayOfArrays _highResMotions, int scale)
+    {
+        CV_OCL_RUN(_lowResMotions.isUMatVector() && _highResMotions.isUMatVector(),
+                   ocl_upscaleMotions(_lowResMotions, _highResMotions, scale))
+
+        std::vector<Mat> & lowResMotions = *(std::vector<Mat> *)_lowResMotions.getObj(),
+                & highResMotions = *(std::vector<Mat> *)_highResMotions.getObj();
+
         highResMotions.resize(lowResMotions.size());
 
         for (size_t i = 0; i < lowResMotions.size(); ++i)
@@ -91,10 +163,47 @@ namespace
         }
     }
 
-    void buildMotionMaps(const Mat& forwardMotion, const Mat& backwardMotion, Mat& forwardMap, Mat& backwardMap)
+#ifdef HAVE_OPENCL
+
+    // OpenCL path of buildMotionMaps(): allocates both maps as CV_32FC2 and fills them with the
+    // "buildMotionMaps" kernel. Returns false when the kernel cannot be created or launched.
+    bool ocl_buildMotionMaps(InputArray _forwardMotion, InputArray _backwardMotion,
+                             OutputArray _forwardMap, OutputArray _backwardMap)
     {
-        forwardMap.create(forwardMotion.size(), CV_32FC2);
-        backwardMap.create(forwardMotion.size(), CV_32FC2);
+        ocl::Kernel k("buildMotionMaps", ocl::superres::superres_btvl1_oclsrc);
+        if (k.empty())
+            return false;
+
+        UMat forwardMotion = _forwardMotion.getUMat(), backwardMotion = _backwardMotion.getUMat();
+        Size size = forwardMotion.size();
+        _forwardMap.create(size, CV_32FC2);
+        _backwardMap.create(size, CV_32FC2);
+        UMat forwardMap = _forwardMap.getUMat(), backwardMap = _backwardMap.getUMat();
+
+        k.args(ocl::KernelArg::ReadOnlyNoSize(forwardMotion),
+               ocl::KernelArg::ReadOnlyNoSize(backwardMotion),
+               ocl::KernelArg::WriteOnlyNoSize(forwardMap),
+               ocl::KernelArg::WriteOnly(backwardMap));
+        // global range = motion field size; explicit casts because int -> size_t inside braces is narrowing in C++11
+        size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
+        return k.run(2, globalsize, NULL, false);
+    }
+
+#endif
+
+    void buildMotionMaps(InputArray _forwardMotion, InputArray _backwardMotion,
+                         OutputArray _forwardMap, OutputArray _backwardMap)
+    {
+        CV_OCL_RUN(_forwardMap.isUMat() && _backwardMap.isUMat(),
+                   ocl_buildMotionMaps(_forwardMotion, _backwardMotion, _forwardMap,
+                                       _backwardMap));
+
+        Mat forwardMotion = _forwardMotion.getMat(), backwardMotion = _backwardMotion.getMat();
+
+        _forwardMap.create(forwardMotion.size(), CV_32FC2);
+        _backwardMap.create(forwardMotion.size(), CV_32FC2);
+
+        Mat forwardMap = _forwardMap.getMat(), backwardMap = _backwardMap.getMat();
 
         for (int y = 0; y < forwardMotion.rows; ++y)
         {
@@ -114,40 +223,73 @@ namespace
     }
 
     template <typename T>
-    void upscaleImpl(const Mat& src, Mat& dst, int scale)
+    void upscaleImpl(InputArray _src, OutputArray _dst, int scale)
     {
-        dst.create(src.rows * scale, src.cols * scale, src.type());
-        dst.setTo(Scalar::all(0));
+        Mat src = _src.getMat();
+        _dst.create(src.rows * scale, src.cols * scale, src.type());
+        _dst.setTo(Scalar::all(0));
+        Mat dst = _dst.getMat();
 
         for (int y = 0, Y = 0; y < src.rows; ++y, Y += scale)
         {
-            const T* srcRow = src.ptr<T>(y);
-            T* dstRow = dst.ptr<T>(Y);
+            const T * const srcRow = src.ptr<T>(y);
+            T * const dstRow = dst.ptr<T>(Y);
 
             for (int x = 0, X = 0; x < src.cols; ++x, X += scale)
                 dstRow[X] = srcRow[x];
         }
     }
 
-    void upscale(const Mat& src, Mat& dst, int scale)
+#ifdef HAVE_OPENCL
+
+    // OpenCL path of upscale(): creates a zero-filled _dst that is `scale` times larger than _src in
+    // each dimension and runs the "upscale" kernel. Returns false if the kernel cannot be created or launched.
+    static bool ocl_upscale(InputArray _src, OutputArray _dst, int scale)
+    {
+        int type = _src.type(), cn = CV_MAT_CN(type);
+        ocl::Kernel k("upscale", ocl::superres::superres_btvl1_oclsrc, format("-D cn=%d", cn));
+        if (k.empty())
+            return false;
+
+        UMat src = _src.getUMat();
+        _dst.create(src.rows * scale, src.cols * scale, type);
+        _dst.setTo(Scalar::all(0));
+        UMat dst = _dst.getUMat();
+        k.args(ocl::KernelArg::ReadOnly(src),
+               ocl::KernelArg::ReadWriteNoSize(dst), scale);
+        // global range = source size; explicit casts because int -> size_t inside braces is narrowing in C++11
+        size_t globalsize[2] = { (size_t)src.cols, (size_t)src.rows };
+        return k.run(2, globalsize, NULL, false);
+    }
+
+#endif
+
+    typedef struct _Point4f { float ar[4]; } Point4f;
+
+    void upscale(InputArray _src, OutputArray _dst, int scale)
     {
-        typedef void (*func_t)(const Mat& src, Mat& dst, int scale);
+        int cn = _src.channels();
+        CV_Assert( cn == 1 || cn == 3 || cn == 4 );
+
+        CV_OCL_RUN(_dst.isUMat(),
+                   ocl_upscale(_src, _dst, scale))
+
+        typedef void (*func_t)(InputArray src, OutputArray dst, int scale);
         static const func_t funcs[] =
         {
-            0, upscaleImpl<float>, 0, upscaleImpl<Point3f>
+            0, upscaleImpl<float>, 0, upscaleImpl<Point3f>, upscaleImpl<Point4f>
         };
 
-        CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
-
-        const func_t func = funcs[src.channels()];
-
-        func(src, dst, scale);
+        const func_t func = funcs[cn];
+        CV_Assert(func != 0);
+        func(_src, _dst, scale);
     }
 
-    float diffSign(float a, float b)
+    inline float diffSign(float a, float b)
     {
         return a > b ? 1.0f : a < b ? -1.0f : 0.0f;
     }
+
     Point3f diffSign(Point3f a, Point3f b)
     {
         return Point3f(
@@ -157,16 +299,44 @@ namespace
         );
     }
 
-    void diffSign(const Mat& src1, const Mat& src2, Mat& dst)
+#ifdef HAVE_OPENCL
+
+    // OpenCL path of diffSign(): runs the "diffSign" kernel on src1/src2 into _dst (same size and type as src1).
+    // _src2 is only read, hence InputArray. Returns false if the kernel cannot be created or launched.
+    static bool ocl_diffSign(InputArray _src1, InputArray _src2, OutputArray _dst)
     {
-        const int count = src1.cols * src1.channels();
+        ocl::Kernel k("diffSign", ocl::superres::superres_btvl1_oclsrc);
+        if (k.empty())
+            return false;
+        UMat src1 = _src1.getUMat(), src2 = _src2.getUMat();
+        _dst.create(src1.size(), src1.type());
+        UMat dst = _dst.getUMat();
 
-        dst.create(src1.size(), src1.type());
+        int cn = src1.channels();
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2),
+               ocl::KernelArg::WriteOnly(dst, cn));
+        // x range spans cols * cn; explicit casts because int -> size_t inside braces is narrowing in C++11
+        size_t globalsize[2] = { (size_t)(src1.cols * cn), (size_t)src1.rows };
+        return k.run(2, globalsize, NULL, false);
+    }
+
+#endif
+
+    void diffSign(InputArray _src1, OutputArray _src2, OutputArray _dst)
+    {
+        CV_OCL_RUN(_dst.isUMat(),
+                   ocl_diffSign(_src1, _src2, _dst))
+
+        Mat src1 = _src1.getMat(), src2 = _src2.getMat();
+        _dst.create(src1.size(), src1.type());
+        Mat dst = _dst.getMat();
+
+        const int count = src1.cols * src1.channels();
 
         for (int y = 0; y < src1.rows; ++y)
         {
-            const float* src1Ptr = src1.ptr<float>(y);
-            const float* src2Ptr = src2.ptr<float>(y);
+            const float * const src1Ptr = src1.ptr<float>(y);
+            const float * const src2Ptr = src2.ptr<float>(y);
             float* dstPtr = dst.ptr<float>(y);
 
             for (int x = 0; x < count; ++x)
@@ -206,8 +376,8 @@ namespace
     {
         for (int i = range.start; i < range.end; ++i)
         {
-            const T* srcRow = src.ptr<T>(i);
-            T* dstRow = dst.ptr<T>(i);
+            const T * const srcRow = src.ptr<T>(i);
+            T * const dstRow = dst.ptr<T>(i);
 
             for(int j = ksize; j < src.cols - ksize; ++j)
             {
@@ -219,19 +389,20 @@ namespace
                     const T* srcRow3 = src.ptr<T>(i + m);
 
                     for (int l = ksize; l + m >= 0; --l, ++ind)
-                    {
-                        dstRow[j] += btvWeights[ind] * (diffSign(srcVal, srcRow3[j + l]) - diffSign(srcRow2[j - l], srcVal));
-                    }
+                        dstRow[j] += btvWeights[ind] * (diffSign(srcVal, srcRow3[j + l])
+                                                        - diffSign(srcRow2[j - l], srcVal));
                 }
             }
         }
     }
 
     template <typename T>
-    void calcBtvRegularizationImpl(const Mat& src, Mat& dst, int btvKernelSize, const std::vector<float>& btvWeights)
+    void calcBtvRegularizationImpl(InputArray _src, OutputArray _dst, int btvKernelSize, const std::vector<float>& btvWeights)
     {
-        dst.create(src.size(), src.type());
-        dst.setTo(Scalar::all(0));
+        Mat src = _src.getMat();
+        _dst.create(src.size(), src.type());
+        _dst.setTo(Scalar::all(0));
+        Mat dst = _dst.getMat();
 
         const int ksize = (btvKernelSize - 1) / 2;
 
@@ -245,17 +416,48 @@ namespace
         parallel_for_(Range(ksize, src.rows - ksize), body);
     }
 
-    void calcBtvRegularization(const Mat& src, Mat& dst, int btvKernelSize, const std::vector<float>& btvWeights)
+#ifdef HAVE_OPENCL
+
+    // OpenCL path of calcBtvRegularization(): zero-fills _dst (same size/type as _src) and runs the
+    // "calcBtvRegularization" kernel with the weights in ubtvWeights. Returns false if the kernel cannot be created or launched.
+    static bool ocl_calcBtvRegularization(InputArray _src, OutputArray _dst, int btvKernelSize, const UMat & ubtvWeights)
+    {
+        int cn = _src.channels();
+        ocl::Kernel k("calcBtvRegularization", ocl::superres::superres_btvl1_oclsrc,
+                      format("-D cn=%d", cn));
+        if (k.empty())
+            return false;
+
+        UMat src = _src.getUMat();
+        _dst.create(src.size(), src.type());
+        _dst.setTo(Scalar::all(0));
+        UMat dst = _dst.getUMat();
+        const int ksize = (btvKernelSize - 1) / 2;
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
+              ksize, ocl::KernelArg::PtrReadOnly(ubtvWeights));
+        // global range = image size; explicit casts because int -> size_t inside braces is narrowing in C++11
+        size_t globalsize[2] = { (size_t)src.cols, (size_t)src.rows };
+        return k.run(2, globalsize, NULL, false);
+    }
+
+#endif
+
+    void calcBtvRegularization(InputArray _src, OutputArray _dst, int btvKernelSize,
+                               const std::vector<float>& btvWeights, const UMat & ubtvWeights)
     {
-        typedef void (*func_t)(const Mat& src, Mat& dst, int btvKernelSize, const std::vector<float>& btvWeights);
+        CV_OCL_RUN(_dst.isUMat(),
+                   ocl_calcBtvRegularization(_src, _dst, btvKernelSize, ubtvWeights))
+        (void)ubtvWeights;
+
+        typedef void (*func_t)(InputArray _src, OutputArray _dst, int btvKernelSize, const std::vector<float>& btvWeights);
         static const func_t funcs[] =
         {
-            0, calcBtvRegularizationImpl<float>, 0, calcBtvRegularizationImpl<Point3f>
+            0, calcBtvRegularizationImpl<float>, 0, calcBtvRegularizationImpl<Point3f>, 0
         };
 
-        const func_t func = funcs[src.channels()];
-
-        func(src, dst, btvKernelSize, btvWeights);
+        const func_t func = funcs[_src.channels()];
+        CV_Assert(func != 0);
+        func(_src, _dst, btvKernelSize, btvWeights);
     }
 
     class BTVL1_Base
@@ -263,9 +465,8 @@ namespace
     public:
         BTVL1_Base();
 
-        void process(const std::vector<Mat>& src, Mat& dst,
-                     const std::vector<Mat>& forwardMotions, const std::vector<Mat>& backwardMotions,
-                     int baseIdx);
+        void process(InputArrayOfArrays src, OutputArray dst, InputArrayOfArrays forwardMotions,
+                     InputArrayOfArrays backwardMotions, int baseIdx);
 
         void collectGarbage();
 
@@ -281,15 +482,21 @@ namespace
         Ptr<DenseOpticalFlowExt> opticalFlow_;
 
     private:
+        bool ocl_process(InputArrayOfArrays src, OutputArray dst, InputArrayOfArrays forwardMotions,
+                         InputArrayOfArrays backwardMotions, int baseIdx);
+
         Ptr<FilterEngine> filter_;
         int curBlurKernelSize_;
         double curBlurSigma_;
         int curSrcType_;
 
         std::vector<float> btvWeights_;
+        UMat ubtvWeights_;
+
         int curBtvKernelSize_;
         double curAlpha_;
 
+        // Mat
         std::vector<Mat> lowResForwardMotions_;
         std::vector<Mat> lowResBackwardMotions_;
 
@@ -303,6 +510,23 @@ namespace
 
         Mat diffTerm_, regTerm_;
         Mat a_, b_, c_;
+
+#ifdef HAVE_OPENCL
+        // UMat
+        std::vector<UMat> ulowResForwardMotions_;
+        std::vector<UMat> ulowResBackwardMotions_;
+
+        std::vector<UMat> uhighResForwardMotions_;
+        std::vector<UMat> uhighResBackwardMotions_;
+
+        std::vector<UMat> uforwardMaps_;
+        std::vector<UMat> ubackwardMaps_;
+
+        UMat uhighRes_;
+
+        UMat udiffTerm_, uregTerm_;
+        UMat ua_, ub_, uc_;
+#endif
     };
 
     BTVL1_Base::BTVL1_Base()
@@ -325,7 +549,101 @@ namespace
         curAlpha_ = -1.0;
     }
 
-    void BTVL1_Base::process(const std::vector<Mat>& src, Mat& dst, const std::vector<Mat>& forwardMotions, const std::vector<Mat>& backwardMotions, int baseIdx)
+#ifdef HAVE_OPENCL
+
+    bool BTVL1_Base::ocl_process(InputArrayOfArrays _src, OutputArray _dst, InputArrayOfArrays _forwardMotions,
+                                 InputArrayOfArrays _backwardMotions, int baseIdx)
+    { // UMat counterpart of process(); works on the u*-prefixed UMat members and always returns true
+        std::vector<UMat> & src = *(std::vector<UMat> *)_src.getObj(), // unchecked casts: process() only dispatches here when these are UMat vectors
+                & forwardMotions = *(std::vector<UMat> *)_forwardMotions.getObj(),
+                & backwardMotions = *(std::vector<UMat> *)_backwardMotions.getObj();
+
+        // update blur filter and btv weights (NOTE(review): filter_ is refreshed here but not applied in this function; blurring below calls GaussianBlur directly)
+        if (!filter_ || blurKernelSize_ != curBlurKernelSize_ || blurSigma_ != curBlurSigma_ || src[0].type() != curSrcType_)
+        {
+            filter_ = createGaussianFilter(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+            curBlurKernelSize_ = blurKernelSize_;
+            curBlurSigma_ = blurSigma_;
+            curSrcType_ = src[0].type();
+        }
+
+        if (btvWeights_.empty() || btvKernelSize_ != curBtvKernelSize_ || alpha_ != curAlpha_)
+        {
+            calcBtvWeights(btvKernelSize_, alpha_, btvWeights_);
+            Mat(btvWeights_, true).copyTo(ubtvWeights_); // UMat copy of the weights, passed to calcBtvRegularization() below
+
+            curBtvKernelSize_ = btvKernelSize_;
+            curAlpha_ = alpha_;
+        }
+
+        // calc high res motions
+        calcRelativeMotions(forwardMotions, backwardMotions, ulowResForwardMotions_, ulowResBackwardMotions_, baseIdx, src[0].size());
+
+        upscaleMotions(ulowResForwardMotions_, uhighResForwardMotions_, scale_);
+        upscaleMotions(ulowResBackwardMotions_, uhighResBackwardMotions_, scale_);
+
+        uforwardMaps_.resize(uhighResForwardMotions_.size());
+        ubackwardMaps_.resize(uhighResForwardMotions_.size());
+        for (size_t i = 0; i < uhighResForwardMotions_.size(); ++i)
+            buildMotionMaps(uhighResForwardMotions_[i], uhighResBackwardMotions_[i], uforwardMaps_[i], ubackwardMaps_[i]);
+
+        // initial estimation
+        const Size lowResSize = src[0].size();
+        const Size highResSize(lowResSize.width * scale_, lowResSize.height * scale_);
+
+        resize(src[baseIdx], uhighRes_, highResSize, 0, 0, INTER_LINEAR); // TODO: the Mat path in process() uses INTER_CUBIC for this initial estimate
+
+        // iterations
+        udiffTerm_.create(highResSize, uhighRes_.type());
+        ua_.create(highResSize, uhighRes_.type());
+        ub_.create(highResSize, uhighRes_.type());
+        uc_.create(lowResSize, uhighRes_.type());
+
+        for (int i = 0; i < iterations_; ++i)
+        {
+            udiffTerm_.setTo(Scalar::all(0));
+
+            for (size_t k = 0; k < src.size(); ++k)
+            {
+                // a = M * Ih
+                remap(uhighRes_, ua_, ubackwardMaps_[k], noArray(), INTER_NEAREST);
+                // b = HM * Ih
+                GaussianBlur(ua_, ub_, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                // c = DHM * Ih
+                resize(ub_, uc_, lowResSize, 0, 0, INTER_NEAREST);
+
+                diffSign(src[k], uc_, uc_); // uc_ is both the second input and the output
+
+                // a = Dt * diff
+                upscale(uc_, ua_, scale_);
+
+                // b = HtDt * diff
+                GaussianBlur(ua_, ub_, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                // a = MtHtDt * diff
+                remap(ub_, ua_, uforwardMaps_[k], noArray(), INTER_NEAREST);
+
+                add(udiffTerm_, ua_, udiffTerm_);
+            }
+
+            if (lambda_ > 0)
+            {
+                calcBtvRegularization(uhighRes_, uregTerm_, btvKernelSize_, btvWeights_, ubtvWeights_);
+                addWeighted(udiffTerm_, 1.0, uregTerm_, -lambda_, 0.0, udiffTerm_);
+            }
+
+            addWeighted(uhighRes_, 1.0, udiffTerm_, tau_, 0.0, uhighRes_);
+        }
+
+        Rect inner(btvKernelSize_, btvKernelSize_, uhighRes_.cols - 2 * btvKernelSize_, uhighRes_.rows - 2 * btvKernelSize_); // result excludes a btvKernelSize_-wide border on every side
+        uhighRes_(inner).copyTo(_dst);
+
+        return true;
+    }
+
+#endif
+
+    void BTVL1_Base::process(InputArrayOfArrays _src, OutputArray _dst, InputArrayOfArrays _forwardMotions,
+                             InputArrayOfArrays _backwardMotions, int baseIdx)
     {
         CV_Assert( scale_ > 1 );
         CV_Assert( iterations_ > 0 );
@@ -335,8 +653,15 @@ namespace
         CV_Assert( blurKernelSize_ > 0 );
         CV_Assert( blurSigma_ >= 0.0 );
 
-        // update blur filter and btv weights
+        CV_OCL_RUN(_src.isUMatVector() && _dst.isUMat() && _forwardMotions.isUMatVector() &&
+                   _backwardMotions.isUMatVector(),
+                   ocl_process(_src, _dst, _forwardMotions, _backwardMotions, baseIdx))
+
+        std::vector<Mat> & src = *(std::vector<Mat> *)_src.getObj(),
+                & forwardMotions = *(std::vector<Mat> *)_forwardMotions.getObj(),
+                & backwardMotions = *(std::vector<Mat> *)_backwardMotions.getObj();
 
+        // update blur filter and btv weights
         if (!filter_ || blurKernelSize_ != curBlurKernelSize_ || blurSigma_ != curBlurSigma_ || src[0].type() != curSrcType_)
         {
             filter_ = createGaussianFilter(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
@@ -353,7 +678,6 @@ namespace
         }
 
         // calc high res motions
-
         calcRelativeMotions(forwardMotions, backwardMotions, lowResForwardMotions_, lowResBackwardMotions_, baseIdx, src[0].size());
 
         upscaleMotions(lowResForwardMotions_, highResForwardMotions_, scale_);
@@ -365,14 +689,12 @@ namespace
             buildMotionMaps(highResForwardMotions_[i], highResBackwardMotions_[i], forwardMaps_[i], backwardMaps_[i]);
 
         // initial estimation
-
         const Size lowResSize = src[0].size();
         const Size highResSize(lowResSize.width * scale_, lowResSize.height * scale_);
 
         resize(src[baseIdx], highRes_, highResSize, 0, 0, INTER_CUBIC);
 
         // iterations
-
         diffTerm_.create(highResSize, highRes_.type());
         a_.create(highResSize, highRes_.type());
         b_.create(highResSize, highRes_.type());
@@ -405,7 +727,7 @@ namespace
 
             if (lambda_ > 0)
             {
-                calcBtvRegularization(highRes_, regTerm_, btvKernelSize_, btvWeights_);
+                calcBtvRegularization(highRes_, regTerm_, btvKernelSize_, btvWeights_, ubtvWeights_);
                 addWeighted(diffTerm_, 1.0, regTerm_, -lambda_, 0.0, diffTerm_);
             }
 
@@ -413,13 +735,14 @@ namespace
         }
 
         Rect inner(btvKernelSize_, btvKernelSize_, highRes_.cols - 2 * btvKernelSize_, highRes_.rows - 2 * btvKernelSize_);
-        highRes_(inner).copyTo(dst);
+        highRes_(inner).copyTo(_dst);
     }
 
     void BTVL1_Base::collectGarbage()
     {
         filter_.release();
 
+        // Mat
         lowResForwardMotions_.clear();
         lowResBackwardMotions_.clear();
 
@@ -436,11 +759,32 @@ namespace
         a_.release();
         b_.release();
         c_.release();
+
+#ifdef HAVE_OPENCL
+        // UMat
+        ulowResForwardMotions_.clear();
+        ulowResBackwardMotions_.clear();
+
+        uhighResForwardMotions_.clear();
+        uhighResBackwardMotions_.clear();
+
+        uforwardMaps_.clear();
+        ubackwardMaps_.clear();
+
+        uhighRes_.release();
+
+        udiffTerm_.release();
+        uregTerm_.release();
+        ua_.release();
+        ub_.release();
+        uc_.release();
+#endif
     }
 
 ////////////////////////////////////////////////////////////////////
 
-    class BTVL1 : public SuperResolution, private BTVL1_Base
+    class BTVL1 :
+            public SuperResolution, private BTVL1_Base
     {
     public:
         AlgorithmInfo* info() const;
@@ -451,14 +795,25 @@ namespace
 
     protected:
         void initImpl(Ptr<FrameSource>& frameSource);
+        bool ocl_initImpl(Ptr<FrameSource>& frameSource);
+
         void processImpl(Ptr<FrameSource>& frameSource, OutputArray output);
+        bool ocl_processImpl(Ptr<FrameSource>& frameSource, OutputArray output);
 
     private:
         int temporalAreaRadius_;
 
         void readNextFrame(Ptr<FrameSource>& frameSource);
+        bool ocl_readNextFrame(Ptr<FrameSource>& frameSource);
+
         void processFrame(int idx);
+        bool ocl_processFrame(int idx);
+
+        int storePos_;
+        int procPos_;
+        int outPos_;
 
+        // Mat
         Mat curFrame_;
         Mat prevFrame_;
 
@@ -467,14 +822,25 @@ namespace
         std::vector<Mat> backwardMotions_;
         std::vector<Mat> outputs_;
 
-        int storePos_;
-        int procPos_;
-        int outPos_;
-
         std::vector<Mat> srcFrames_;
         std::vector<Mat> srcForwardMotions_;
         std::vector<Mat> srcBackwardMotions_;
         Mat finalOutput_;
+
+#ifdef HAVE_OPENCL
+        // UMat
+        UMat ucurFrame_;
+        UMat uprevFrame_;
+
+        std::vector<UMat> uframes_;
+        std::vector<UMat> uforwardMotions_;
+        std::vector<UMat> ubackwardMotions_;
+        std::vector<UMat> uoutputs_;
+
+        std::vector<UMat> usrcFrames_;
+        std::vector<UMat> usrcForwardMotions_;
+        std::vector<UMat> usrcBackwardMotions_;
+#endif
     };
 
     CV_INIT_ALGORITHM(BTVL1, "SuperResolution.BTVL1",
@@ -496,6 +862,7 @@ namespace
 
     void BTVL1::collectGarbage()
     {
+        // Mat
         curFrame_.release();
         prevFrame_.release();
 
@@ -509,10 +876,52 @@ namespace
         srcBackwardMotions_.clear();
         finalOutput_.release();
 
+#ifdef HAVE_OPENCL
+        // UMat
+        ucurFrame_.release();
+        uprevFrame_.release();
+
+        uframes_.clear();
+        uforwardMotions_.clear();
+        ubackwardMotions_.clear();
+        uoutputs_.clear();
+
+        usrcFrames_.clear();
+        usrcForwardMotions_.clear();
+        usrcBackwardMotions_.clear();
+#endif
+
         SuperResolution::collectGarbage();
         BTVL1_Base::collectGarbage();
     }
 
+#ifdef HAVE_OPENCL
+
+    // UMat counterpart of initImpl(): sizes the UMat caches, reads the first frames and processes them.
+    bool BTVL1::ocl_initImpl(Ptr<FrameSource>& frameSource)
+    {
+        const int radius = temporalAreaRadius_;
+        const int slots = 2 * radius + 1;
+
+        // every cache holds 2 * radius + 1 entries
+        uframes_.resize(slots);
+        uforwardMotions_.resize(slots);
+        ubackwardMotions_.resize(slots);
+        uoutputs_.resize(slots);
+
+        // request 2 * radius + 1 frames from the source, then process frames 0..radius
+        storePos_ = -1;
+        for (int n = 0; n < slots; ++n)
+            readNextFrame(frameSource);
+        for (int idx = 0; idx <= radius; ++idx)
+            processFrame(idx);
+        procPos_ = radius;
+        outPos_ = -1;
+        return true;
+    }
+
+#endif
+
     void BTVL1::initImpl(Ptr<FrameSource>& frameSource)
     {
         const int cacheSize = 2 * temporalAreaRadius_ + 1;
@@ -522,6 +931,9 @@ namespace
         backwardMotions_.resize(cacheSize);
         outputs_.resize(cacheSize);
 
+        CV_OCL_RUN(isUmat_,
+                   ocl_initImpl(frameSource))
+
         storePos_ = -1;
 
         for (int t = -temporalAreaRadius_; t <= temporalAreaRadius_; ++t)
@@ -534,6 +946,18 @@ namespace
         outPos_ = -1;
     }
 
+#ifdef HAVE_OPENCL
+
+    // UMat counterpart of processImpl()'s output step; the frame source is not used here.
+    bool BTVL1::ocl_processImpl(Ptr<FrameSource>& /*frameSource*/, OutputArray _output)
+    {
+        // convert the entry of uoutputs_ selected by outPos_ to CV_8U, writing into _output
+        at(outPos_, uoutputs_).convertTo(_output, CV_8U);
+        return true;
+    }
+
+#endif
+
     void BTVL1::processImpl(Ptr<FrameSource>& frameSource, OutputArray _output)
     {
         if (outPos_ >= storePos_)
@@ -549,11 +973,14 @@ namespace
             ++procPos_;
             processFrame(procPos_);
         }
-
         ++outPos_;
+
+        CV_OCL_RUN(isUmat_,
+                   ocl_processImpl(frameSource, _output))
+
         const Mat& curOutput = at(outPos_, outputs_);
 
-        if (_output.kind() < _InputArray::OPENGL_BUFFER)
+        if (_output.kind() < _InputArray::OPENGL_BUFFER || _output.isUMat())
             curOutput.convertTo(_output, CV_8U);
         else
         {
@@ -562,14 +989,41 @@ namespace
         }
     }
 
+#ifdef HAVE_OPENCL
+
+    // UMat counterpart of readNextFrame()'s bookkeeping: caches ucurFrame_ as CV_32F and updates motions.
+    bool BTVL1::ocl_readNextFrame(Ptr<FrameSource>& /*frameSource*/)
+    {
+        const int pos = storePos_;
+        ucurFrame_.convertTo(at(pos, uframes_), CV_32F);
+        if (pos >= 1) // both flow directions need a previous frame
+        {
+            opticalFlow_->calc(uprevFrame_, ucurFrame_, at(pos - 1, uforwardMotions_));
+            opticalFlow_->calc(ucurFrame_, uprevFrame_, at(pos, ubackwardMotions_));
+        }
+        ucurFrame_.copyTo(uprevFrame_);
+        return true;
+    }
+
+#endif
+
     void BTVL1::readNextFrame(Ptr<FrameSource>& frameSource)
     {
         frameSource->nextFrame(curFrame_);
-
         if (curFrame_.empty())
             return;
 
+#ifdef HAVE_OPENCL
+        if (isUmat_ && curFrame_.channels() == 1)
+            curFrame_.copyTo(ucurFrame_);
+        else
+            isUmat_ = false;
+#endif
         ++storePos_;
+
+        CV_OCL_RUN(isUmat_,
+                   ocl_readNextFrame(frameSource))
+
         curFrame_.convertTo(at(storePos_, frames_), CV_32F);
 
         if (storePos_ > 0)
@@ -581,8 +1035,47 @@ namespace
         curFrame_.copyTo(prevFrame_);
     }
 
+#ifdef HAVE_OPENCL
+
+    // UMat counterpart of processFrame(): collects the cached frames/motions around idx and runs process().
+    bool BTVL1::ocl_processFrame(int idx)
+    {
+        // window [firstIdx, lastIdx] of frame indices around idx, capped by the last stored position
+        const int firstIdx = std::max(idx - temporalAreaRadius_, 0);
+        const int lastIdx = std::min(firstIdx + 2 * temporalAreaRadius_, storePos_);
+        const int windowSize = lastIdx - firstIdx + 1;
+
+        // idx must lie inside the window, otherwise process() would be called with an invalid base index
+        CV_Assert( idx >= firstIdx && idx <= lastIdx );
+
+        usrcFrames_.resize(windowSize);
+        usrcForwardMotions_.resize(windowSize);
+        usrcBackwardMotions_.resize(windowSize);
+
+        for (int k = 0; k < windowSize; ++k)
+        {
+            const int i = firstIdx + k;
+            usrcFrames_[k] = at(i, uframes_);
+
+            // forward motion for every frame but the last one, backward motion for every frame but the first
+            if (i < lastIdx)
+                usrcForwardMotions_[k] = at(i, uforwardMotions_);
+            if (i > firstIdx)
+                usrcBackwardMotions_[k] = at(i, ubackwardMotions_);
+        }
+
+        // the base frame sits at offset idx - firstIdx inside the window
+        process(usrcFrames_, at(idx, uoutputs_), usrcForwardMotions_, usrcBackwardMotions_, idx - firstIdx);
+        return true;
+    }
+
+#endif
+
     void BTVL1::processFrame(int idx)
     {
+        CV_OCL_RUN(isUmat_,
+                   ocl_processFrame(idx))
+
         const int startIdx = std::max(idx - temporalAreaRadius_, 0);
         const int procIdx = idx;
         const int endIdx = std::min(startIdx + 2 * temporalAreaRadius_, storePos_);
diff --git a/modules/superres/src/btv_l1_ocl.cpp b/modules/superres/src/btv_l1_ocl.cpp
deleted file mode 100644
index cfaf583..0000000
--- a/modules/superres/src/btv_l1_ocl.cpp
+++ /dev/null
@@ -1,725 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//		Jin Ma, jin@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-// S. Farsiu , D. Robinson, M. Elad, P. Milanfar. Fast and robust multiframe super resolution.
-// Dennis Mitzel, Thomas Pock, Thomas Schoenemann, Daniel Cremers. Video Super Resolution using Duality Based TV-L1 Optical Flow.
-
-#include "precomp.hpp"
-
-#if !defined(HAVE_OPENCL) || !defined(HAVE_OPENCV_OCL)
-
-cv::Ptr<cv::superres::SuperResolution> cv::superres::createSuperResolution_BTVL1_OCL()
-{
-    CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    return Ptr<SuperResolution>();
-}
-
-#else
-#include "opencl_kernels.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::ocl;
-using namespace cv::superres;
-using namespace cv::superres::detail;
-
-static ProgramEntry superres_btvl1 = cv::ocl::superres::superres_btvl1;
-
-namespace cv
-{
-    namespace ocl
-    {
-        float* btvWeights_ = NULL;
-        size_t btvWeights_size = 0;
-        oclMat c_btvRegWeights;
-    }
-}
-
-namespace btv_l1_device_ocl
-{
-    void buildMotionMaps(const oclMat& forwardMotionX, const oclMat& forwardMotionY,
-        const oclMat& backwardMotionX, const oclMat& bacwardMotionY,
-        oclMat& forwardMapX, oclMat& forwardMapY,
-        oclMat& backwardMapX, oclMat& backwardMapY);
-
-    void upscale(const oclMat& src, oclMat& dst, int scale);
-
-    void diffSign(const oclMat& src1, const oclMat& src2, oclMat& dst);
-
-    void calcBtvRegularization(const oclMat& src, oclMat& dst, int ksize);
-}
-
-void btv_l1_device_ocl::buildMotionMaps(const oclMat& forwardMotionX, const oclMat& forwardMotionY,
-    const oclMat& backwardMotionX, const oclMat& backwardMotionY,
-    oclMat& forwardMapX, oclMat& forwardMapY,
-    oclMat& backwardMapX, oclMat& backwardMapY)
-{
-    Context* clCxt = Context::getContext();
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {forwardMapX.cols, forwardMapX.rows, 1};
-
-    int forwardMotionX_step = (int)(forwardMotionX.step/forwardMotionX.elemSize());
-    int forwardMotionY_step = (int)(forwardMotionY.step/forwardMotionY.elemSize());
-    int backwardMotionX_step = (int)(backwardMotionX.step/backwardMotionX.elemSize());
-    int backwardMotionY_step = (int)(backwardMotionY.step/backwardMotionY.elemSize());
-    int forwardMapX_step = (int)(forwardMapX.step/forwardMapX.elemSize());
-    int forwardMapY_step = (int)(forwardMapY.step/forwardMapY.elemSize());
-    int backwardMapX_step = (int)(backwardMapX.step/backwardMapX.elemSize());
-    int backwardMapY_step = (int)(backwardMapY.step/backwardMapY.elemSize());
-
-    String kernel_name = "buildMotionMapsKernel";
-    vector< pair<size_t, const void*> > args;
-
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&forwardMotionX.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&forwardMotionY.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&backwardMotionX.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&backwardMotionY.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&forwardMapX.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&forwardMapY.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&backwardMapX.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&backwardMapY.data));
-
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMotionX.rows));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMotionY.cols));
-
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMotionX_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMotionY_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&backwardMotionX_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&backwardMotionY_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMapX_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMapY_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&backwardMapX_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&backwardMapY_step));
-
-    openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1);
-}
-
-void btv_l1_device_ocl::upscale(const oclMat& src, oclMat& dst, int scale)
-{
-    Context* clCxt = Context::getContext();
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {src.cols, src.rows, 1};
-
-    int src_step = (int)(src.step/src.elemSize());
-    int dst_step = (int)(dst.step/dst.elemSize());
-
-    String kernel_name = "upscaleKernel";
-    vector< pair<size_t, const void*> > args;
-
-    int cn = src.oclchannels();
-
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&src.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&dst.data));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&dst_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src.rows));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src.cols));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&scale));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&cn));
-
-    openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1);
-
-}
-
-void btv_l1_device_ocl::diffSign(const oclMat& src1, const oclMat& src2, oclMat& dst)
-{
-    Context* clCxt = Context::getContext();
-
-    oclMat src1_ = src1.reshape(1);
-    oclMat src2_ = src2.reshape(1);
-    oclMat dst_ = dst.reshape(1);
-
-    int src1_step = (int)(src1_.step/src1_.elemSize());
-    int src2_step = (int)(src2_.step/src2_.elemSize());
-    int dst_step = (int)(dst_.step/dst_.elemSize());
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {src1_.cols, src1_.rows, 1};
-
-    String kernel_name = "diffSignKernel";
-    vector< pair<size_t, const void*> > args;
-
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&src1_.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&src2_.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&dst_.data));
-
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src1_.rows));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src1_.cols));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&dst_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src1_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src2_step));
-
-    openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1);
-}
-
-void btv_l1_device_ocl::calcBtvRegularization(const oclMat& src, oclMat& dst, int ksize)
-{
-    Context* clCxt = Context::getContext();
-
-    oclMat src_ = src.reshape(1);
-    oclMat dst_ = dst.reshape(1);
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {src.cols, src.rows, 1};
-
-    int src_step = (int)(src_.step/src_.elemSize());
-    int dst_step = (int)(dst_.step/dst_.elemSize());
-
-    String kernel_name = "calcBtvRegularizationKernel";
-    vector< pair<size_t, const void*> > args;
-
-    int cn = src.oclchannels();
-
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&src_.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&dst_.data));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&dst_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src.rows));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src.cols));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&ksize));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&cn));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&c_btvRegWeights.data));
-
-    openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1);
-}
-
-namespace
-{
-    void calcRelativeMotions(const vector<pair<oclMat, oclMat> >& forwardMotions, const vector<pair<oclMat, oclMat> >& backwardMotions,
-        vector<pair<oclMat, oclMat> >& relForwardMotions, vector<pair<oclMat, oclMat> >& relBackwardMotions,
-        int baseIdx, Size size)
-    {
-        const int count = static_cast<int>(forwardMotions.size());
-
-        relForwardMotions.resize(count);
-        relForwardMotions[baseIdx].first.create(size, CV_32FC1);
-        relForwardMotions[baseIdx].first.setTo(Scalar::all(0));
-        relForwardMotions[baseIdx].second.create(size, CV_32FC1);
-        relForwardMotions[baseIdx].second.setTo(Scalar::all(0));
-
-        relBackwardMotions.resize(count);
-        relBackwardMotions[baseIdx].first.create(size, CV_32FC1);
-        relBackwardMotions[baseIdx].first.setTo(Scalar::all(0));
-        relBackwardMotions[baseIdx].second.create(size, CV_32FC1);
-        relBackwardMotions[baseIdx].second.setTo(Scalar::all(0));
-
-        for (int i = baseIdx - 1; i >= 0; --i)
-        {
-            ocl::add(relForwardMotions[i + 1].first, forwardMotions[i].first, relForwardMotions[i].first);
-            ocl::add(relForwardMotions[i + 1].second, forwardMotions[i].second, relForwardMotions[i].second);
-
-            ocl::add(relBackwardMotions[i + 1].first, backwardMotions[i + 1].first, relBackwardMotions[i].first);
-            ocl::add(relBackwardMotions[i + 1].second, backwardMotions[i + 1].second, relBackwardMotions[i].second);
-        }
-
-        for (int i = baseIdx + 1; i < count; ++i)
-        {
-            ocl::add(relForwardMotions[i - 1].first, backwardMotions[i].first, relForwardMotions[i].first);
-            ocl::add(relForwardMotions[i - 1].second, backwardMotions[i].second, relForwardMotions[i].second);
-
-            ocl::add(relBackwardMotions[i - 1].first, forwardMotions[i - 1].first, relBackwardMotions[i].first);
-            ocl::add(relBackwardMotions[i - 1].second, forwardMotions[i - 1].second, relBackwardMotions[i].second);
-        }
-    }
-
-    void upscaleMotions(const vector<pair<oclMat, oclMat> >& lowResMotions, vector<pair<oclMat, oclMat> >& highResMotions, int scale)
-    {
-        highResMotions.resize(lowResMotions.size());
-
-        for (size_t i = 0; i < lowResMotions.size(); ++i)
-        {
-            ocl::resize(lowResMotions[i].first, highResMotions[i].first, Size(), scale, scale, INTER_LINEAR);
-            ocl::resize(lowResMotions[i].second, highResMotions[i].second, Size(), scale, scale, INTER_LINEAR);
-
-            ocl::multiply(scale, highResMotions[i].first, highResMotions[i].first);
-            ocl::multiply(scale, highResMotions[i].second, highResMotions[i].second);
-        }
-    }
-
-    void buildMotionMaps(const pair<oclMat, oclMat>& forwardMotion, const pair<oclMat, oclMat>& backwardMotion,
-        pair<oclMat, oclMat>& forwardMap, pair<oclMat, oclMat>& backwardMap)
-    {
-        forwardMap.first.create(forwardMotion.first.size(), CV_32FC1);
-        forwardMap.second.create(forwardMotion.first.size(), CV_32FC1);
-
-        backwardMap.first.create(forwardMotion.first.size(), CV_32FC1);
-        backwardMap.second.create(forwardMotion.first.size(), CV_32FC1);
-
-        btv_l1_device_ocl::buildMotionMaps(forwardMotion.first, forwardMotion.second,
-            backwardMotion.first, backwardMotion.second,
-            forwardMap.first, forwardMap.second,
-            backwardMap.first, backwardMap.second);
-    }
-
-    void upscale(const oclMat& src, oclMat& dst, int scale)
-    {
-        CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
-
-        btv_l1_device_ocl::upscale(src, dst, scale);
-    }
-
-    void diffSign(const oclMat& src1, const oclMat& src2, oclMat& dst)
-    {
-        dst.create(src1.size(), src1.type());
-
-        btv_l1_device_ocl::diffSign(src1, src2, dst);
-    }
-
-    void calcBtvWeights(int btvKernelSize, double alpha, vector<float>& btvWeights)
-    {
-        const size_t size = btvKernelSize * btvKernelSize;
-
-        btvWeights.resize(size);
-
-        const int ksize = (btvKernelSize - 1) / 2;
-        const float alpha_f = static_cast<float>(alpha);
-
-        for (int m = 0, ind = 0; m <= ksize; ++m)
-        {
-            for (int l = ksize; l + m >= 0; --l, ++ind)
-                btvWeights[ind] = pow(alpha_f, std::abs(m) + std::abs(l));
-        }
-
-        btvWeights_ = &btvWeights[0];
-        btvWeights_size = size;
-        Mat btvWeights_mheader(1, static_cast<int>(size), CV_32FC1, btvWeights_);
-        c_btvRegWeights = btvWeights_mheader;
-    }
-
-    void calcBtvRegularization(const oclMat& src, oclMat& dst, int btvKernelSize)
-    {
-        dst.create(src.size(), src.type());
-
-        const int ksize = (btvKernelSize - 1) / 2;
-
-        btv_l1_device_ocl::calcBtvRegularization(src, dst, ksize);
-    }
-
-    class BTVL1_OCL_Base
-    {
-    public:
-        BTVL1_OCL_Base();
-
-        void process(const vector<oclMat>& src, oclMat& dst,
-            const vector<pair<oclMat, oclMat> >& forwardMotions, const vector<pair<oclMat, oclMat> >& backwardMotions,
-            int baseIdx);
-
-        void collectGarbage();
-
-    protected:
-        int scale_;
-        int iterations_;
-        double lambda_;
-        double tau_;
-        double alpha_;
-        int btvKernelSize_;
-        int blurKernelSize_;
-        double blurSigma_;
-        Ptr<DenseOpticalFlowExt> opticalFlow_;
-
-    private:
-        vector<Ptr<cv::ocl::FilterEngine_GPU> > filters_;
-        int curBlurKernelSize_;
-        double curBlurSigma_;
-        int curSrcType_;
-
-        vector<float> btvWeights_;
-        int curBtvKernelSize_;
-        double curAlpha_;
-
-        vector<pair<oclMat, oclMat> > lowResForwardMotions_;
-        vector<pair<oclMat, oclMat> > lowResBackwardMotions_;
-
-        vector<pair<oclMat, oclMat> > highResForwardMotions_;
-        vector<pair<oclMat, oclMat> > highResBackwardMotions_;
-
-        vector<pair<oclMat, oclMat> > forwardMaps_;
-        vector<pair<oclMat, oclMat> > backwardMaps_;
-
-        oclMat highRes_;
-
-        vector<oclMat> diffTerms_;
-        oclMat a_, b_, c_, d_;
-        oclMat regTerm_;
-    };
-
-    BTVL1_OCL_Base::BTVL1_OCL_Base()
-    {
-        scale_ = 4;
-        iterations_ = 180;
-        lambda_ = 0.03;
-        tau_ = 1.3;
-        alpha_ = 0.7;
-        btvKernelSize_ = 7;
-        blurKernelSize_ = 5;
-        blurSigma_ = 0.0;
-        opticalFlow_ = createOptFlow_Farneback_OCL();
-
-        curBlurKernelSize_ = -1;
-        curBlurSigma_ = -1.0;
-        curSrcType_ = -1;
-
-        curBtvKernelSize_ = -1;
-        curAlpha_ = -1.0;
-    }
-
-    void BTVL1_OCL_Base::process(const vector<oclMat>& src, oclMat& dst,
-        const vector<pair<oclMat, oclMat> >& forwardMotions, const vector<pair<oclMat, oclMat> >& backwardMotions,
-        int baseIdx)
-    {
-        CV_Assert( scale_ > 1 );
-        CV_Assert( iterations_ > 0 );
-        CV_Assert( tau_ > 0.0 );
-        CV_Assert( alpha_ > 0.0 );
-        CV_Assert( btvKernelSize_ > 0 && btvKernelSize_ <= 16 );
-        CV_Assert( blurKernelSize_ > 0 );
-        CV_Assert( blurSigma_ >= 0.0 );
-
-        // update blur filter and btv weights
-
-        if (filters_.size() != src.size() || blurKernelSize_ != curBlurKernelSize_ || blurSigma_ != curBlurSigma_ || src[0].type() != curSrcType_)
-        {
-            filters_.resize(src.size());
-            for (size_t i = 0; i < src.size(); ++i)
-                filters_[i] = cv::ocl::createGaussianFilter_GPU(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
-            curBlurKernelSize_ = blurKernelSize_;
-            curBlurSigma_ = blurSigma_;
-            curSrcType_ = src[0].type();
-        }
-
-        if (btvWeights_.empty() || btvKernelSize_ != curBtvKernelSize_ || alpha_ != curAlpha_)
-        {
-            calcBtvWeights(btvKernelSize_, alpha_, btvWeights_);
-            curBtvKernelSize_ = btvKernelSize_;
-            curAlpha_ = alpha_;
-        }
-
-        // calc motions between input frames
-
-        calcRelativeMotions(forwardMotions, backwardMotions,
-            lowResForwardMotions_, lowResBackwardMotions_,
-            baseIdx, src[0].size());
-
-        upscaleMotions(lowResForwardMotions_, highResForwardMotions_, scale_);
-        upscaleMotions(lowResBackwardMotions_, highResBackwardMotions_, scale_);
-
-        forwardMaps_.resize(highResForwardMotions_.size());
-        backwardMaps_.resize(highResForwardMotions_.size());
-        for (size_t i = 0; i < highResForwardMotions_.size(); ++i)
-        {
-            buildMotionMaps(highResForwardMotions_[i], highResBackwardMotions_[i], forwardMaps_[i], backwardMaps_[i]);
-        }
-        // initial estimation
-
-        const Size lowResSize = src[0].size();
-        const Size highResSize(lowResSize.width * scale_, lowResSize.height * scale_);
-
-        ocl::resize(src[baseIdx], highRes_, highResSize, 0, 0, INTER_LINEAR);
-
-        // iterations
-
-        diffTerms_.resize(src.size());
-        bool d_inited = false;
-        a_.create(highRes_.size(), highRes_.type());
-        b_.create(highRes_.size(), highRes_.type());
-        c_.create(lowResSize, highRes_.type());
-        d_.create(highRes_.rows, highRes_.cols, highRes_.type());
-        for (int i = 0; i < iterations_; ++i)
-        {
-            if(!d_inited)
-            {
-                d_.setTo(0);
-                d_inited = true;
-            }
-            for (size_t k = 0; k < src.size(); ++k)
-            {
-                diffTerms_[k].create(highRes_.size(), highRes_.type());
-                // a = M * Ih
-                ocl::remap(highRes_, a_, backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_CONSTANT, Scalar());
-                // b = HM * Ih
-                filters_[k]->apply(a_, b_, Rect(0,0,-1,-1));
-                // c = DHF * Ih
-                ocl::resize(b_, c_, lowResSize, 0, 0, INTER_NEAREST);
-
-                diffSign(src[k], c_, c_);
-
-                // a = Dt * diff
-                upscale(c_, d_, scale_);
-                // b = HtDt * diff
-                filters_[k]->apply(d_, b_, Rect(0,0,-1,-1));
-                // diffTerm = MtHtDt * diff
-                ocl::remap(b_, diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_CONSTANT, Scalar());
-            }
-
-            if (lambda_ > 0)
-            {
-                calcBtvRegularization(highRes_, regTerm_, btvKernelSize_);
-                ocl::addWeighted(highRes_, 1.0, regTerm_, -tau_ * lambda_, 0.0, highRes_);
-            }
-
-            for (size_t k = 0; k < src.size(); ++k)
-            {
-                ocl::addWeighted(highRes_, 1.0, diffTerms_[k], tau_, 0.0, highRes_);
-            }
-        }
-
-        Rect inner(btvKernelSize_, btvKernelSize_, highRes_.cols - 2 * btvKernelSize_, highRes_.rows - 2 * btvKernelSize_);
-        highRes_(inner).copyTo(dst);
-    }
-
-    void BTVL1_OCL_Base::collectGarbage()
-    {
-        filters_.clear();
-
-        lowResForwardMotions_.clear();
-        lowResBackwardMotions_.clear();
-
-        highResForwardMotions_.clear();
-        highResBackwardMotions_.clear();
-
-        forwardMaps_.clear();
-        backwardMaps_.clear();
-
-        highRes_.release();
-
-        diffTerms_.clear();
-        a_.release();
-        b_.release();
-        c_.release();
-        regTerm_.release();
-        c_btvRegWeights.release();
-    }
-
-    ////////////////////////////////////////////////////////////
-
-    class BTVL1_OCL : public SuperResolution, private BTVL1_OCL_Base
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        BTVL1_OCL();
-
-        void collectGarbage();
-
-    protected:
-        void initImpl(Ptr<FrameSource>& frameSource);
-        void processImpl(Ptr<FrameSource>& frameSource, OutputArray output);
-
-    private:
-        int temporalAreaRadius_;
-
-        void readNextFrame(Ptr<FrameSource>& frameSource);
-        void processFrame(int idx);
-
-        oclMat curFrame_;
-        oclMat prevFrame_;
-
-        vector<oclMat> frames_;
-        vector<pair<oclMat, oclMat> > forwardMotions_;
-        vector<pair<oclMat, oclMat> > backwardMotions_;
-        vector<oclMat> outputs_;
-
-        int storePos_;
-        int procPos_;
-        int outPos_;
-
-        vector<oclMat> srcFrames_;
-        vector<pair<oclMat, oclMat> > srcForwardMotions_;
-        vector<pair<oclMat, oclMat> > srcBackwardMotions_;
-        oclMat finalOutput_;
-    };
-
-    CV_INIT_ALGORITHM(BTVL1_OCL, "SuperResolution.BTVL1_OCL",
-    obj.info()->addParam(obj, "scale", obj.scale_, false, 0, 0, "Scale factor.");
-    obj.info()->addParam(obj, "iterations", obj.iterations_, false, 0, 0, "Iteration count.");
-    obj.info()->addParam(obj, "tau", obj.tau_, false, 0, 0, "Asymptotic value of steepest descent method.");
-    obj.info()->addParam(obj, "lambda", obj.lambda_, false, 0, 0, "Weight parameter to balance data term and smoothness term.");
-    obj.info()->addParam(obj, "alpha", obj.alpha_, false, 0, 0, "Parameter of spacial distribution in Bilateral-TV.");
-    obj.info()->addParam(obj, "btvKernelSize", obj.btvKernelSize_, false, 0, 0, "Kernel size of Bilateral-TV filter.");
-    obj.info()->addParam(obj, "blurKernelSize", obj.blurKernelSize_, false, 0, 0, "Gaussian blur kernel size.");
-    obj.info()->addParam(obj, "blurSigma", obj.blurSigma_, false, 0, 0, "Gaussian blur sigma.");
-    obj.info()->addParam(obj, "temporalAreaRadius", obj.temporalAreaRadius_, false, 0, 0, "Radius of the temporal search area.");
-    obj.info()->addParam<DenseOpticalFlowExt>(obj, "opticalFlow", obj.opticalFlow_, false, 0, 0, "Dense optical flow algorithm."))
-
-    BTVL1_OCL::BTVL1_OCL()
-    {
-        temporalAreaRadius_ = 4;
-    }
-
-    void BTVL1_OCL::collectGarbage()
-    {
-        curFrame_.release();
-        prevFrame_.release();
-
-        frames_.clear();
-        forwardMotions_.clear();
-        backwardMotions_.clear();
-        outputs_.clear();
-
-        srcFrames_.clear();
-        srcForwardMotions_.clear();
-        srcBackwardMotions_.clear();
-        finalOutput_.release();
-
-        SuperResolution::collectGarbage();
-        BTVL1_OCL_Base::collectGarbage();
-    }
-
-    void BTVL1_OCL::initImpl(Ptr<FrameSource>& frameSource)
-    {
-        const int cacheSize = 2 * temporalAreaRadius_ + 1;
-
-        frames_.resize(cacheSize);
-        forwardMotions_.resize(cacheSize);
-        backwardMotions_.resize(cacheSize);
-        outputs_.resize(cacheSize);
-
-        storePos_ = -1;
-
-        for (int t = -temporalAreaRadius_; t <= temporalAreaRadius_; ++t)
-            readNextFrame(frameSource);
-
-        for (int i = 0; i <= temporalAreaRadius_; ++i)
-            processFrame(i);
-
-        procPos_ = temporalAreaRadius_;
-        outPos_ = -1;
-    }
-
-    void BTVL1_OCL::processImpl(Ptr<FrameSource>& frameSource, OutputArray _output)
-    {
-        if (outPos_ >= storePos_)
-        {
-            if(_output.kind() == _InputArray::OCL_MAT)
-            {
-                getOclMatRef(_output).release();
-            }
-            else
-            {
-                _output.release();
-            }
-            return;
-        }
-
-        readNextFrame(frameSource);
-
-        if (procPos_ < storePos_)
-        {
-            ++procPos_;
-            processFrame(procPos_);
-        }
-
-        ++outPos_;
-        const oclMat& curOutput = at(outPos_, outputs_);
-
-        if (_output.kind() == _InputArray::OCL_MAT)
-            curOutput.convertTo(getOclMatRef(_output), CV_8U);
-        else
-        {
-            curOutput.convertTo(finalOutput_, CV_8U);
-            arrCopy(finalOutput_, _output);
-        }
-    }
-
-    void BTVL1_OCL::readNextFrame(Ptr<FrameSource>& frameSource)
-    {
-        curFrame_.release();
-        frameSource->nextFrame(curFrame_);
-
-        if (curFrame_.empty())
-            return;
-
-        ++storePos_;
-        curFrame_.convertTo(at(storePos_, frames_), CV_32F);
-
-        if (storePos_ > 0)
-        {
-            pair<oclMat, oclMat>& forwardMotion = at(storePos_ - 1, forwardMotions_);
-            pair<oclMat, oclMat>& backwardMotion = at(storePos_, backwardMotions_);
-
-            opticalFlow_->calc(prevFrame_, curFrame_, forwardMotion.first, forwardMotion.second);
-            opticalFlow_->calc(curFrame_, prevFrame_, backwardMotion.first, backwardMotion.second);
-        }
-
-        curFrame_.copyTo(prevFrame_);
-    }
-
-    void BTVL1_OCL::processFrame(int idx)
-    {
-        const int startIdx = max(idx - temporalAreaRadius_, 0);
-        const int procIdx = idx;
-        const int endIdx = min(startIdx + 2 * temporalAreaRadius_, storePos_);
-
-        const int count = endIdx - startIdx + 1;
-
-        srcFrames_.resize(count);
-        srcForwardMotions_.resize(count);
-        srcBackwardMotions_.resize(count);
-
-        int baseIdx = -1;
-
-        for (int i = startIdx, k = 0; i <= endIdx; ++i, ++k)
-        {
-            if (i == procIdx)
-                baseIdx = k;
-
-            srcFrames_[k] = at(i, frames_);
-
-            if (i < endIdx)
-                srcForwardMotions_[k] = at(i, forwardMotions_);
-            if (i > startIdx)
-                srcBackwardMotions_[k] = at(i, backwardMotions_);
-        }
-
-        process(srcFrames_, at(idx, outputs_), srcForwardMotions_, srcBackwardMotions_, baseIdx);
-    }
-}
-
-Ptr<SuperResolution> cv::superres::createSuperResolution_BTVL1_OCL()
-{
-    return makePtr<BTVL1_OCL>();
-}
-#endif
diff --git a/modules/superres/src/frame_source.cpp b/modules/superres/src/frame_source.cpp
index 14481b8..c572c09 100644
--- a/modules/superres/src/frame_source.cpp
+++ b/modules/superres/src/frame_source.cpp
@@ -115,25 +115,18 @@ namespace
     void CaptureFrameSource::nextFrame(OutputArray _frame)
     {
         if (_frame.kind() == _InputArray::MAT)
-        {
             vc_ >> _frame.getMatRef();
-        }
         else if(_frame.kind() == _InputArray::GPU_MAT)
         {
             vc_ >> frame_;
             arrCopy(frame_, _frame);
         }
-        else if(_frame.kind() == _InputArray::OCL_MAT)
-        {
-            vc_ >> frame_;
-            if(!frame_.empty())
-            {
-                arrCopy(frame_, _frame);
-            }
-        }
+        else if (_frame.isUMat())
+            vc_ >> *(UMat *)_frame.getObj();
         else
         {
-            //should never get here
+            // should never get here
+            CV_Assert(0);
         }
     }
 
diff --git a/modules/superres/src/input_array_utility.cpp b/modules/superres/src/input_array_utility.cpp
index 5b87267..6b306d2 100644
--- a/modules/superres/src/input_array_utility.cpp
+++ b/modules/superres/src/input_array_utility.cpp
@@ -62,6 +62,23 @@ Mat cv::superres::arrGetMat(InputArray arr, Mat& buf)
     }
 }
 
+UMat cv::superres::arrGetUMat(InputArray arr, UMat& buf)
+{
+    switch (arr.kind())
+    {
+    case _InputArray::GPU_MAT:
+        arr.getGpuMat().download(buf);
+        return buf;
+
+    case _InputArray::OPENGL_BUFFER:
+        arr.getOGlBuffer().copyTo(buf);
+        return buf;
+
+    default:
+        return arr.getUMat();
+    }
+}
+
 GpuMat cv::superres::arrGetGpuMat(InputArray arr, GpuMat& buf)
 {
     switch (arr.kind())
@@ -108,62 +125,39 @@ namespace
     {
         src.getGpuMat().copyTo(dst.getGpuMatRef());
     }
-#ifdef HAVE_OPENCV_OCL
-    void ocl2mat(InputArray src, OutputArray dst)
-    {
-        dst.getMatRef() = (Mat)ocl::getOclMatRef(src);
-    }
-    void mat2ocl(InputArray src, OutputArray dst)
-    {
-        Mat m = src.getMat();
-        ocl::getOclMatRef(dst) = (ocl::oclMat)m;
-    }
-    void ocl2ocl(InputArray src, OutputArray dst)
-    {
-        ocl::getOclMatRef(src).copyTo(ocl::getOclMatRef(dst));
-    }
-#else
-    void ocl2mat(InputArray, OutputArray)
-    {
-        CV_Error(Error::StsNotImplemented, "The called functionality is disabled for current build or platform");;
-    }
-    void mat2ocl(InputArray, OutputArray)
-    {
-        CV_Error(Error::StsNotImplemented, "The called functionality is disabled for current build or platform");;
-    }
-    void ocl2ocl(InputArray, OutputArray)
-    {
-        CV_Error(Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    }
-#endif
 }
 
 void cv::superres::arrCopy(InputArray src, OutputArray dst)
 {
+    if (dst.isUMat() || src.isUMat())
+    {
+        src.copyTo(dst);
+        return;
+    }
+
     typedef void (*func_t)(InputArray src, OutputArray dst);
-    static const func_t funcs[11][11] =
+    static const func_t funcs[10][10] =
     {
-        {0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, 0 /*buf2arr*/, buf2arr, 0      },
-        {0, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0},
-        {0, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, arr2buf, 0 /*arr2tex*/, gpu2gpu, 0      },
-        {0, ocl2mat, ocl2mat, ocl2mat, ocl2mat, ocl2mat, ocl2mat, 0,       0,             0,       ocl2ocl}
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, 0, buf2arr },
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+        { 0, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, arr2buf, 0 , gpu2gpu },
     };
 
     const int src_kind = src.kind() >> _InputArray::KIND_SHIFT;
     const int dst_kind = dst.kind() >> _InputArray::KIND_SHIFT;
 
-    CV_DbgAssert( src_kind >= 0 && src_kind < 11 );
-    CV_DbgAssert( dst_kind >= 0 && dst_kind < 11 );
+    CV_Assert( src_kind >= 0 && src_kind < 10 );
+    CV_Assert( dst_kind >= 0 && dst_kind < 10 );
 
     const func_t func = funcs[src_kind][dst_kind];
-    CV_DbgAssert( func != 0 );
+    CV_Assert( func != 0 );
 
     func(src, dst);
 }
@@ -172,20 +166,21 @@ namespace
 {
     void convertToCn(InputArray src, OutputArray dst, int cn)
     {
-        CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
+        int scn = src.channels();
+        CV_Assert( scn == 1 || scn == 3 || scn == 4 );
         CV_Assert( cn == 1 || cn == 3 || cn == 4 );
 
         static const int codes[5][5] =
         {
-            {-1, -1, -1, -1, -1},
-            {-1, -1, -1, COLOR_GRAY2BGR, COLOR_GRAY2BGRA},
-            {-1, -1, -1, -1, -1},
-            {-1, COLOR_BGR2GRAY, -1, -1, COLOR_BGR2BGRA},
-            {-1, COLOR_BGRA2GRAY, -1, COLOR_BGRA2BGR, -1},
+            { -1, -1, -1, -1, -1 },
+            { -1, -1, -1, COLOR_GRAY2BGR, COLOR_GRAY2BGRA },
+            { -1, -1, -1, -1, -1 },
+            { -1, COLOR_BGR2GRAY, -1, -1, COLOR_BGR2BGRA },
+            { -1, COLOR_BGRA2GRAY, -1, COLOR_BGRA2BGR, -1 }
         };
 
-        const int code = codes[src.channels()][cn];
-        CV_DbgAssert( code >= 0 );
+        const int code = codes[scn][cn];
+        CV_Assert( code >= 0 );
 
         switch (src.kind())
         {
@@ -202,6 +197,7 @@ namespace
             break;
         }
     }
+
     void convertToDepth(InputArray src, OutputArray dst, int depth)
     {
         CV_Assert( src.depth() <= CV_64F );
@@ -226,6 +222,11 @@ namespace
             src.getGpuMat().convertTo(dst.getGpuMatRef(), depth, scale);
             break;
 
+        case _InputArray::UMAT:
+        case _InputArray::UEXPR:
+            src.getUMat().convertTo(dst, depth, scale);
+            break;
+
         default:
             src.getMat().convertTo(dst, depth, scale);
             break;
@@ -258,7 +259,7 @@ Mat cv::superres::convertToType(const Mat& src, int type, Mat& buf0, Mat& buf1)
     return buf1;
 }
 
-GpuMat cv::superres::convertToType(const GpuMat& src, int type, GpuMat& buf0, GpuMat& buf1)
+UMat cv::superres::convertToType(const UMat& src, int type, UMat& buf0, UMat& buf1)
 {
     if (src.type() == type)
         return src;
@@ -282,49 +283,8 @@ GpuMat cv::superres::convertToType(const GpuMat& src, int type, GpuMat& buf0, Gp
     convertToDepth(buf0, buf1, depth);
     return buf1;
 }
-#ifdef HAVE_OPENCV_OCL
-namespace
-{
-    // TODO(pengx17): remove these overloaded functions until IntputArray fully supports oclMat
-    void convertToCn(const ocl::oclMat& src, ocl::oclMat& dst, int cn)
-    {
-        CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
-        CV_Assert( cn == 1 || cn == 3 || cn == 4 );
-
-        static const int codes[5][5] =
-        {
-            {-1, -1, -1, -1, -1},
-            {-1, -1, -1, COLOR_GRAY2BGR, COLOR_GRAY2BGRA},
-            {-1, -1, -1, -1, -1},
-            {-1, COLOR_BGR2GRAY, -1, -1, COLOR_BGR2BGRA},
-            {-1, COLOR_BGRA2GRAY, -1, COLOR_BGRA2BGR, -1},
-        };
-
-        const int code = codes[src.channels()][cn];
-        CV_DbgAssert( code >= 0 );
 
-        ocl::cvtColor(src, dst, code, cn);
-    }
-    void convertToDepth(const ocl::oclMat& src, ocl::oclMat& dst, int depth)
-    {
-        CV_Assert( src.depth() <= CV_64F );
-        CV_Assert( depth == CV_8U || depth == CV_32F );
-
-        static const double maxVals[] =
-        {
-            std::numeric_limits<uchar>::max(),
-            std::numeric_limits<schar>::max(),
-            std::numeric_limits<ushort>::max(),
-            std::numeric_limits<short>::max(),
-            std::numeric_limits<int>::max(),
-            1.0,
-            1.0,
-        };
-        const double scale = maxVals[depth] / maxVals[src.depth()];
-        src.convertTo(dst, depth, scale);
-    }
-}
-ocl::oclMat cv::superres::convertToType(const ocl::oclMat& src, int type, ocl::oclMat& buf0, ocl::oclMat& buf1)
+GpuMat cv::superres::convertToType(const GpuMat& src, int type, GpuMat& buf0, GpuMat& buf1)
 {
     if (src.type() == type)
         return src;
@@ -348,4 +308,3 @@ ocl::oclMat cv::superres::convertToType(const ocl::oclMat& src, int type, ocl::o
     convertToDepth(buf0, buf1, depth);
     return buf1;
 }
-#endif
diff --git a/modules/superres/src/input_array_utility.hpp b/modules/superres/src/input_array_utility.hpp
index 6f17da0..3a858fb 100644
--- a/modules/superres/src/input_array_utility.hpp
+++ b/modules/superres/src/input_array_utility.hpp
@@ -45,25 +45,20 @@
 
 #include "opencv2/core.hpp"
 #include "opencv2/core/cuda.hpp"
-#ifdef HAVE_OPENCV_OCL
-#include "opencv2/ocl.hpp"
-#endif
 
 namespace cv
 {
     namespace superres
     {
         CV_EXPORTS Mat arrGetMat(InputArray arr, Mat& buf);
+        CV_EXPORTS UMat arrGetUMat(InputArray arr, UMat& buf);
         CV_EXPORTS cuda::GpuMat arrGetGpuMat(InputArray arr, cuda::GpuMat& buf);
 
         CV_EXPORTS void arrCopy(InputArray src, OutputArray dst);
 
         CV_EXPORTS Mat convertToType(const Mat& src, int type, Mat& buf0, Mat& buf1);
+        CV_EXPORTS UMat convertToType(const UMat& src, int type, UMat& buf0, UMat& buf1);
         CV_EXPORTS cuda::GpuMat convertToType(const cuda::GpuMat& src, int type, cuda::GpuMat& buf0, cuda::GpuMat& buf1);
-
-#ifdef HAVE_OPENCV_OCL
-        CV_EXPORTS ocl::oclMat convertToType(const ocl::oclMat& src, int type, ocl::oclMat& buf0, ocl::oclMat& buf1);
-#endif
     }
 }
 
diff --git a/modules/superres/src/opencl/superres_btvl1.cl b/modules/superres/src/opencl/superres_btvl1.cl
index 3c0cff8..b0e11aa 100644
--- a/modules/superres/src/opencl/superres_btvl1.cl
+++ b/modules/superres/src/opencl/superres_btvl1.cl
@@ -43,160 +43,137 @@
 //
 //M*/
 
-__kernel void buildMotionMapsKernel(__global float* forwardMotionX,
-                                    __global float* forwardMotionY,
-                                    __global float* backwardMotionX,
-                                    __global float* backwardMotionY,
-                                    __global float* forwardMapX,
-                                    __global float* forwardMapY,
-                                    __global float* backwardMapX,
-                                    __global float* backwardMapY,
-                                    int forwardMotionX_row,
-                                    int forwardMotionX_col,
-                                    int forwardMotionX_step,
-                                    int forwardMotionY_step,
-                                    int backwardMotionX_step,
-                                    int backwardMotionY_step,
-                                    int forwardMapX_step,
-                                    int forwardMapY_step,
-                                    int backwardMapX_step,
-                                    int backwardMapY_step
-                                   )
+#ifndef cn
+#define cn 1
+#endif
+
+#define sz (int)sizeof(float)
+#define src_elem_at(_src, y, step, x) *(__global const float *)(_src + mad24(y, step, (x) * sz))
+#define dst_elem_at(_dst, y, step, x) *(__global float *)(_dst + mad24(y, step, (x) * sz))
+
+__kernel void buildMotionMaps(__global const uchar * forwardMotionPtr, int forwardMotion_step, int forwardMotion_offset,
+                              __global const uchar * backwardMotionPtr, int backwardMotion_step, int backwardMotion_offset,
+                              __global uchar * forwardMapPtr, int forwardMap_step, int forwardMap_offset, // output: written below
+                              __global uchar * backwardMapPtr, int backwardMap_step, int backwardMap_offset, // output: written below
+                              int rows, int cols)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < forwardMotionX_col && y < forwardMotionX_row)
+    if (x < cols && y < rows)
     {
-        float fx = forwardMotionX[y * forwardMotionX_step + x];
-        float fy = forwardMotionY[y * forwardMotionY_step + x];
+        int forwardMotion_index = mad24(forwardMotion_step, y, (int)sizeof(float2) * x + forwardMotion_offset);
+        int backwardMotion_index = mad24(backwardMotion_step, y, (int)sizeof(float2) * x + backwardMotion_offset);
+        int forwardMap_index = mad24(forwardMap_step, y, (int)sizeof(float2) * x + forwardMap_offset);
+        int backwardMap_index = mad24(backwardMap_step, y, (int)sizeof(float2) * x + backwardMap_offset);
 
-        float bx = backwardMotionX[y * backwardMotionX_step + x];
-        float by = backwardMotionY[y * backwardMotionY_step + x];
+        float2 forwardMotion = *(__global const float2 *)(forwardMotionPtr + forwardMotion_index);
+        float2 backwardMotion = *(__global const float2 *)(backwardMotionPtr + backwardMotion_index);
+        __global float2 * forwardMap = (__global float2 *)(forwardMapPtr + forwardMap_index);
+        __global float2 * backwardMap = (__global float2 *)(backwardMapPtr + backwardMap_index);
 
-        forwardMapX[y * forwardMapX_step + x] = x + bx;
-        forwardMapY[y * forwardMapY_step + x] = y + by;
+        float2 basePoint = (float2)(x, y);
 
-        backwardMapX[y * backwardMapX_step + x] = x + fx;
-        backwardMapY[y * backwardMapY_step + x] = y + fy;
+        forwardMap[0] = basePoint + backwardMotion;
+        backwardMap[0] = basePoint + forwardMotion;
     }
 }
 
-__kernel void upscaleKernel(__global float* src,
-                            __global float* dst,
-                            int src_step,
-                            int dst_step,
-                            int src_row,
-                            int src_col,
-                            int scale,
-                            int channels
-                           )
+__kernel void upscale(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int scale)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < src_col && y < src_row)
+    if (x < src_cols && y < src_rows)
     {
-        if(channels == 1)
-        {
-            dst[y * scale * dst_step + x * scale] = src[y * src_step + x];
-        }
-        else
-        {
-            vstore4(vload4(0, src + y * channels * src_step + 4 * x), 0, dst + y * channels * scale * dst_step + 4 * x * scale);
-        }
+        int src_index = mad24(y, src_step, sz * x * cn + src_offset);
+        int dst_index = mad24(y * scale, dst_step, sz * x * scale * cn + dst_offset);
+
+        __global const float * src = (__global const float *)(srcptr + src_index);
+        __global float * dst = (__global float *)(dstptr + dst_index);
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            dst[c] = src[c];
     }
 }
 
 
-float diffSign(float a, float b)
+inline float diffSign1(float a, float b)
 {
     return a > b ? 1.0f : a < b ? -1.0f : 0.0f;
 }
 
-float4 diffSign4(float4 a, float4 b)
+inline float3 diffSign3(float3 a, float3 b)
 {
-    float4 pos;
+    float3 pos;
     pos.x = a.x > b.x ? 1.0f : a.x < b.x ? -1.0f : 0.0f;
     pos.y = a.y > b.y ? 1.0f : a.y < b.y ? -1.0f : 0.0f;
     pos.z = a.z > b.z ? 1.0f : a.z < b.z ? -1.0f : 0.0f;
-    pos.w = 0.0f;
     return pos;
 }
 
-__kernel void diffSignKernel(__global float* src1,
-                             __global float* src2,
-                             __global float* dst,
-                             int src1_row,
-                             int src1_col,
-                             int dst_step,
-                             int src1_step,
-                             int src2_step)
+__kernel void diffSign(__global const uchar * src1, int src1_step, int src1_offset,
+                       __global const uchar * src2, int src2_step, int src2_offset,
+                       __global uchar * dst, int dst_step, int dst_offset, int rows, int cols)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < src1_col && y < src1_row)
-    {
-        dst[y * dst_step + x] = diffSign(src1[y * src1_step + x], src2[y * src2_step + x]);
-    }
+    if (x < cols && y < rows)
+        *(__global float *)(dst + mad24(y, dst_step, sz * x + dst_offset)) =
+            diffSign1(*(__global const float *)(src1 + mad24(y, src1_step, sz * x + src1_offset)),
+                      *(__global const float *)(src2 + mad24(y, src2_step, sz * x + src2_offset)));
 }
 
-__kernel void calcBtvRegularizationKernel(__global float* src,
-        __global float* dst,
-        int src_step,
-        int dst_step,
-        int src_row,
-        int src_col,
-        int ksize,
-        int channels,
-        __constant float* c_btvRegWeights
-                                         )
+__kernel void calcBtvRegularization(__global const uchar * src, int src_step, int src_offset,
+                                    __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                    int ksize, __constant float * c_btvRegWeights)
 {
     int x = get_global_id(0) + ksize;
     int y = get_global_id(1) + ksize;
 
-    if ((y < src_row - ksize) && (x < src_col - ksize))
+    if (y < dst_rows - ksize && x < dst_cols - ksize)
     {
-        if(channels == 1)
-        {
-            const float srcVal = src[y * src_step + x];
-            float dstVal = 0.0f;
+        src += src_offset; dst += dst_offset; // apply both byte offsets exactly once, for every cn branch
 
-            for (int m = 0, count = 0; m <= ksize; ++m)
+#if cn == 1
+        const float srcVal = src_elem_at(src, y, src_step, x);
+        float dstVal = 0.0f;
+
+        for (int m = 0, count = 0; m <= ksize; ++m)
+            for (int l = ksize; l + m >= 0; --l, ++count)
             {
-                for (int l = ksize; l + m >= 0; --l, ++count)
-                {
-                    dstVal = dstVal + c_btvRegWeights[count] * (diffSign(srcVal, src[(y + m) * src_step + (x + l)]) - diffSign(src[(y - m) * src_step + (x - l)], srcVal));
-                }
+                dstVal += c_btvRegWeights[count] * (diffSign1(srcVal, src_elem_at(src, y + m, src_step, x + l))
+                    - diffSign1(src_elem_at(src, y - m, src_step, x - l), srcVal));
             }
-            dst[y * dst_step + x] = dstVal;
-        }
-        else
-        {
-            float4 srcVal = vload4(0, src + y * src_step + 4 * x);
-            float4 dstVal = 0.f;
 
-            for (int m = 0, count = 0; m <= ksize; ++m)
+        dst_elem_at(dst, y, dst_step, x) = dstVal;
+#elif cn == 3
+        __global const float * src0ptr = (__global const float *)(src + mad24(y, src_step, 3 * sz * x));
+        float3 srcVal = (float3)(src0ptr[0], src0ptr[1], src0ptr[2]), dstVal = 0.f;
+
+        for (int m = 0, count = 0; m <= ksize; ++m)
+        {
+            for (int l = ksize; l + m >= 0; --l, ++count)
             {
-                for (int l = ksize; l + m >= 0; --l, ++count)
-                {
-                    float4 src1;
-                    src1.x = src[(y + m) * src_step + 4 * (x + l) + 0];
-                    src1.y = src[(y + m) * src_step + 4 * (x + l) + 1];
-                    src1.z = src[(y + m) * src_step + 4 * (x + l) + 2];
-                    src1.w = src[(y + m) * src_step + 4 * (x + l) + 3];
-
-                    float4 src2;
-                    src2.x = src[(y - m) * src_step + 4 * (x - l) + 0];
-                    src2.y = src[(y - m) * src_step + 4 * (x - l) + 1];
-                    src2.z = src[(y - m) * src_step + 4 * (x - l) + 2];
-                    src2.w = src[(y - m) * src_step + 4 * (x - l) + 3];
-
-                    dstVal = dstVal + c_btvRegWeights[count] * (diffSign4(srcVal, src1) - diffSign4(src2, srcVal));
-                }
+                __global const float * src1ptr = (__global const float *)(src + mad24(y + m, src_step, 3 * sz * (x + l)));
+                __global const float * src2ptr = (__global const float *)(src + mad24(y - m, src_step, 3 * sz * (x - l)));
+
+                float3 src1 = (float3)(src1ptr[0], src1ptr[1], src1ptr[2]);
+                float3 src2 = (float3)(src2ptr[0], src2ptr[1], src2ptr[2]);
+
+                dstVal += c_btvRegWeights[count] * (diffSign3(srcVal, src1) - diffSign3(src2, srcVal));
             }
-            vstore4(dstVal, 0, dst + y * dst_step + 4 * x);
         }
+
+        __global float * dstptr = (__global float *)(dst + mad24(y, dst_step, 3 * sz * x));
+        dstptr[0] = dstVal.x;
+        dstptr[1] = dstVal.y;
+        dstptr[2] = dstVal.z;
+#else
+#error "Number of channels should be either 1 or 3"
+#endif
     }
 }
diff --git a/modules/superres/src/optical_flow.cpp b/modules/superres/src/optical_flow.cpp
index 30c27c2..2f77cd7 100644
--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencv2/core/opencl/ocl_defs.hpp"
 
 using namespace cv;
 using namespace cv::cuda;
@@ -61,21 +62,66 @@ namespace
         void collectGarbage();
 
     protected:
-        virtual void impl(const Mat& input0, const Mat& input1, OutputArray dst) = 0;
+        virtual void impl(InputArray input0, InputArray input1, OutputArray dst) = 0;
 
     private:
+        bool ocl_calc(InputArray frame0, InputArray frame1, OutputArray flow1, OutputArray flow2);
+
         int work_type_;
+
+        // Mat
         Mat buf_[6];
         Mat flow_;
         Mat flows_[2];
+
+        // UMat
+        UMat ubuf_[6];
+        UMat uflow_;
+        std::vector<UMat> uflows_;
     };
 
-    CpuOpticalFlow::CpuOpticalFlow(int work_type) : work_type_(work_type)
+    CpuOpticalFlow::CpuOpticalFlow(int work_type) :
+        work_type_(work_type)
     {
     }
 
+    // UMat-based counterpart of calc(); always returns true once the flow has been produced.
+    // If _flow2 is not needed, impl() writes its result directly into _flow1. Otherwise the
+    // result is split into planes and the first two are copied to _flow1 and _flow2.
+    bool CpuOpticalFlow::ocl_calc(InputArray _frame0, InputArray _frame1, OutputArray _flow1, OutputArray _flow2)
+    {
+        UMat frame0 = arrGetUMat(_frame0, ubuf_[0]);
+        UMat frame1 = arrGetUMat(_frame1, ubuf_[1]);
+
+        CV_Assert( frame1.type() == frame0.type() );
+        CV_Assert( frame1.size() == frame0.size() );
+
+        UMat input0 = convertToType(frame0, work_type_, ubuf_[2], ubuf_[3]);
+        UMat input1 = convertToType(frame1, work_type_, ubuf_[4], ubuf_[5]);
+
+        if (!_flow2.needed())
+        {
+            // single output requested: no intermediate buffer or copy is required
+            impl(input0, input1, _flow1);
+            return true;
+        }
+
+        impl(input0, input1, uflow_);
+
+        // _flow2 is needed from here on (the other case returned above)
+        split(uflow_, uflows_);
+
+        arrCopy(uflows_[0], _flow1);
+        arrCopy(uflows_[1], _flow2);
+
+        return true;
+    }
+
     void CpuOpticalFlow::calc(InputArray _frame0, InputArray _frame1, OutputArray _flow1, OutputArray _flow2)
     {
+        CV_OCL_RUN(_flow1.isUMat() && (_flow2.isUMat() || !_flow2.needed()),
+                   ocl_calc(_frame0, _frame1, _flow1, _flow2))
+
         Mat frame0 = arrGetMat(_frame0, buf_[0]);
         Mat frame1 = arrGetMat(_frame1, buf_[1]);
 
@@ -94,9 +140,7 @@ namespace
         impl(input0, input1, flow_);
 
         if (!_flow2.needed())
-        {
             arrCopy(flow_, _flow1);
-        }
         else
         {
             split(flow_, flows_);
@@ -108,11 +152,19 @@ namespace
 
     void CpuOpticalFlow::collectGarbage()
     {
+        // Mat
         for (int i = 0; i < 6; ++i)
             buf_[i].release();
         flow_.release();
         flows_[0].release();
         flows_[1].release();
+
+        // UMat
+        for (int i = 0; i < 6; ++i)
+            ubuf_[i].release();
+        uflow_.release();
+        // uflows_ is a std::vector that is empty until split() fills it, so it must not be indexed here
+        uflows_.clear();
     }
 }
 
@@ -129,7 +181,7 @@ namespace
         Farneback();
 
     protected:
-        void impl(const Mat& input0, const Mat& input1, OutputArray dst);
+        void impl(InputArray input0, InputArray input1, OutputArray dst);
 
     private:
         double pyrScale_;
@@ -161,7 +213,7 @@ namespace
         flags_ = 0;
     }
 
-    void Farneback::impl(const Mat& input0, const Mat& input1, OutputArray dst)
+    void Farneback::impl(InputArray input0, InputArray input1, OutputArray dst)
     {
         calcOpticalFlowFarneback(input0, input1, (InputOutputArray)dst, pyrScale_,
                                  numLevels_, winSize_, numIters_,
@@ -187,7 +239,7 @@ namespace
         Simple();
 
     protected:
-        void impl(const Mat& input0, const Mat& input1, OutputArray dst);
+        void impl(InputArray input0, InputArray input1, OutputArray dst); // forwards to calcOpticalFlowSF
 
     private:
         int layers_;
@@ -237,11 +289,9 @@ namespace
         speedUpThr_ = 10;
     }
 
-    void Simple::impl(const Mat& _input0, const Mat& _input1, OutputArray dst)
+    void Simple::impl(InputArray _input0, InputArray _input1, OutputArray _dst)
     {
-        Mat input0 = _input0;
-        Mat input1 = _input1;
-        calcOpticalFlowSF(input0, input1, dst.getMatRef(),
+        calcOpticalFlowSF(_input0, _input1, _dst, // the proxy arrays are passed through unchanged
                           layers_,
                           averagingBlockSize_,
                           maxFlow_,
@@ -278,7 +328,7 @@ namespace
         void collectGarbage();
 
     protected:
-        void impl(const Mat& input0, const Mat& input1, OutputArray dst);
+        void impl(InputArray input0, InputArray input1, OutputArray dst); // runs the flow computation through alg_
 
     private:
         double tau_;
@@ -316,7 +366,7 @@ namespace
         useInitialFlow_ = alg_->getBool("useInitialFlow");
     }
 
-    void DualTVL1::impl(const Mat& input0, const Mat& input1, OutputArray dst)
+    void DualTVL1::impl(InputArray input0, InputArray input1, OutputArray dst) // begins by pushing tau_, lambda_ (and presumably the other parameters) into alg_
     {
         alg_->set("tau", tau_);
         alg_->set("lambda", lambda_);
@@ -720,269 +770,3 @@ Ptr<DenseOpticalFlowExt> cv::superres::createOptFlow_DualTVL1_CUDA()
 }
 
 #endif // HAVE_OPENCV_CUDAOPTFLOW
-#ifdef HAVE_OPENCV_OCL
-
-namespace
-{
-    class oclOpticalFlow : public DenseOpticalFlowExt
-    {
-    public:
-        explicit oclOpticalFlow(int work_type);
-
-        void calc(InputArray frame0, InputArray frame1, OutputArray flow1, OutputArray flow2);
-        void collectGarbage();
-
-    protected:
-        virtual void impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2) = 0;
-
-    private:
-        int work_type_;
-        cv::ocl::oclMat buf_[6];
-        cv::ocl::oclMat u_, v_, flow_;
-    };
-
-    oclOpticalFlow::oclOpticalFlow(int work_type) : work_type_(work_type)
-    {
-    }
-
-    void oclOpticalFlow::calc(InputArray frame0, InputArray frame1, OutputArray flow1, OutputArray flow2)
-    {
-        ocl::oclMat& _frame0 = ocl::getOclMatRef(frame0);
-        ocl::oclMat& _frame1 = ocl::getOclMatRef(frame1);
-        ocl::oclMat& _flow1  = ocl::getOclMatRef(flow1);
-        ocl::oclMat& _flow2  = ocl::getOclMatRef(flow2);
-
-        CV_Assert( _frame1.type() == _frame0.type() );
-        CV_Assert( _frame1.size() == _frame0.size() );
-
-        cv::ocl::oclMat input0_ = convertToType(_frame0, work_type_, buf_[2], buf_[3]);
-        cv::ocl::oclMat input1_ = convertToType(_frame1, work_type_, buf_[4], buf_[5]);
-
-        impl(input0_, input1_, u_, v_);//go to tvl1 algorithm
-
-        u_.copyTo(_flow1);
-        v_.copyTo(_flow2);
-    }
-
-    void oclOpticalFlow::collectGarbage()
-    {
-        for (int i = 0; i < 6; ++i)
-            buf_[i].release();
-        u_.release();
-        v_.release();
-        flow_.release();
-    }
-}
-///////////////////////////////////////////////////////////////////
-// PyrLK_OCL
-
-namespace
-{
-    class PyrLK_OCL : public oclOpticalFlow
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        PyrLK_OCL();
-
-        void collectGarbage();
-
-    protected:
-        void impl(const ocl::oclMat& input0, const ocl::oclMat& input1, ocl::oclMat& dst1, ocl::oclMat& dst2);
-
-    private:
-        int winSize_;
-        int maxLevel_;
-        int iterations_;
-
-        ocl::PyrLKOpticalFlow alg_;
-    };
-
-    CV_INIT_ALGORITHM(PyrLK_OCL, "DenseOpticalFlowExt.PyrLK_OCL",
-        obj.info()->addParam(obj, "winSize", obj.winSize_);
-    obj.info()->addParam(obj, "maxLevel", obj.maxLevel_);
-    obj.info()->addParam(obj, "iterations", obj.iterations_))
-
-    PyrLK_OCL::PyrLK_OCL() : oclOpticalFlow(CV_8UC1)
-    {
-        winSize_ = alg_.winSize.width;
-        maxLevel_ = alg_.maxLevel;
-        iterations_ = alg_.iters;
-    }
-
-    void PyrLK_OCL::impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2)
-    {
-        alg_.winSize.width = winSize_;
-        alg_.winSize.height = winSize_;
-        alg_.maxLevel = maxLevel_;
-        alg_.iters = iterations_;
-
-        alg_.dense(input0, input1, dst1, dst2);
-    }
-
-    void PyrLK_OCL::collectGarbage()
-    {
-        alg_.releaseMemory();
-        oclOpticalFlow::collectGarbage();
-    }
-}
-
-Ptr<DenseOpticalFlowExt> cv::superres::createOptFlow_PyrLK_OCL()
-{
-    return makePtr<PyrLK_OCL>();
-}
-
-///////////////////////////////////////////////////////////////////
-// DualTVL1_OCL
-
-namespace
-{
-    class DualTVL1_OCL : public oclOpticalFlow
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        DualTVL1_OCL();
-
-        void collectGarbage();
-
-    protected:
-        void impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2);
-
-    private:
-        double tau_;
-        double lambda_;
-        double theta_;
-        int nscales_;
-        int warps_;
-        double epsilon_;
-        int iterations_;
-        bool useInitialFlow_;
-
-        ocl::OpticalFlowDual_TVL1_OCL alg_;
-    };
-
-    CV_INIT_ALGORITHM(DualTVL1_OCL, "DenseOpticalFlowExt.DualTVL1_OCL",
-    obj.info()->addParam(obj, "tau", obj.tau_);
-    obj.info()->addParam(obj, "lambda", obj.lambda_);
-    obj.info()->addParam(obj, "theta", obj.theta_);
-    obj.info()->addParam(obj, "nscales", obj.nscales_);
-    obj.info()->addParam(obj, "warps", obj.warps_);
-    obj.info()->addParam(obj, "epsilon", obj.epsilon_);
-    obj.info()->addParam(obj, "iterations", obj.iterations_);
-    obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow_))
-
-    DualTVL1_OCL::DualTVL1_OCL() : oclOpticalFlow(CV_8UC1)
-    {
-        tau_ = alg_.tau;
-        lambda_ = alg_.lambda;
-        theta_ = alg_.theta;
-        nscales_ = alg_.nscales;
-        warps_ = alg_.warps;
-        epsilon_ = alg_.epsilon;
-        iterations_ = alg_.iterations;
-        useInitialFlow_ = alg_.useInitialFlow;
-    }
-
-    void DualTVL1_OCL::impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2)
-    {
-        alg_.tau = tau_;
-        alg_.lambda = lambda_;
-        alg_.theta = theta_;
-        alg_.nscales = nscales_;
-        alg_.warps = warps_;
-        alg_.epsilon = epsilon_;
-        alg_.iterations = iterations_;
-        alg_.useInitialFlow = useInitialFlow_;
-
-        alg_(input0, input1, dst1, dst2);
-
-    }
-
-    void DualTVL1_OCL::collectGarbage()
-    {
-        alg_.collectGarbage();
-        oclOpticalFlow::collectGarbage();
-    }
-}
-
-Ptr<DenseOpticalFlowExt> cv::superres::createOptFlow_DualTVL1_OCL()
-{
-    return makePtr<DualTVL1_OCL>();
-}
-
-///////////////////////////////////////////////////////////////////
-// FarneBack
-
-namespace
-{
-    class FarneBack_OCL : public oclOpticalFlow
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        FarneBack_OCL();
-
-        void collectGarbage();
-
-    protected:
-        void impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2);
-
-    private:
-        double pyrScale_;
-        int numLevels_;
-        int winSize_;
-        int numIters_;
-        int polyN_;
-        double polySigma_;
-        int flags_;
-
-        ocl::FarnebackOpticalFlow alg_;
-    };
-
-    CV_INIT_ALGORITHM(FarneBack_OCL, "DenseOpticalFlowExt.FarneBack_OCL",
-        obj.info()->addParam(obj, "pyrScale", obj.pyrScale_);
-    obj.info()->addParam(obj, "numLevels", obj.numLevels_);
-    obj.info()->addParam(obj, "winSize", obj.winSize_);
-    obj.info()->addParam(obj, "numIters", obj.numIters_);
-    obj.info()->addParam(obj, "polyN", obj.polyN_);
-    obj.info()->addParam(obj, "polySigma", obj.polySigma_);
-    obj.info()->addParam(obj, "flags", obj.flags_))
-
-    FarneBack_OCL::FarneBack_OCL() : oclOpticalFlow(CV_8UC1)
-    {
-        pyrScale_ = alg_.pyrScale;
-        numLevels_ = alg_.numLevels;
-        winSize_ = alg_.winSize;
-        numIters_ = alg_.numIters;
-        polyN_ = alg_.polyN;
-        polySigma_ = alg_.polySigma;
-        flags_ = alg_.flags;
-    }
-
-    void FarneBack_OCL::impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2)
-    {
-        alg_.pyrScale = pyrScale_;
-        alg_.numLevels = numLevels_;
-        alg_.winSize = winSize_;
-        alg_.numIters = numIters_;
-        alg_.polyN = polyN_;
-        alg_.polySigma = polySigma_;
-        alg_.flags = flags_;
-
-        alg_(input0, input1, dst1, dst2);
-    }
-
-    void FarneBack_OCL::collectGarbage()
-    {
-        alg_.releaseMemory();
-        oclOpticalFlow::collectGarbage();
-    }
-}
-
-Ptr<DenseOpticalFlowExt> cv::superres::createOptFlow_Farneback_OCL()
-{
-    return makePtr<FarneBack_OCL>();
-}
-
-#endif
diff --git a/modules/superres/src/precomp.hpp b/modules/superres/src/precomp.hpp
index 0681bfa..c3aeb66 100644
--- a/modules/superres/src/precomp.hpp
+++ b/modules/superres/src/precomp.hpp
@@ -82,10 +82,6 @@
 #  include "opencv2/cudacodec.hpp"
 #endif
 
-#ifdef HAVE_OPENCV_OCL
-    #include "opencv2/ocl/private/util.hpp"
-#endif
-
 #ifdef HAVE_OPENCV_HIGHGUI
     #include "opencv2/highgui.hpp"
 #endif
diff --git a/modules/superres/src/super_resolution.cpp b/modules/superres/src/super_resolution.cpp
index 031a59b..215416d 100644
--- a/modules/superres/src/super_resolution.cpp
+++ b/modules/superres/src/super_resolution.cpp
@@ -54,16 +54,20 @@ cv::superres::SuperResolution::SuperResolution()
 {
     frameSource_ = createFrameSource_Empty();
     firstCall_ = true;
+    isUmat_ = false; // no output array has been seen yet
 }
 
 void cv::superres::SuperResolution::setInput(const Ptr<FrameSource>& frameSource)
 {
     frameSource_ = frameSource;
     firstCall_ = true;
+    isUmat_ = false; // cleared here; nextFrame() sets it again from its output argument
 }
 
 void cv::superres::SuperResolution::nextFrame(OutputArray frame)
 {
+    isUmat_ = frame.isUMat(); // recorded before the first-call initImpl() below runs
+
     if (firstCall_)
     {
         initImpl(frameSource_);
@@ -77,6 +81,7 @@ void cv::superres::SuperResolution::reset()
 {
     frameSource_->reset();
     firstCall_ = true;
+    isUmat_ = false; // cleared together with firstCall_
 }
 
 void cv::superres::SuperResolution::collectGarbage()
diff --git a/modules/superres/test/test_superres.cpp b/modules/superres/test/test_superres.cpp
index 92d51fd..980c8ed 100644
--- a/modules/superres/test/test_superres.cpp
+++ b/modules/superres/test/test_superres.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
 
 class AllignedFrameSource : public cv::superres::FrameSource
 {
@@ -52,6 +53,7 @@ public:
 
 private:
     cv::Ptr<cv::superres::FrameSource> base_;
+
     cv::Mat origFrame_;
     int scale_;
 };
@@ -67,9 +69,7 @@ void AllignedFrameSource::nextFrame(cv::OutputArray frame)
     base_->nextFrame(origFrame_);
 
     if (origFrame_.rows % scale_ == 0 && origFrame_.cols % scale_ == 0)
-    {
         cv::superres::arrCopy(origFrame_, frame);
-    }
     else
     {
         cv::Rect ROI(0, 0, (origFrame_.cols / scale_) * scale_, (origFrame_.rows / scale_) * scale_);
@@ -92,6 +92,7 @@ public:
 
 private:
     cv::Ptr<cv::superres::FrameSource> base_;
+
     cv::Mat origFrame_;
     cv::Mat blurred_;
     cv::Mat deg_;
@@ -104,28 +105,25 @@ DegradeFrameSource::DegradeFrameSource(const cv::Ptr<cv::superres::FrameSource>&
     CV_Assert( base_ );
 }
 
-void addGaussNoise(cv::Mat& image, double sigma)
+static void addGaussNoise(cv::OutputArray _image, double sigma) // _image is read and then overwritten in place
 {
-    cv::Mat noise(image.size(), CV_32FC(image.channels()));
+    int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    cv::Mat noise(_image.size(), CV_32FC(cn)); // 32-bit float noise with the size and channel count of _image
     cvtest::TS::ptr()->get_rng().fill(noise, cv::RNG::NORMAL, 0.0, sigma);
 
-    cv::addWeighted(image, 1.0, noise, 1.0, 0.0, image, image.depth());
+    cv::addWeighted(_image, 1.0, noise, 1.0, 0.0, _image, depth); // _image = _image + noise, stored at the original depth
 }
 
-void addSpikeNoise(cv::Mat& image, int frequency)
+static void addSpikeNoise(cv::OutputArray _image, int frequency) // _image is modified in place
 {
-    cv::Mat_<uchar> mask(image.size(), 0);
+    cv::Mat_<uchar> mask(_image.size(), 0); // starts all zero; 255 marks the pixels to overwrite
 
     for (int y = 0; y < mask.rows; ++y)
-    {
         for (int x = 0; x < mask.cols; ++x)
-        {
             if (cvtest::TS::ptr()->get_rng().uniform(0, frequency) < 1)
                 mask(y, x) = 255;
-        }
-    }
 
-    image.setTo(cv::Scalar::all(255), mask);
+    _image.setTo(cv::Scalar::all(255), mask); // masked pixels become 255 in every channel
 }
 
 void DegradeFrameSource::nextFrame(cv::OutputArray frame)
@@ -146,7 +144,7 @@ void DegradeFrameSource::reset()
     base_->reset();
 }
 
-double MSSIM(const cv::Mat& i1, const cv::Mat& i2)
+double MSSIM(cv::InputArray _i1, cv::InputArray _i2)
 {
     const double C1 = 6.5025;
     const double C2 = 58.5225;
@@ -154,8 +152,8 @@ double MSSIM(const cv::Mat& i1, const cv::Mat& i2)
     const int depth = CV_32F;
 
     cv::Mat I1, I2;
-    i1.convertTo(I1, depth);
-    i2.convertTo(I2, depth);
+    _i1.getMat().convertTo(I1, depth); // the arithmetic below runs on CV_32F copies
+    _i2.getMat().convertTo(I2, depth);
 
     cv::Mat I2_2  = I2.mul(I2); // I2^2
     cv::Mat I1_2  = I1.mul(I1); // I1^2
@@ -201,7 +199,7 @@ double MSSIM(const cv::Mat& i1, const cv::Mat& i2)
     // mssim = average of ssim map
     cv::Scalar mssim = cv::mean(ssim_map);
 
-    if (i1.channels() == 1)
+    if (_i1.channels() == 1)
         return mssim[0];
 
-    return (mssim[0] + mssim[1] + mssim[3]) / 3;
+    return (mssim[0] + mssim[1] + mssim[2]) / 3; // mean of the first three channel scores
@@ -210,9 +208,11 @@ double MSSIM(const cv::Mat& i1, const cv::Mat& i2)
 class SuperResolution : public testing::Test
 {
 public:
+    template <typename T> // T: type of the super-resolved output frame
     void RunTest(cv::Ptr<cv::superres::SuperResolution> superRes);
 };
 
+template <typename T>
 void SuperResolution::RunTest(cv::Ptr<cv::superres::SuperResolution> superRes)
 {
     const std::string inputVideoName = cvtest::TS::ptr()->get_data_path() + "car.avi";
@@ -245,7 +245,8 @@ void SuperResolution::RunTest(cv::Ptr<cv::superres::SuperResolution> superRes)
     double srAvgMSSIM = 0.0;
     const int count = 10;
 
-    cv::Mat goldFrame, superResFrame;
+    cv::Mat goldFrame;
+    T superResFrame; // container kind is chosen by the test case (cv::Mat or cv::UMat)
     for (int i = 0; i < count; ++i)
     {
         goldSource->nextFrame(goldFrame);
@@ -266,24 +267,28 @@ void SuperResolution::RunTest(cv::Ptr<cv::superres::SuperResolution> superRes)
 
 TEST_F(SuperResolution, BTVL1)
 {
-    RunTest(cv::superres::createSuperResolution_BTVL1());
+    RunTest<cv::Mat>(cv::superres::createSuperResolution_BTVL1());
 }
 
 #if defined(HAVE_CUDA) && defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING) && defined(HAVE_OPENCV_CUDAFILTERS)
 
 TEST_F(SuperResolution, BTVL1_CUDA)
 {
-    RunTest(cv::superres::createSuperResolution_BTVL1_CUDA());
+    RunTest<cv::Mat>(cv::superres::createSuperResolution_BTVL1_CUDA());
 }
 
 #endif
 
-#if defined(HAVE_OPENCV_OCL) && defined(HAVE_OPENCL)
+#ifdef HAVE_OPENCL
 
-TEST_F(SuperResolution, BTVL1_OCL)
+namespace cvtest {
+namespace ocl {
+
+OCL_TEST_F(SuperResolution, BTVL1)
 {
-    if (cv::ocl::useOpenCL())
-        RunTest(cv::superres::createSuperResolution_BTVL1_OCL());
+    RunTest<cv::UMat>(cv::superres::createSuperResolution_BTVL1()); // same factory as the Mat test, UMat output frame
 }
 
+} } // namespace cvtest::ocl
+
 #endif
diff --git a/modules/ts/include/opencv2/ts/ocl_perf.hpp b/modules/ts/include/opencv2/ts/ocl_perf.hpp
index 37d2886..8a92faa 100644
--- a/modules/ts/include/opencv2/ts/ocl_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_perf.hpp
@@ -99,10 +99,14 @@ using std::tr1::tuple;
 #define OCL_TEST_CYCLE() \
     for (cvtest::ocl::perf::safeFinish(); startTimer(), next(); cvtest::ocl::perf::safeFinish(), stopTimer())
 
+// Same timed loop as OCL_TEST_CYCLE(), entered after calling declare.iterations(n).
+#define OCL_TEST_CYCLE_N(n) \
+    for (declare.iterations(n), cvtest::ocl::perf::safeFinish(); startTimer(), next(); cvtest::ocl::perf::safeFinish(), stopTimer())
+
 #define OCL_TEST_CYCLE_MULTIRUN(runsNum) \
     for (declare.runs(runsNum), cvtest::ocl::perf::safeFinish(); startTimer(), next(); cvtest::ocl::perf::safeFinish(), stopTimer()) \
         for (int r = 0; r < runsNum; cvtest::ocl::perf::safeFinish(), ++r)
 
 namespace perf {
 
 // Check for current device limitation