From: Roman Donchenko <roman.donchenko@itseez.com>
Date: Tue, 5 Nov 2013 12:38:23 +0000 (+0400)
Subject: Merge remote-tracking branch 'origin/2.4' into merge-2.4
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~3703^2~2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=9c83f6c4fbc4eeeaeb498425a2b28a5df0ca0d97;p=platform%2Fupstream%2Fopencv.git

Merge remote-tracking branch 'origin/2.4' into merge-2.4

Conflicts:
	cmake/OpenCVDetectCUDA.cmake
	modules/core/include/opencv2/core/version.hpp
	modules/cudacodec/src/ffmpeg_video_source.cpp
	modules/gpu/src/video_writer.cpp
	modules/highgui/test/test_ffmpeg.cpp
	modules/highgui/test/test_video_io.cpp
	modules/highgui/test/test_video_pos.cpp
	modules/ocl/include/opencv2/ocl/ocl.hpp
	modules/ocl/include/opencv2/ocl/private/util.hpp
	modules/ocl/src/arithm.cpp
	modules/ocl/src/blend.cpp
	modules/ocl/src/canny.cpp
	modules/ocl/src/cl_operations.cpp
	modules/ocl/src/filtering.cpp
	modules/ocl/src/haar.cpp
	modules/ocl/src/imgproc.cpp
	modules/ocl/src/kmeans.cpp
	modules/ocl/src/moments.cpp
	modules/ocl/src/safe_call.hpp
	modules/ocl/src/split_merge.cpp
	modules/ocl/test/test_moments.cpp
	samples/ocl/squares.cpp
---

9c83f6c4fbc4eeeaeb498425a2b28a5df0ca0d97
diff --cc CMakeLists.txt
index 73def95,3978aad..324d069
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -31,7 -28,21 +31,11 @@@ else(NOT CMAKE_TOOLCHAIN_FILE
    set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory")
  endif(NOT CMAKE_TOOLCHAIN_FILE)
  
 -# --------------------------------------------------------------
 -# Top level OpenCV project
 -# --------------------------------------------------------------
 -if(CMAKE_GENERATOR MATCHES Xcode AND XCODE_VERSION VERSION_GREATER 4.3)
 -  cmake_minimum_required(VERSION 2.8.8)
 -elseif(IOS)
 -  cmake_minimum_required(VERSION 2.8.0)
 -else()
 -  cmake_minimum_required(VERSION 2.6.3)
 -endif()
  
+ if(POLICY CMP0017)
+   cmake_policy(SET CMP0017 NEW)
+ endif()
+ 
  if(POLICY CMP0022)
    cmake_policy(SET CMP0022 OLD)
  endif()
diff --cc cmake/OpenCVDetectCUDA.cmake
index 173bee3,156d90e..87dc4d1
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@@ -8,8 -13,24 +8,24 @@@ if(CMAKE_COMPILER_IS_GNUCXX AND NOT APP
    return()
  endif()
  
+ set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+ 
+ foreach(var INCLUDE LIBRARY PROGRAM)
+   set(__old_frpm_${var} "${CMAKE_FIND_ROOT_PATH_MODE_${var}}")
+ endforeach()
+ 
+ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
+ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+ 
 -find_package(CUDA 4.2 QUIET)
 +find_package(CUDA "${MIN_VER_CUDA}" QUIET)
  
+ foreach(var INCLUDE LIBRARY PROGRAM)
+   set(CMAKE_FIND_ROOT_PATH_MODE_${var} "${__old_frpm_${var}}")
+ endforeach()
+ 
+ list(REMOVE_AT CMAKE_MODULE_PATH 0)
+ 
  if(CUDA_FOUND)
    set(HAVE_CUDA 1)
  
@@@ -21,52 -42,8 +37,11 @@@
      set(HAVE_CUBLAS 1)
    endif()
  
-   if(${CUDA_VERSION} VERSION_LESS "5.5")
-     find_cuda_helper_libs(npp)
-   else()
-     # hack for CUDA 5.5
-     if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm")
-       unset(CUDA_TOOLKIT_INCLUDE CACHE)
-       unset(CUDA_CUDART_LIBRARY CACHE)
-       unset(CUDA_cublas_LIBRARY CACHE)
-       unset(CUDA_cufft_LIBRARY CACHE)
-       unset(CUDA_npp_LIBRARY CACHE)
- 
-       if(SOFTFP)
-         set(cuda_arm_path "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabi")
-       else()
-         set(cuda_arm_path "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
-       endif()
- 
-       set(CUDA_TOOLKIT_INCLUDE "${cuda_arm_path}/include" CACHE PATH "include path")
-       set(CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
- 
-       set(cuda_arm_library_path "${cuda_arm_path}/lib")
- 
-       set(CUDA_CUDART_LIBRARY "${cuda_arm_library_path}/libcudart.so" CACHE FILEPATH "cudart library")
-       set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
-       set(CUDA_cublas_LIBRARY "${cuda_arm_library_path}/libcublas.so" CACHE FILEPATH "cublas library")
-       set(CUDA_cufft_LIBRARY "${cuda_arm_library_path}/libcufft.so" CACHE FILEPATH "cufft library")
-       set(CUDA_nppc_LIBRARY "${cuda_arm_library_path}/libnppc.so" CACHE FILEPATH "nppc library")
-       set(CUDA_nppi_LIBRARY "${cuda_arm_library_path}/libnppi.so" CACHE FILEPATH "nppi library")
-       set(CUDA_npps_LIBRARY "${cuda_arm_library_path}/libnpps.so" CACHE FILEPATH "npps library")
-       set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}" CACHE STRING "npp library")
-     else()
-       unset(CUDA_npp_LIBRARY CACHE)
- 
-       find_cuda_helper_libs(nppc)
-       find_cuda_helper_libs(nppi)
-       find_cuda_helper_libs(npps)
- 
-       set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}" CACHE STRING "npp library")
-     endif()
-   endif()
- 
    if(WITH_NVCUVID)
      find_cuda_helper_libs(nvcuvid)
 +    if(WIN32)
 +      find_cuda_helper_libs(nvcuvenc)
 +    endif()
      set(HAVE_NVCUVID 1)
    endif()
  
diff --cc modules/highgui/test/test_ffmpeg.cpp
index 01afa83,468fe77..85ee0be
--- a/modules/highgui/test/test_ffmpeg.cpp
+++ b/modules/highgui/test/test_ffmpeg.cpp
@@@ -84,64 -84,63 +84,63 @@@ public
  
          for (size_t j = 0; j < n; ++j)
          {
-         int tag = tags[j];
-         stringstream s;
-         s << tag;
+             int tag = tags[j];
+             stringstream s;
+             s << tag;
  
-         const string filename = "output_"+s.str()+".avi";
+             const string filename = "output_"+s.str()+".avi";
  
-         try
-         {
-             double fps = fps0;
-             Size frame_s = Size(img_c, img_r);
- 
-             if( tag == VideoWriter::fourcc('H', '2', '6', '1') )
-                 frame_s = Size(352, 288);
-             else if( tag == VideoWriter::fourcc('H', '2', '6', '3') )
-                 frame_s = Size(704, 576);
-             /*else if( tag == CV_FOURCC('M', 'J', 'P', 'G') ||
-                      tag == CV_FOURCC('j', 'p', 'e', 'g') )
-                 frame_s = Size(1920, 1080);*/
- 
-             if( tag == VideoWriter::fourcc('M', 'P', 'E', 'G') )
+             try
              {
-                 frame_s = Size(720, 576);
-                 fps = 25;
-             }
- 
-             VideoWriter writer(filename, tag, fps, frame_s);
+                 double fps = fps0;
+                 Size frame_s = Size(img_c, img_r);
+ 
 -                if( tag == CV_FOURCC('H', '2', '6', '1') )
++                if( tag == VideoWriter::fourcc('H', '2', '6', '1') )
+                     frame_s = Size(352, 288);
 -                else if( tag == CV_FOURCC('H', '2', '6', '3') )
++                else if( tag == VideoWriter::fourcc('H', '2', '6', '3') )
+                     frame_s = Size(704, 576);
+                 /*else if( tag == CV_FOURCC('M', 'J', 'P', 'G') ||
+                          tag == CV_FOURCC('j', 'p', 'e', 'g') )
+                     frame_s = Size(1920, 1080);*/
+ 
 -                if( tag == CV_FOURCC('M', 'P', 'E', 'G') )
++                if( tag == VideoWriter::fourcc('M', 'P', 'E', 'G') )
+                 {
+                     frame_s = Size(720, 576);
+                     fps = 25;
+                 }
  
-             if (writer.isOpened() == false)
-             {
-                 ts->printf(ts->LOG, "\n\nFile name: %s\n", filename.c_str());
-                 ts->printf(ts->LOG, "Codec id: %d   Codec tag: %c%c%c%c\n", j,
-                            tag & 255, (tag >> 8) & 255, (tag >> 16) & 255, (tag >> 24) & 255);
-                 ts->printf(ts->LOG, "Error: cannot create video file.");
-                 ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT);
-             }
-             else
-             {
-                 Mat img(frame_s, CV_8UC3, Scalar::all(0));
-                 const int coeff = cvRound(min(frame_s.width, frame_s.height)/(fps0 * time_sec));
+                 VideoWriter writer(filename, tag, fps, frame_s);
  
-                 for (int i = 0 ; i < static_cast<int>(fps * time_sec); i++ )
+                 if (writer.isOpened() == false)
                  {
-                     //circle(img, Point2i(img_c / 2, img_r / 2), min(img_r, img_c) / 2 * (i + 1), Scalar(255, 0, 0, 0), 2);
-                     rectangle(img, Point2i(coeff * i, coeff * i), Point2i(coeff * (i + 1), coeff * (i + 1)),
-                               Scalar::all(255 * (1.0 - static_cast<double>(i) / (fps * time_sec * 2) )), -1);
-                     writer << img;
+                     ts->printf(ts->LOG, "\n\nFile name: %s\n", filename.c_str());
+                     ts->printf(ts->LOG, "Codec id: %d   Codec tag: %c%c%c%c\n", j,
+                                tag & 255, (tag >> 8) & 255, (tag >> 16) & 255, (tag >> 24) & 255);
+                     ts->printf(ts->LOG, "Error: cannot create video file.");
+                     ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT);
+                 }
+                 else
+                 {
+                     Mat img(frame_s, CV_8UC3, Scalar::all(0));
+                     const int coeff = cvRound(min(frame_s.width, frame_s.height)/(fps0 * time_sec));
+ 
+                     for (int i = 0 ; i < static_cast<int>(fps * time_sec); i++ )
+                     {
+                         //circle(img, Point2i(img_c / 2, img_r / 2), min(img_r, img_c) / 2 * (i + 1), Scalar(255, 0, 0, 0), 2);
+                         rectangle(img, Point2i(coeff * i, coeff * i), Point2i(coeff * (i + 1), coeff * (i + 1)),
+                                   Scalar::all(255 * (1.0 - static_cast<double>(i) / (fps * time_sec * 2) )), -1);
+                         writer << img;
+                     }
+ 
+                     if (!created) created = true;
+                     else remove(filename.c_str());
                  }
- 
-                 if (!created) created = true;
-                 else remove(filename.c_str());
              }
-         }
-         catch(...)
-         {
-             ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT);
-         }
-         ts->set_failed_test_info(cvtest::TS::OK);
- 
+             catch(...)
+             {
+                 ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT);
+             }
+             ts->set_failed_test_info(cvtest::TS::OK);
          }
      }
  };
diff --cc modules/ocl/doc/image_filtering.rst
index bf46802,cbec29b..e020dc7
--- a/modules/ocl/doc/image_filtering.rst
+++ b/modules/ocl/doc/image_filtering.rst
@@@ -459,37 -453,12 +453,41 @@@ Returns voi
  
      :param scale: The optional scale factor for the computed Laplacian values (by default, no scaling is applied).
  
+     :param delta: Optional delta value that is added to the results prior to storing them in  ``dst`` . Supported value is 0 only.
+ 
+     :param bordertype: Pixel extrapolation method.
+ 
  The function calculates the Laplacian of the source image by adding up the second x and y derivatives calculated using the Sobel operator.
  
 +ocl::ConvolveBuf
 +----------------
 +.. ocv:struct:: ocl::ConvolveBuf
 +
 +Class providing a memory buffer for the :ocv:func:`ocl::convolve` function; it also allows adjusting some specific parameters. ::
 +
 +    struct CV_EXPORTS ConvolveBuf
 +    {
 +        Size result_size;
 +        Size block_size;
 +        Size user_block_size;
 +        Size dft_size;
 +        int spect_len;
 +
 +        oclMat image_spect, templ_spect, result_spect;
 +        oclMat image_block, templ_block, result_data;
 +
 +        void create(Size image_size, Size templ_size);
 +        static Size estimateBlockSize(Size result_size, Size templ_size);
 +    };
 +
 +You can use field `user_block_size` to set specific block size for :ocv:func:`ocl::convolve` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
 +
 +ocl::ConvolveBuf::create
 +------------------------
 +.. ocv:function:: ocl::ConvolveBuf::create(Size image_size, Size templ_size)
 +
 +Constructs a buffer for :ocv:func:`ocl::convolve` function with respective arguments.
 +
  ocl::convolve
  ------------------
  Returns void
diff --cc modules/ocl/include/opencv2/ocl.hpp
index 3f0fb29,0000000..b8c26b2
mode 100644,000000..100644
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@@ -1,2070 -1,0 +1,2076 @@@
 +/*M///////////////////////////////////////////////////////////////////////////////////////
 +//
 +//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 +//
 +//  By downloading, copying, installing or using the software you agree to this license.
 +//  If you do not agree to this license, do not download, install,
 +//  copy or use the software.
 +//
 +//
 +//                           License Agreement
 +//                For Open Source Computer Vision Library
 +//
 +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 +// Third party copyrights are property of their respective owners.
 +//
 +// Redistribution and use in source and binary forms, with or without modification,
 +// are permitted provided that the following conditions are met:
 +//
 +//   * Redistribution's of source code must retain the above copyright notice,
 +//     this list of conditions and the following disclaimer.
 +//
 +//   * Redistribution's in binary form must reproduce the above copyright notice,
 +//     this list of conditions and the following disclaimer in the documentation
- //     and/or other oclMaterials provided with the distribution.
++//     and/or other materials provided with the distribution.
 +//
 +//   * The name of the copyright holders may not be used to endorse or promote products
 +//     derived from this software without specific prior written permission.
 +//
 +// This software is provided by the copyright holders and contributors "as is" and
 +// any express or implied warranties, including, but not limited to, the implied
 +// warranties of merchantability and fitness for a particular purpose are disclaimed.
 +// In no event shall the Intel Corporation or contributors be liable for any direct,
 +// indirect, incidental, special, exemplary, or consequential damages
 +// (including, but not limited to, procurement of substitute goods or services;
 +// loss of use, data, or profits; or business interruption) however caused
 +// and on any theory of liability, whether in contract, strict liability,
 +// or tort (including negligence or otherwise) arising in any way out of
 +// the use of this software, even if advised of the possibility of such damage.
 +//
 +//M*/
 +
 +#ifndef __OPENCV_OCL_HPP__
 +#define __OPENCV_OCL_HPP__
 +
 +#include <memory>
 +#include <vector>
 +
 +#include "opencv2/core.hpp"
 +#include "opencv2/imgproc.hpp"
 +#include "opencv2/objdetect.hpp"
 +#include "opencv2/ml.hpp"
 +
 +namespace cv
 +{
 +    namespace ocl
 +    {
 +        enum DeviceType
 +        {
 +            CVCL_DEVICE_TYPE_DEFAULT     = (1 << 0),
 +            CVCL_DEVICE_TYPE_CPU         = (1 << 1),
 +            CVCL_DEVICE_TYPE_GPU         = (1 << 2),
 +            CVCL_DEVICE_TYPE_ACCELERATOR = (1 << 3),
 +            //CVCL_DEVICE_TYPE_CUSTOM      = (1 << 4)
 +            CVCL_DEVICE_TYPE_ALL         = 0xFFFFFFFF
 +        };
 +
 +        enum DevMemRW
 +        {
 +            DEVICE_MEM_R_W = 0,
 +            DEVICE_MEM_R_ONLY,
 +            DEVICE_MEM_W_ONLY
 +        };
 +
 +        enum DevMemType
 +        {
 +            DEVICE_MEM_DEFAULT = 0,
 +            DEVICE_MEM_AHP,         //alloc host pointer
 +            DEVICE_MEM_UHP,         //use host pointer
 +            DEVICE_MEM_CHP,         //copy host pointer
 +            DEVICE_MEM_PM           //persistent memory
 +        };
 +
 +        // these classes contain OpenCL runtime information
 +
 +        struct PlatformInfo;
 +
 +        struct DeviceInfo
 +        {
 +        public:
 +            int _id; // reserved, don't use it
 +
 +            DeviceType deviceType;
 +            std::string deviceProfile;
 +            std::string deviceVersion;
 +            std::string deviceName;
 +            std::string deviceVendor;
 +            int deviceVendorId;
 +            std::string deviceDriverVersion;
 +            std::string deviceExtensions;
 +
 +            size_t maxWorkGroupSize;
 +            std::vector<size_t> maxWorkItemSizes;
 +            int maxComputeUnits;
 +            size_t localMemorySize;
 +            size_t maxMemAllocSize;
 +
 +            int deviceVersionMajor;
 +            int deviceVersionMinor;
 +
 +            bool haveDoubleSupport;
 +            bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0
++            bool isIntelDevice;
 +
 +            std::string compilationExtraOptions;
 +
 +            const PlatformInfo* platform;
 +
 +            DeviceInfo();
 +        };
 +
 +        struct PlatformInfo
 +        {
 +            int _id; // reserved, don't use it
 +
 +            std::string platformProfile;
 +            std::string platformVersion;
 +            std::string platformName;
 +            std::string platformVendor;
 +            std::string platformExtensons;
 +
 +            int platformVersionMajor;
 +            int platformVersionMinor;
 +
 +            std::vector<const DeviceInfo*> devices;
 +
 +            PlatformInfo();
 +        };
 +
 +        //////////////////////////////// Initialization & Info ////////////////////////
 +        typedef std::vector<const PlatformInfo*> PlatformsInfo;
 +
 +        CV_EXPORTS int getOpenCLPlatforms(PlatformsInfo& platforms);
 +
 +        typedef std::vector<const DeviceInfo*> DevicesInfo;
 +
 +        CV_EXPORTS int getOpenCLDevices(DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU,
 +                const PlatformInfo* platform = NULL);
 +
 +        // set device you want to use
 +        CV_EXPORTS void setDevice(const DeviceInfo* info);
 +
 +        enum FEATURE_TYPE
 +        {
 +            FEATURE_CL_DOUBLE = 1,
 +            FEATURE_CL_UNIFIED_MEM,
-             FEATURE_CL_VER_1_2
++            FEATURE_CL_VER_1_2,
++            FEATURE_CL_INTEL_DEVICE
 +        };
 +
 +        // Represents OpenCL context, interface
 +        class CV_EXPORTS Context
 +        {
 +        protected:
 +            Context() { }
 +            ~Context() { }
 +        public:
 +            static Context *getContext();
 +
 +            bool supportsFeature(FEATURE_TYPE featureType) const;
 +            const DeviceInfo& getDeviceInfo() const;
 +
 +            const void* getOpenCLContextPtr() const;
 +            const void* getOpenCLCommandQueuePtr() const;
 +            const void* getOpenCLDeviceIDPtr() const;
 +        };
 +
 +        inline const void *getClContextPtr()
 +        {
 +            return Context::getContext()->getOpenCLContextPtr();
 +        }
 +
 +        inline const void *getClCommandQueuePtr()
 +        {
 +            return Context::getContext()->getOpenCLCommandQueuePtr();
 +        }
 +
 +        CV_EXPORTS bool supportsFeature(FEATURE_TYPE featureType);
 +
 +        CV_EXPORTS void finish();
 +
 +        enum BINARY_CACHE_MODE
 +        {
 +            CACHE_NONE    = 0,        // do not cache OpenCL binary
 +            CACHE_DEBUG   = 0x1 << 0, // cache OpenCL binary when built in debug mode
 +            CACHE_RELEASE = 0x1 << 1, // default behavior, only cache when built in release mode
 +            CACHE_ALL     = CACHE_DEBUG | CACHE_RELEASE, // cache opencl binary
 +        };
 +        //! Enable or disable OpenCL program binary caching onto local disk
 +        // After a program (*.cl files in opencl/ folder) is built at runtime, we allow the
 +        // compiled OpenCL program to be cached to the path automatically as "path/*.clb"
 +        // binary file, which will be reused when the OpenCV executable is started again.
 +        //
 +        // This feature is enabled by default.
 +        CV_EXPORTS void setBinaryDiskCache(int mode = CACHE_RELEASE, cv::String path = "./");
 +
 +        //! set where binary cache to be saved to
 +        CV_EXPORTS void setBinaryPath(const char *path);
 +
 +        struct ProgramSource
 +        {
 +            const char* name;
 +            const char* programStr;
 +            const char* programHash;
 +
 +            // Cache in memory by name (should be unique). Caching on disk disabled.
 +            inline ProgramSource(const char* _name, const char* _programStr)
 +                : name(_name), programStr(_programStr), programHash(NULL)
 +            {
 +            }
 +
 +            // Cache in memory by name (should be unique). Caching on disk uses programHash mark.
 +            inline ProgramSource(const char* _name, const char* _programStr, const char* _programHash)
 +                : name(_name), programStr(_programStr), programHash(_programHash)
 +            {
 +            }
 +        };
 +
 +        //! Calls OpenCL kernel. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
 +        //! Deprecated, will be replaced
 +        CV_EXPORTS void openCLExecuteKernelInterop(Context *clCxt,
 +                const cv::ocl::ProgramSource& source, String kernelName,
 +                size_t globalThreads[3], size_t localThreads[3],
 +                std::vector< std::pair<size_t, const void *> > &args,
 +                int channels, int depth, const char *build_options);
 +
 +        class CV_EXPORTS oclMatExpr;
 +        //////////////////////////////// oclMat ////////////////////////////////
 +        class CV_EXPORTS oclMat
 +        {
 +        public:
 +            //! default constructor
 +            oclMat();
 +            //! constructs oclMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
 +            oclMat(int rows, int cols, int type);
 +            oclMat(Size size, int type);
 +            //! constructs oclMatrix and fills it with the specified value _s.
 +            oclMat(int rows, int cols, int type, const Scalar &s);
 +            oclMat(Size size, int type, const Scalar &s);
 +            //! copy constructor
 +            oclMat(const oclMat &m);
 +
 +            //! constructor for oclMatrix headers pointing to user-allocated data
 +            oclMat(int rows, int cols, int type, void *data, size_t step = Mat::AUTO_STEP);
 +            oclMat(Size size, int type, void *data, size_t step = Mat::AUTO_STEP);
 +
 +            //! creates a matrix header for a part of the bigger matrix
 +            oclMat(const oclMat &m, const Range &rowRange, const Range &colRange);
 +            oclMat(const oclMat &m, const Rect &roi);
 +
 +            //! builds oclMat from Mat. Performs blocking upload to device.
 +            explicit oclMat (const Mat &m);
 +
 +            //! destructor - calls release()
 +            ~oclMat();
 +
 +            //! assignment operators
 +            oclMat &operator = (const oclMat &m);
 +            //! assignment operator. Performs blocking upload to device.
 +            oclMat &operator = (const Mat &m);
 +            oclMat &operator = (const oclMatExpr& expr);
 +
 +            //! performs blocking upload of data to oclMat.
 +            void upload(const cv::Mat &m);
 +
 +
 +            //! downloads data from device to host memory. Blocking calls.
 +            operator Mat() const;
 +            void download(cv::Mat &m) const;
 +
 +            //! convert to _InputArray
 +            operator _InputArray();
 +
 +            //! convert to _OutputArray
 +            operator _OutputArray();
 +
 +            //! returns a new oclMatrix header for the specified row
 +            oclMat row(int y) const;
 +            //! returns a new oclMatrix header for the specified column
 +            oclMat col(int x) const;
 +            //! ... for the specified row span
 +            oclMat rowRange(int startrow, int endrow) const;
 +            oclMat rowRange(const Range &r) const;
 +            //! ... for the specified column span
 +            oclMat colRange(int startcol, int endcol) const;
 +            oclMat colRange(const Range &r) const;
 +
 +            //! returns deep copy of the oclMatrix, i.e. the data is copied
 +            oclMat clone() const;
 +
 +            //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
 +            // It calls m.create(this->size(), this->type()).
 +            // It supports any data type
 +            void copyTo( oclMat &m, const oclMat &mask = oclMat()) const;
 +
 +            //! converts oclMatrix to another datatype with optional scaling. See cvConvertScale.
 +            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
 +            void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
 +
 +            void assignTo( oclMat &m, int type = -1 ) const;
 +
 +            //! sets every oclMatrix element to s
 +            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
 +            oclMat& operator = (const Scalar &s);
 +            //! sets some of the oclMatrix elements to s, according to the mask
 +            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
 +            oclMat& setTo(const Scalar &s, const oclMat &mask = oclMat());
 +            //! creates alternative oclMatrix header for the same data, with different
 +            // number of channels and/or different number of rows. see cvReshape.
 +            oclMat reshape(int cn, int rows = 0) const;
 +
 +            //! allocates new oclMatrix data unless the oclMatrix already has specified size and type.
 +            // previous data is unreferenced if needed.
 +            void create(int rows, int cols, int type);
 +            void create(Size size, int type);
 +
 +            //! allocates new oclMatrix with specified device memory type.
 +            void createEx(int rows, int cols, int type,
 +                          DevMemRW rw_type, DevMemType mem_type);
 +            void createEx(Size size, int type, DevMemRW rw_type,
 +                          DevMemType mem_type);
 +
 +            //! decreases reference counter;
 +            // deallocate the data when reference counter reaches 0.
 +            void release();
 +
 +            //! swaps with other smart pointer
 +            void swap(oclMat &mat);
 +
 +            //! locates oclMatrix header within a parent oclMatrix. See below
 +            void locateROI( Size &wholeSize, Point &ofs ) const;
 +            //! moves/resizes the current oclMatrix ROI inside the parent oclMatrix.
 +            oclMat& adjustROI( int dtop, int dbottom, int dleft, int dright );
 +            //! extracts a rectangular sub-oclMatrix
 +            // (this is a generalized form of row, rowRange etc.)
 +            oclMat operator()( Range rowRange, Range colRange ) const;
 +            oclMat operator()( const Rect &roi ) const;
 +
 +            oclMat& operator+=( const oclMat& m );
 +            oclMat& operator-=( const oclMat& m );
 +            oclMat& operator*=( const oclMat& m );
 +            oclMat& operator/=( const oclMat& m );
 +
 +            //! returns true if the oclMatrix data is continuous
 +            // (i.e. when there are no gaps between successive rows).
 +            // similar to CV_IS_MAT_CONT(cvMat->type)
 +            bool isContinuous() const;
 +            //! returns element size in bytes,
 +            // similar to CV_ELEM_SIZE(cvMat->type)
 +            size_t elemSize() const;
 +            //! returns the size of element channel in bytes.
 +            size_t elemSize1() const;
 +            //! returns element type, similar to CV_MAT_TYPE(cvMat->type)
 +            int type() const;
 +            //! returns element type, i.e. 8UC3 returns 8UC4 because in ocl
 +            //! 3 channels element actually use 4 channel space
 +            int ocltype() const;
 +            //! returns element depth, similar to CV_MAT_DEPTH(cvMat->type)
 +            int depth() const;
 +            //! returns the number of channels, similar to CV_MAT_CN(cvMat->type)
 +            int channels() const;
 +            //! returns the number of channels as used in ocl: 4 for a 3-channel element,
 +            //! because a 3-channel element actually uses 4-channel space
 +            int oclchannels() const;
 +            //! returns step/elemSize1()
 +            size_t step1() const;
 +            //! returns oclMatrix size:
 +            // width == number of columns, height == number of rows
 +            Size size() const;
 +            //! returns true if oclMatrix data is NULL
 +            bool empty() const;
 +
 +            //! returns pointer to y-th row
 +            uchar* ptr(int y = 0);
 +            const uchar *ptr(int y = 0) const;
 +
 +            //! template version of the above method
 +            template<typename _Tp> _Tp *ptr(int y = 0);
 +            template<typename _Tp> const _Tp *ptr(int y = 0) const;
 +
 +            //! matrix transposition
 +            oclMat t() const;
 +
 +            /*! includes several bit-fields:
 +              - the magic signature
 +              - continuity flag
 +              - depth
 +              - number of channels
 +              */
 +            int flags;
 +            //! the number of rows and columns
 +            int rows, cols;
 +            //! a distance between successive rows in bytes; includes the gap if any
 +            size_t step;
 +            //! pointer to the data(OCL memory object)
 +            uchar *data;
 +
 +            //! pointer to the reference counter;
 +            // when oclMatrix points to user-allocated data, the pointer is NULL
 +            int *refcount;
 +
 +            //! helper fields used in locateROI and adjustROI
 +            //datastart and dataend are not used in current version
 +            uchar *datastart;
 +            uchar *dataend;
 +
 +            //! OpenCL context associated with the oclMat object.
 +            Context *clCxt; // TODO clCtx
 +            //add offset for handle ROI, calculated in byte
 +            int offset;
 +            //add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used
 +            int wholerows;
 +            int wholecols;
 +        };
 +
 +        // convert InputArray/OutputArray to oclMat references
 +        CV_EXPORTS oclMat& getOclMatRef(InputArray src);
 +        CV_EXPORTS oclMat& getOclMatRef(OutputArray src);
 +
 +        ///////////////////// mat split and merge /////////////////////////////////
 +        //! Compose a multi-channel array from several single-channel arrays
 +        // Support all types
 +        CV_EXPORTS void merge(const oclMat *src, size_t n, oclMat &dst);
 +        CV_EXPORTS void merge(const std::vector<oclMat> &src, oclMat &dst);
 +
 +        //! Divides multi-channel array into several single-channel arrays
 +        // Support all types
 +        CV_EXPORTS void split(const oclMat &src, oclMat *dst);
 +        CV_EXPORTS void split(const oclMat &src, std::vector<oclMat> &dst);
 +
 +        ////////////////////////////// Arithmetics ///////////////////////////////////
 +
 +        //! adds one matrix to another with scale (dst = src1 * alpha + src2 * beta + gama)
 +        // supports all data types
 +        CV_EXPORTS void addWeighted(const oclMat &src1, double  alpha, const oclMat &src2, double beta, double gama, oclMat &dst);
 +
 +        //! adds one matrix to another (dst = src1 + src2)
 +        // supports all data types
 +        CV_EXPORTS void add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        //! adds scalar to a matrix (dst = src1 + s)
 +        // supports all data types
 +        CV_EXPORTS void add(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! subtracts one matrix from another (dst = src1 - src2)
 +        // supports all data types
 +        CV_EXPORTS void subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        //! subtracts scalar from a matrix (dst = src1 - s)
 +        // supports all data types
 +        CV_EXPORTS void subtract(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! computes element-wise product of the two arrays (dst = src1 * scale * src2)
 +        // supports all data types
 +        CV_EXPORTS void multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
 +        //! multiplies a matrix by a number (dst = scalar * src)
 +        // supports all data types
 +        CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
 +
 +        //! computes element-wise quotient of the two arrays (dst = src1 * scale / src2)
 +        // supports all data types
 +        CV_EXPORTS void divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
 +        //! computes element-wise quotient of the two arrays (dst = scale / src)
 +        // supports all data types
 +        CV_EXPORTS void divide(double scale, const oclMat &src1, oclMat &dst);
 +
 +        //! computes element-wise minimum of the two arrays (dst = min(src1, src2))
 +        // supports all data types
 +        CV_EXPORTS void min(const oclMat &src1, const oclMat &src2, oclMat &dst);
 +
 +        //! computes element-wise maximum of the two arrays (dst = max(src1, src2))
 +        // supports all data types
 +        CV_EXPORTS void max(const oclMat &src1, const oclMat &src2, oclMat &dst);
 +
 +        //! compares elements of two arrays (dst = src1 <cmpop> src2)
 +        // supports all data types
 +        CV_EXPORTS void compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop);
 +
 +        //! transposes the matrix
 +        // supports all data types
 +        CV_EXPORTS void transpose(const oclMat &src, oclMat &dst);
 +
 +        //! computes element-wise absolute values of an array (dst = abs(src))
 +        // supports all data types
 +        CV_EXPORTS void abs(const oclMat &src, oclMat &dst);
 +
 +        //! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
 +        // supports all data types
 +        CV_EXPORTS void absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst);
 +        //! computes element-wise absolute difference of array and scalar (dst = abs(src1 - s))
 +        // supports all data types
 +        CV_EXPORTS void absdiff(const oclMat &src1, const Scalar &s, oclMat &dst);
 +
 +        //! computes mean value and standard deviation of all or selected array elements
 +        // supports all data types
 +        CV_EXPORTS void meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev);
 +
 +        //! computes norm of array
 +        // supports NORM_INF, NORM_L1, NORM_L2
 +        // supports all data types
 +        CV_EXPORTS double norm(const oclMat &src1, int normType = NORM_L2);
 +
 +        //! computes norm of the difference between two arrays
 +        // supports NORM_INF, NORM_L1, NORM_L2
 +        // supports all data types
 +        CV_EXPORTS double norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2);
 +
 +        //! reverses the order of the rows, columns or both in a matrix
 +        // supports all types
 +        CV_EXPORTS void flip(const oclMat &src, oclMat &dst, int flipCode);
 +
 +        //! computes sum of array elements
 +        // support all types
 +        CV_EXPORTS Scalar sum(const oclMat &m);
 +        CV_EXPORTS Scalar absSum(const oclMat &m);
 +        CV_EXPORTS Scalar sqrSum(const oclMat &m);
 +
 +        //! finds global minimum and maximum array elements and returns their values
 +        // support all C1 types
 +        CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
 +
 +        //! finds global minimum and maximum array elements and returns their values with locations
 +        // support all C1 types
 +        CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,
 +                                  const oclMat &mask = oclMat());
 +
 +        //! counts non-zero array elements
 +        // support all types
 +        CV_EXPORTS int countNonZero(const oclMat &src);
 +
 +        //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
 +        // destination array will have the same depth as lut and the same number of channels as source
 +        // supports 8UC1 and 8UC4 only
 +        CV_EXPORTS void LUT(const oclMat &src, const oclMat &lut, oclMat &dst);
 +
 +        //! only 8UC1 and 256 bins are supported now
 +        CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist);
 +        //! only 8UC1 and 256 bins are supported now
 +        CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst);
 +
 +        //! only 8UC1 is supported now
 +        CV_EXPORTS Ptr<cv::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
 +
 +        //! bilateralFilter
 +        // supports 8UC1 8UC4
 +        CV_EXPORTS void bilateralFilter(const oclMat& src, oclMat& dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT);
 +
 +        //! Applies an adaptive bilateral filter to the input image
 +        //  This is not truly a bilateral filter. Instead of using user provided fixed parameters,
 +        //  the function calculates a constant at each window based on local standard deviation,
 +        //  and use this constant to do filtering.
 +        //  supports 8UC1, 8UC3
 +        CV_EXPORTS void adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT);
 +
 +        //! computes exponent of each matrix element (dst = e**src)
 +        // supports only CV_32FC1, CV_64FC1 type
 +        CV_EXPORTS void exp(const oclMat &src, oclMat &dst);
 +
 +        //! computes natural logarithm of absolute value of each matrix element: dst = log(abs(src))
 +        // supports only CV_32FC1, CV_64FC1 type
 +        CV_EXPORTS void log(const oclMat &src, oclMat &dst);
 +
 +        //! computes magnitude of each (x(i), y(i)) vector
 +        // supports only CV_32F, CV_64F type
 +        CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
 +
 +        //! computes angle (angle(i)) of each (x(i), y(i)) vector
 +        // supports only CV_32F, CV_64F type
 +        CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);
 +
 +        //! raises every element of the input array to the power p
 +        // support only CV_32F, CV_64F type
 +        CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);
 +
 +        //! converts Cartesian coordinates to polar
 +        // supports only CV_32F CV_64F type
 +        CV_EXPORTS void cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false);
 +
 +        //! converts polar coordinates to Cartesian
 +        // supports only CV_32F CV_64F type
 +        CV_EXPORTS void polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false);
 +
 +        //! performs per-element bit-wise inversion
 +        // supports all types
 +        CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
 +
 +        //! calculates per-element bit-wise disjunction of two arrays
 +        // supports all types
 +        CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! calculates per-element bit-wise conjunction of two arrays
 +        // supports all types
 +        CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! calculates per-element bit-wise "exclusive or" operation
 +        // supports all types
 +        CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        CV_EXPORTS void bitwise_xor(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! Logical operators
 +        CV_EXPORTS oclMat operator ~ (const oclMat &);
 +        CV_EXPORTS oclMat operator | (const oclMat &, const oclMat &);
 +        CV_EXPORTS oclMat operator & (const oclMat &, const oclMat &);
 +        CV_EXPORTS oclMat operator ^ (const oclMat &, const oclMat &);
 +
 +
 +        //! Mathematics operators
 +        CV_EXPORTS oclMatExpr operator + (const oclMat &src1, const oclMat &src2);
 +        CV_EXPORTS oclMatExpr operator - (const oclMat &src1, const oclMat &src2);
 +        CV_EXPORTS oclMatExpr operator * (const oclMat &src1, const oclMat &src2);
 +        CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);
 +
 +        struct CV_EXPORTS ConvolveBuf  // buffer object accepted by the convolve() overload that takes a ConvolveBuf&
 +        {
 +            Size result_size;  // presumably the size of the convolution output - confirm in create()
 +            Size block_size;
 +            Size user_block_size;  // presumably a caller-supplied block size hint - confirm in create()
 +            Size dft_size;
 +
 +            oclMat image_spect, templ_spect, result_spect;  // device matrices; names suggest spectra of image/template/result - TODO confirm
 +            oclMat image_block, templ_block, result_data;  // device matrices; names suggest per-block working storage - TODO confirm
 +
 +            void create(Size image_size, Size templ_size);  // declared only; takes the image and template sizes
 +            static Size estimateBlockSize(Size result_size, Size templ_size);  // static: callable without a ConvolveBuf instance
 +        };
 +
 +        //! computes convolution of two images, may use discrete Fourier transform
 +        // support only CV_32FC1 type
 +        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false);
 +        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf);
 +
 +        //! Performs a per-element multiplication of two Fourier spectrums.
 +        //! Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now.
 +        //! support only CV_32FC2 type
 +        CV_EXPORTS void mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int flags, float scale, bool conjB = false);
 +
 +        CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code, int dcn = 0);
 +
 +        //! initializes a scaled identity matrix
 +        CV_EXPORTS void setIdentity(oclMat& src, const Scalar & val = Scalar(1));
 +
 +        //////////////////////////////// Filter Engine ////////////////////////////////
 +
 +        /*!
 +          The Base Class for 1D or Row-wise Filters
 +
 +          This is the base class for linear or non-linear filters that process 1D data.
 +          In particular, such filters are used for the "horizontal" filtering parts in separable filters.
 +          */
 +        class CV_EXPORTS BaseRowFilter_GPU  // abstract interface: operator() is pure virtual
 +        {
 +        public:
 +            BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}  // copies the three arguments into the public fields
 +            virtual ~BaseRowFilter_GPU() {}  // virtual so derived filters can be destroyed through a base pointer
 +            virtual void operator()(const oclMat &src, oclMat &dst) = 0;  // must be implemented by each concrete row filter
 +            int ksize, anchor, bordertype;  // initialised by the constructor; public, so directly readable by callers
 +        };
 +
 +        /*!
 +          The Base Class for Column-wise Filters
 +
 +          This is the base class for linear or non-linear filters that process columns of 2D arrays.
 +          Such filters are used for the "vertical" filtering parts in separable filters.
 +          */
 +        class CV_EXPORTS BaseColumnFilter_GPU  // abstract interface: operator() is pure virtual
 +        {
 +        public:
 +            BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}  // copies the three arguments into the public fields
 +            virtual ~BaseColumnFilter_GPU() {}  // virtual so derived filters can be destroyed through a base pointer
 +            virtual void operator()(const oclMat &src, oclMat &dst) = 0;  // must be implemented by each concrete column filter
 +            int ksize, anchor, bordertype;  // initialised by the constructor; public, so directly readable by callers
 +        };
 +
 +        /*!
 +          The Base Class for Non-Separable 2D Filters.
 +
 +          This is the base class for linear or non-linear 2D filters.
 +          */
 +        class CV_EXPORTS BaseFilter_GPU  // abstract interface: operator() is pure virtual
 +        {
 +        public:
 +            BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_)  // note: borderType_ is taken by const reference, unlike the by-value ints of the row/column base classes
 +                : ksize(ksize_), anchor(anchor_), borderType(borderType_) {}  // copies the three arguments into the public fields
 +            virtual ~BaseFilter_GPU() {}  // virtual so derived filters can be destroyed through a base pointer
 +            virtual void operator()(const oclMat &src, oclMat &dst) = 0;  // must be implemented by each concrete 2D filter
 +            Size ksize;  // 2D kernel size (a Size here, where the row/column bases use a single int)
 +            Point anchor;  // 2D anchor (a Point here, where the row/column bases use a single int)
 +            int borderType;  // spelled borderType here but bordertype in the row/column base classes
 +        };
 +
 +        /*!
 +          The Base Class for Filter Engine.
 +
 +          The class can be used to apply an arbitrary filtering operation to an image.
 +          It contains all the necessary intermediate buffers.
 +          */
 +        class CV_EXPORTS FilterEngine_GPU  // abstract interface: apply() is pure virtual; the create*_GPU factories return it wrapped in Ptr<>
 +        {
 +        public:
 +            virtual ~FilterEngine_GPU() {}  // virtual so concrete engines can be destroyed through a base pointer
 +
 +            virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0;  // roi defaults to Rect(0, 0, -1, -1); presumably "whole image" - confirm in implementations
 +        };
 +
 +        //! returns the non-separable filter engine with the specified filter
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D);
 +
 +        //! returns the primitive row filter with the specified kernel
 +        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel,
 +                int anchor = -1, int bordertype = BORDER_DEFAULT);
 +
 +        //! returns the primitive column filter with the specified kernel
 +        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel,
 +                int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0);
 +
 +        //! returns the separable linear filter engine
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
 +                const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
 +
 +        //! returns the separable filter engine with the specified filters
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
 +                const Ptr<BaseColumnFilter_GPU> &columnFilter);
 +
 +        //! returns the Gaussian filter engine
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
 +
 +        //! returns filter engine for the generalized Sobel operator
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
 +
 +        //! applies Laplacian operator to the image
-         // supports only ksize = 1 and ksize = 3 8UC1 8UC4 32FC1 32FC4 data type
-         CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1);
++        // supports only ksize = 1 and ksize = 3
++        CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1,
++                double delta=0, int borderType=BORDER_DEFAULT);
 +
 +        //! returns 2D box filter
-         // supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
++        // dst type must be the same as source type
 +        CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType,
 +                const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! returns box filter engine
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size &ksize,
 +                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! returns 2D filter with the specified kernel
-         // supports CV_8UC1 and CV_8UC4 types
++        // supports: dst type must be the same as source type
 +        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
 +                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! returns the non-separable linear filter engine
++        // supports: dst type must be the same as source type
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel,
 +                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! smooths the image using the normalized box filter
-         // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
-         // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP
 +        CV_EXPORTS void boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize,
 +                                  Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! returns 2D morphological filter
 +        //! only MORPH_ERODE and MORPH_DILATE are supported
 +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
 +        // kernel must have CV_8UC1 type, one row and cols == ksize.width * ksize.height
 +        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize,
 +                Point anchor = Point(-1, -1));
 +
 +        //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat &kernel,
 +                const Point &anchor = Point(-1, -1), int iterations = 1);
 +
 +        //! a synonym for normalized box filter
-         // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
-         // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
 +        static inline void blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1),
 +                                int borderType = BORDER_CONSTANT)  // NOTE(review): boxFilter's declaration defaults to BORDER_DEFAULT instead - confirm the difference is intended
 +        {
 +            boxFilter(src, dst, -1, ksize, anchor, borderType);  // forwards all arguments to boxFilter with ddepth fixed to -1
 +        }
 +
 +        //! applies non-separable 2D linear filter to the image
-         //  Note, at the moment this function only works when anchor point is in the kernel center
-         //  and kernel size supported is either 3x3 or 5x5; otherwise the function will fail to output valid result
 +        CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel,
-                                  Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
++                                 Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT);
 +
 +        //! applies separable 2D linear filter to the image
 +        CV_EXPORTS void sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY,
 +                                    Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
 +
 +        //! applies generalized Sobel operator to the image
 +        // dst.type must equal src.type
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
 +        CV_EXPORTS void Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
 +
 +        //! applies the vertical or horizontal Scharr operator to the image
 +        // dst.type must equal src.type
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
 +        CV_EXPORTS void Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
 +
 +        //! smooths the image using Gaussian filter.
 +        // dst.type must equal src.type
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
 +        CV_EXPORTS void GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
 +
 +        //! erodes the image (applies the local minimum operator)
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        CV_EXPORTS void erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
 +
 +                               int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
 +
 +
 +        //! dilates the image (applies the local maximum operator)
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        CV_EXPORTS void dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
 +
 +                                int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
 +
 +
 +        //! applies an advanced morphological operation to the image
 +        CV_EXPORTS void morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
 +
 +                                      int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
 +
 +
 +        ////////////////////////////// Image processing //////////////////////////////
 +        //! Does mean shift filtering on GPU.
 +        CV_EXPORTS void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr,
 +                                           TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
 +
 +        //! Does mean shift procedure on GPU.
 +        CV_EXPORTS void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr,
 +                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
 +
 +        //! Does mean shift segmentation with elimination of small regions.
 +        CV_EXPORTS void meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize,
 +                                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
 +
 +        //! applies fixed threshold to the image.
 +        // supports CV_8UC1 and CV_32FC1 data type
 +        // supports threshold type: THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV
 +        CV_EXPORTS double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type = THRESH_TRUNC);
 +
 +        //! resizes the image
 +        // Supports INTER_NEAREST, INTER_LINEAR
 +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
 +        CV_EXPORTS void resize(const oclMat &src, oclMat &dst, Size dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
 +
 +        //! Applies a generic geometrical transformation to an image.
 +
 +        // Supports INTER_NEAREST, INTER_LINEAR.
 +        // Map1 supports CV_16SC2, CV_32FC2  types.
 +        // Src supports CV_8UC1, CV_8UC2, CV_8UC4.
 +        CV_EXPORTS void remap(const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int bordertype, const Scalar &value = Scalar());
 +
 +        //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
 +        // supports CV_8UC1, CV_8UC4, CV_32SC1 types
 +        CV_EXPORTS void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar());
 +
 +        //! Smoothes image using median filter
 +        // The source 1- or 4-channel image. m should be 3 or 5, the image depth should be CV_8U or CV_32F.
 +        CV_EXPORTS void medianFilter(const oclMat &src, oclMat &dst, int m);
 +
 +        //! warps the image using affine transformation
 +        // Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
 +        CV_EXPORTS void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
 +
 +        //! warps the image using perspective transformation
 +        // Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
 +        CV_EXPORTS void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
 +
 +        //! computes the integral image and integral for the squared image
 +        // sum will have CV_32S type, sqsum - CV_32F type
 +        // supports only CV_8UC1 source type
 +        CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum);
 +        CV_EXPORTS void integral(const oclMat &src, oclMat &sum);
 +        CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
 +        CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
 +            int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
 +        CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
 +        CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
 +            int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
 +
 +
 +        /////////////////////////////////// ML ///////////////////////////////////////////
 +
 +        //! Computes the closest center for each row in source and labels the row with that center's index
 +        // supports CV_32FC1/CV_32FC2/CV_32FC4 data type
-         CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers);
++        // supports NORM_L1 and NORM_L2 distType (NOTE(review): the default argument below is NORM_L2SQR - confirm which L2 variant is accepted)
++        // if indices is provided, only the indexed rows will be calculated and their results are in the same
++        // order of indices
++        CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType = NORM_L2SQR, const oclMat &indices = oclMat());
 +
 +        //!Does k-means procedure on GPU
 +        // supports CV_32FC1/CV_32FC2/CV_32FC4 data type
 +        CV_EXPORTS double kmeans(const oclMat &src, int K, oclMat &bestLabels,
 +                                     TermCriteria criteria, int attemps, int flags, oclMat &centers);
 +
 +
 +        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 +        ///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
 +        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 +        class CV_EXPORTS OclCascadeClassifier : public  cv::CascadeClassifier  // publicly derives from cv::CascadeClassifier and adds an oclMat detection entry point
 +        {
 +        public:
 +            void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,  // image is a non-const oclMat; detections are written to 'faces' (marked CV_OUT)
 +                                  double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
 +                                  Size minSize = Size(), Size maxSize = Size());  // minSize/maxSize default to empty Size(); presumably "no limit" - confirm in implementation
 +        };
 +
 +        /////////////////////////////// Pyramid /////////////////////////////////////
 +        CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
 +
 +        //! upsamples the source image and then smoothes it
 +        CV_EXPORTS void pyrUp(const oclMat &src, oclMat &dst);
 +
 +        //! performs linear blending of two images
 +        //! to avoid accuracy errors, the sum of weights shouldn't be very close to zero
 +        // supports only CV_8UC1 source type
 +        CV_EXPORTS void blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2, oclMat &result);
 +
 +        //! computes vertical sum, supports only CV_32FC1 images
 +        CV_EXPORTS void columnSum(const oclMat &src, oclMat &sum);
 +
 +        ///////////////////////////////////////// match_template /////////////////////////////////////////////////////////////
 +        struct CV_EXPORTS MatchTemplateBuf  // buffer object accepted by the matchTemplate() overload that takes a MatchTemplateBuf&
 +        {
 +            Size user_block_size;
 +            oclMat imagef, templf;  // presumably floating-point copies of image and template - TODO confirm
 +            std::vector<oclMat> images;
 +            std::vector<oclMat> image_sums;  // presumably integral images - confirm against the implementation
 +            std::vector<oclMat> image_sqsums;  // presumably squared integral images - confirm against the implementation
 +        };
 +
 +        //! computes the proximity map for the raster template and the image where the template is searched for
 +        // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
 +        // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
 +        CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method);
 +
 +        //! computes the proximity map for the raster template and the image where the template is searched for
 +        // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
 +        // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
 +        CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf);
 +
 +
 +
 +        ///////////////////////////////////////////// Canny /////////////////////////////////////////////
 +        struct CV_EXPORTS CannyBuf;
 +
 +        //! compute edges of the input image using Canny operator
 +        // Support CV_8UC1 only
 +        CV_EXPORTS void Canny(const oclMat &image, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
 +        CV_EXPORTS void Canny(const oclMat &image, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
 +        CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
 +        CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
 +
 +        struct CV_EXPORTS CannyBuf  // buffer object accepted by the Canny() overloads that take a CannyBuf&
 +        {
-             CannyBuf() : counter(NULL) {}
++            CannyBuf() : counter(1, 1, CV_32S) { }  // counter is now a 1x1 CV_32S oclMat (previously a void* initialised to NULL)
 +            ~CannyBuf()
 +            {
 +                release();  // destructor delegates to release()
 +            }
-             explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(NULL)
++            explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(1, 1, CV_32S)  // explicit: no implicit Size -> CannyBuf conversion
 +            {
 +                create(image_size, apperture_size);  // forwards both arguments to create()
 +            }
 +            CannyBuf(const oclMat &dx_, const oclMat &dy_);  // declared only; takes two existing oclMat arguments (named after dx/dy)
 +            void create(const Size &image_size, int apperture_size = 3);  // declared only; same defaults as the explicit constructor
 +            void release();  // declared only; invoked by the destructor
 +
 +            oclMat dx, dy;
 +            oclMat dx_buf, dy_buf;
 +            oclMat magBuf, mapBuf;
 +            oclMat trackBuf1, trackBuf2;
-             void *counter;
++            oclMat counter;  // replaces the former raw void* member
 +            Ptr<FilterEngine_GPU> filterDX, filterDY;  // smart pointers to filter engines; names suggest x/y derivative filters - TODO confirm
 +        };
 +
 +        ///////////////////////////////////////// Hough Transform /////////////////////////////////////////
 +        //! Buffers for the HoughCircles overload that takes a HoughCirclesBuf&
 +        struct HoughCirclesBuf
 +        {
 +            oclMat edges;
 +            oclMat accum;
 +            oclMat srcPoints;
 +            oclMat centers;
 +            CannyBuf cannyBuf;
 +        };
 +
 +        CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
 +        CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
 +        CV_EXPORTS void HoughCirclesDownload(const oclMat& d_circles, OutputArray h_circles);
 +
 +
 +        ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
 +        //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of a floating-point matrix.
 +        //! The dft_size parameter is the size of the DFT transform.
 +        //!
 +        //! For a complex-to-real transform it is assumed that the source matrix is packed in CLFFT's format.
 +        // Supported src types: CV_32FC1, CV_32FC2
 +        // Supported flags: DFT_INVERSE, DFT_REAL_OUTPUT, DFT_COMPLEX_OUTPUT, DFT_ROWS
 +        // dft_size is the size of the original input, which is used for the transformation from complex to real.
 +        // dft_size must be powers of 2, 3 and 5
 +        // The real-to-complex dft requires clAmdFft v1.8 or later
 +        // The real-to-complex dft output is not the same as that of the CPU version
 +        // Real-to-complex and complex-to-real transforms do not support DFT_ROWS
 +        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(), int flags = 0);
 +
 +        //! Implements the generalized matrix product algorithm GEMM from BLAS
 +        // This functionality requires the clAmdBlas library
 +        // Only type CV_32FC1 is supported
 +        // The flag GEMM_3_T is not supported
 +        CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha,
 +                             const oclMat &src3, double beta, oclMat &dst, int flags = 0);
 +
 +        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
 +
 +        struct CV_EXPORTS HOGDescriptor // HOG descriptor / object detector interface operating on oclMat images
 +
 +        {
 +
 +            enum { DEFAULT_WIN_SIGMA = -1 };
 +
 +            enum { DEFAULT_NLEVELS = 64 };
 +
 +            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
 +
 +
 +
 +            HOGDescriptor(Size win_size = Size(64, 128), Size block_size = Size(16, 16),
 +
 +                          Size block_stride = Size(8, 8), Size cell_size = Size(8, 8),
 +
 +                          int nbins = 9, double win_sigma = DEFAULT_WIN_SIGMA,
 +
 +                          double threshold_L2hys = 0.2, bool gamma_correction = true,
 +
 +                          int nlevels = DEFAULT_NLEVELS);
 +
 +
 +
 +            size_t getDescriptorSize() const;
 +
 +            size_t getBlockHistogramSize() const;
 +
 +
 +
 +            void setSVMDetector(const std::vector<float> &detector);
 +
 +
 +
 +            static std::vector<float> getDefaultPeopleDetector();
 +
 +            static std::vector<float> getPeopleDetector48x96();
 +
 +            static std::vector<float> getPeopleDetector64x128();
 +
 +
 +
 +            void detect(const oclMat &img, std::vector<Point> &found_locations,
 +
 +                        double hit_threshold = 0, Size win_stride = Size(),
 +
 +                        Size padding = Size());
 +
 +
 +
 +            void detectMultiScale(const oclMat &img, std::vector<Rect> &found_locations,
 +
 +                                  double hit_threshold = 0, Size win_stride = Size(),
 +
 +                                  Size padding = Size(), double scale0 = 1.05,
 +
 +                                  int group_threshold = 2);
 +
 +
 +
 +            void getDescriptors(const oclMat &img, Size win_stride,
 +
 +                                oclMat &descriptors,
 +
 +                                int descr_format = DESCR_FORMAT_COL_BY_COL);
 +
 +
 +            // Public configuration fields (same names as the constructor parameters)
 +
 +            Size win_size;
 +
 +            Size block_size;
 +
 +            Size block_stride;
 +
 +            Size cell_size;
 +
 +            int nbins;
 +
 +            double win_sigma;
 +
 +            double threshold_L2hys;
 +
 +            bool gamma_correction;
 +
 +            int nlevels;
 +
 +
 +
 +        protected:
 +
 +            // Initializes the buffers; this only needs to be done once in the case of multiscale detection
 +
 +            void init_buffer(const oclMat &img, Size win_stride);
 +
 +
 +
 +            void computeBlockHistograms(const oclMat &img);
 +
 +            void computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle);
 +
 +
 +
 +            double getWinSigma() const;
 +
 +            bool checkDetectorSize() const;
 +
 +
 +
 +            static int numPartsWithin(int size, int part_size, int stride);
 +
 +            static Size numPartsWithin(Size size, Size part_size, Size stride);
 +
 +
 +
 +            // Coefficients of the separating plane
 +
 +            float free_coef;
 +
 +            oclMat detector;
 +
 +
 +
 +            // Results of the last classification step
 +
 +            oclMat labels;
 +
 +            Mat labels_host;
 +
 +
 +
 +            // Results of the last histogram evaluation step
 +
 +            oclMat block_hists;
 +
 +
 +
 +            // Gradient computation results
 +
 +            oclMat grad, qangle;
 +
 +
 +
 +            // Scaled image
 +
 +            oclMat image_scale;
 +
 +
 +
 +            // Effective size of the input image (might be different from the original size after scaling)
 +
 +            Size effect_size;
 +
 +        };
 +
 +
 +        ////////////////////////feature2d_ocl/////////////////
 +        /****************************************************************************************\
 +        *                                      Distance                                          *
 +        \****************************************************************************************/
 +        template<typename T>
 +        struct CV_EXPORTS Accumulator // primary template: the accumulator type is the element type itself
 +        {
 +            typedef T Type;
 +        };
 +        template<> struct Accumulator<unsigned char> // this and the three specializations below map to float
 +        {
 +            typedef float Type;
 +        };
 +        template<> struct Accumulator<unsigned short>
 +        {
 +            typedef float Type;
 +        };
 +        template<> struct Accumulator<char>
 +        {
 +            typedef float Type;
 +        };
 +        template<> struct Accumulator<short>
 +        {
 +            typedef float Type;
 +        };
 +
 +        /*
 +         * Manhattan (city block) distance functor; operator() forwards to normL1<ValueType, ResultType>
 +         */
 +        template<class T>
 +        struct CV_EXPORTS L1
 +        {
 +            enum { normType = NORM_L1 };
 +            typedef T ValueType;
 +            typedef typename Accumulator<T>::Type ResultType; // accumulator type chosen by Accumulator<T>
 +
 +            ResultType operator()( const T *a, const T *b, int size ) const
 +            {
 +                return normL1<ValueType, ResultType>(a, b, size);
 +            }
 +        };
 +
 +        /*
 +         * Euclidean distance functor; operator() takes the square root of normL2Sqr<ValueType, ResultType>
 +         */
 +        template<class T>
 +        struct CV_EXPORTS L2
 +        {
 +            enum { normType = NORM_L2 };
 +            typedef T ValueType;
 +            typedef typename Accumulator<T>::Type ResultType; // accumulator type chosen by Accumulator<T>
 +
 +            ResultType operator()( const T *a, const T *b, int size ) const
 +            {
 +                return (ResultType)std::sqrt((double)normL2Sqr<ValueType, ResultType>(a, b, size)); // sqrt is taken in double, then cast back
 +            }
 +        };
 +
 +        /*
 +         * Hamming distance functor - counts the bit differences between two strings (useful for the BRIEF descriptor),
 +         * i.e. the bit count of A XOR'ed with B
 +         */
 +        struct CV_EXPORTS Hamming
 +        {
 +            enum { normType = NORM_HAMMING };
 +            typedef unsigned char ValueType;
 +            typedef int ResultType;
 +
 +            /** Counts the bits set in a ^ b by forwarding to normHamming(a, b, size)
 +             */
 +            ResultType operator()( const unsigned char *a, const unsigned char *b, int size ) const
 +            {
 +                return normHamming(a, b, size);
 +            }
 +        };
 +
 +        ////////////////////////////////// BruteForceMatcher //////////////////////////////////
 +
 +        class CV_EXPORTS BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            enum DistType {L1Dist = 0, L2Dist, HammingDist};
 +            explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist);
 +
 +            // Add descriptors to the train descriptor collection
 +            void add(const std::vector<oclMat> &descCollection);
 +
 +            // Get the train descriptor collection
 +            const std::vector<oclMat> &getTrainDescriptors() const;
 +
 +            // Clear the train descriptor collection
 +            void clear();
 +
 +            // Return true if there are no train descriptors in the collection
 +            bool empty() const;
 +
 +            // Return true if the matcher supports a mask in the match methods
 +            bool isMaskSupported() const;
 +
 +            // Find one best match for each query descriptor
 +            void matchSingle(const oclMat &query, const oclMat &train,
 +                             oclMat &trainIdx, oclMat &distance,
 +                             const oclMat &mask = oclMat());
 +
 +            // Download trainIdx and distance and convert them to a CPU vector of DMatch
 +            static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches);
 +            // Convert trainIdx and distance to a vector of DMatch
 +            static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector<DMatch> &matches);
 +
 +            // Find one best match for each query descriptor
 +            void match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask = oclMat());
 +
 +            // Make the GPU collection of trains and masks in a format suitable for the matchCollection function
 +            void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector<oclMat> &masks = std::vector<oclMat>());
 +
 +            // Find one best match from the train collection for each query descriptor
 +            void matchCollection(const oclMat &query, const oclMat &trainCollection,
 +                                 oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
 +                                 const oclMat &masks = oclMat());
 +
 +            // Download trainIdx, imgIdx and distance and convert them to a vector of DMatch
 +            static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches);
 +            // Convert trainIdx, imgIdx and distance to a vector of DMatch
 +            static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector<DMatch> &matches);
 +
 +            // Find one best match from the train collection for each query descriptor.
 +            void match(const oclMat &query, std::vector<DMatch> &matches, const std::vector<oclMat> &masks = std::vector<oclMat>());
 +
 +            // Find the k best matches for each query descriptor (in increasing order of distances)
 +            void knnMatchSingle(const oclMat &query, const oclMat &train,
 +                                oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k,
 +                                const oclMat &mask = oclMat());
 +
 +            // Download trainIdx and distance and convert them to a vector of DMatch.
 +            // compactResult is used when mask is not empty. If compactResult is false, the matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance,
 +                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +            // Convert trainIdx and distance to a vector of DMatch
 +            static void knnMatchConvert(const Mat &trainIdx, const Mat &distance,
 +                                        std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +
 +            // Find the k best matches for each query descriptor (in increasing order of distances).
 +            // compactResult is used when mask is not empty. If compactResult is false, the matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            void knnMatch(const oclMat &query, const oclMat &train,
 +                          std::vector< std::vector<DMatch> > &matches, int k, const oclMat &mask = oclMat(),
 +                          bool compactResult = false);
 +
 +            // Find the k best matches from the train collection for each query descriptor (in increasing order of distances)
 +            void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
 +                                     oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
 +                                     const oclMat &maskCollection = oclMat());
 +
 +            // Download trainIdx, imgIdx and distance and convert them to a vector of DMatch.
 +            // compactResult is used when mask is not empty. If compactResult is false, the matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance,
 +                                          std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +            // Convert trainIdx, imgIdx and distance to a vector of DMatch
 +            static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance,
 +                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +
 +            // Find the k best matches for each query descriptor (in increasing order of distances).
 +            // compactResult is used when mask is not empty. If compactResult is false, the matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            void knnMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, int k,
 +                          const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
 +
 +            // Find the best matches for each query descriptor which have a distance less than maxDistance.
 +            // nMatches.at<int>(0, queryIdx) will contain the match count for queryIdx.
 +            // Caution: nMatches can be greater than trainIdx.cols - this means that the matcher didn't find all matches,
 +            // because it didn't have enough memory.
 +            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
 +            // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
 +            // Matches are not sorted.
 +            void radiusMatchSingle(const oclMat &query, const oclMat &train,
 +                                   oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
 +                                   const oclMat &mask = oclMat());
 +
 +            // Download trainIdx, nMatches and distance and convert them to a vector of DMatch.
 +            // matches will be sorted in increasing order of distances.
 +            // compactResult is used when mask is not empty. If compactResult is false, the matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
 +                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +            // Convert trainIdx, nMatches and distance to a vector of DMatch.
 +            static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches,
 +                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +
 +            // Find the best matches for each query descriptor which have a distance less than maxDistance
 +            // (in increasing order of distances).
 +            void radiusMatch(const oclMat &query, const oclMat &train,
 +                             std::vector< std::vector<DMatch> > &matches, float maxDistance,
 +                             const oclMat &mask = oclMat(), bool compactResult = false);
 +
 +            // Find the best matches for each query descriptor which have a distance less than maxDistance.
 +            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
 +            // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
 +            // Matches are not sorted.
 +            void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
 +                                       const std::vector<oclMat> &masks = std::vector<oclMat>());
 +
 +            // Download trainIdx, imgIdx, nMatches and distance and convert them to a vector of DMatch.
 +            // matches will be sorted in increasing order of distances.
 +            // compactResult is used when mask is not empty. If compactResult is false, the matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches,
 +                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +            // Convert trainIdx, imgIdx, nMatches and distance to a vector of DMatch.
 +            static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches,
 +                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +
 +            // Find the best matches from the train collection for each query descriptor which have a distance less than
 +            // maxDistance (in increasing order of distances).
 +            void radiusMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, float maxDistance,
 +                             const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
 +
 +            DistType distType;
 +
 +        private:
 +            std::vector<oclMat> trainDescCollection;
 +        };
 +
 +        template <class Distance>
 +        class CV_EXPORTS BruteForceMatcher_OCL; // declared only; usable through the three specializations below
 +
 +        template <typename T>
 +        class CV_EXPORTS BruteForceMatcher_OCL< L1<T> > : public BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {}
 +            explicit BruteForceMatcher_OCL(L1<T> /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {} // the functor argument is ignored
 +        };
 +        template <typename T>
 +        class CV_EXPORTS BruteForceMatcher_OCL< L2<T> > : public BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {}
 +            explicit BruteForceMatcher_OCL(L2<T> /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {} // the functor argument is ignored
 +        };
 +        template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {}
 +            explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {} // the functor argument is ignored
 +        };
 +
 +        class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base
 +        {
 +        public: // norm mapping: NORM_L1 -> L1Dist, NORM_L2 -> L2Dist, any other value -> HammingDist
 +            explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {}
 +        };
 +
 +        class CV_EXPORTS GoodFeaturesToTrackDetector_OCL
 +        {
 +        public:
 +            explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
 +                int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
 +
 +            //! returns a single-row matrix of type CV_32FC2
 +            void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
 +            //! downloads points of type Point2f to a vector; the vector's previous content will be erased
 +            void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
 +
 +            int maxCorners;
 +            double qualityLevel;
 +            double minDistance;
 +
 +            int blockSize;
 +            bool useHarrisDetector;
 +            double harrisK;
 +            void releaseMemory() // releases all five private work matrices
 +            {
 +                Dx_.release();
 +                Dy_.release();
 +                eig_.release();
 +                minMaxbuf_.release();
 +                tmpCorners_.release();
 +            }
 +        private:
 +            oclMat Dx_;
 +            oclMat Dy_;
 +            oclMat eig_;
 +            oclMat minMaxbuf_;
 +            oclMat tmpCorners_;
 +        };
 +
 +        inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
 +            int blockSize_, bool useHarrisDetector_, double harrisK_)
 +        { // copies each argument into the public field of the same name (minus the trailing underscore)
 +            maxCorners = maxCorners_;
 +            qualityLevel = qualityLevel_;
 +            minDistance = minDistance_;
 +            blockSize = blockSize_;
 +            useHarrisDetector = useHarrisDetector_;
 +            harrisK = harrisK_;
 +        }
 +
 +        /////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
 +
 +        class CV_EXPORTS PyrLKOpticalFlow
 +        {
 +        public:
 +            PyrLKOpticalFlow() // assigns a default to every public parameter and sets isDeviceArch11_ to false
 +            {
 +                winSize = Size(21, 21);
 +                maxLevel = 3;
 +                iters = 30;
 +                derivLambda = 0.5;
 +                useInitialFlow = false;
 +                minEigThreshold = 1e-4f;
 +                getMinEigenVals = false;
 +                isDeviceArch11_ = false;
 +            }
 +
 +            void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts,
 +                        oclMat &status, oclMat *err = 0);
 +
 +            void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0);
 +
 +            Size winSize;
 +            int maxLevel;
 +            int iters;
 +            double derivLambda;
 +            bool useInitialFlow;
 +            float minEigThreshold;
 +            bool getMinEigenVals;
 +
 +            void releaseMemory() // releases the four dx/dy buffers and clears both pyramids; uPyr_ and vPyr_ are not touched
 +            {
 +                dx_calcBuf_.release();
 +                dy_calcBuf_.release();
 +
 +                prevPyr_.clear();
 +                nextPyr_.clear();
 +
 +                dx_buf_.release();
 +                dy_buf_.release();
 +            }
 +
 +        private:
 +            void calcSharrDeriv(const oclMat &src, oclMat &dx, oclMat &dy);
 +
 +            void buildImagePyramid(const oclMat &img0, std::vector<oclMat> &pyr, bool withBorder);
 +
 +            oclMat dx_calcBuf_;
 +            oclMat dy_calcBuf_;
 +
 +            std::vector<oclMat> prevPyr_;
 +            std::vector<oclMat> nextPyr_;
 +
 +            oclMat dx_buf_;
 +            oclMat dy_buf_;
 +
 +            oclMat uPyr_[2];
 +            oclMat vPyr_[2];
 +
 +            bool isDeviceArch11_;
 +        };
 +
 +        class CV_EXPORTS FarnebackOpticalFlow // declaration only: every member function is defined elsewhere
 +        {
 +        public:
 +            FarnebackOpticalFlow();
 +
 +            int numLevels;
 +            double pyrScale;
 +            bool fastPyramids;
 +            int winSize;
 +            int numIters;
 +            int polyN;
 +            double polySigma;
 +            int flags;
 +
 +            void operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy);
 +
 +            void releaseMemory();
 +
 +        private:
 +            void prepareGaussian(
 +                int n, double sigma, float *g, float *xg, float *xxg,
 +                double &ig11, double &ig03, double &ig33, double &ig55);
 +
 +            void setPolynomialExpansionConsts(int n, double sigma);
 +
 +            void updateFlow_boxFilter(
 +                const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat &flowy,
 +                oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices);
 +
 +            void updateFlow_gaussianBlur(
 +                const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat& flowy,
 +                oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices);
 +
 +            oclMat frames_[2];
 +            oclMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
 +            std::vector<oclMat> pyramid0_, pyramid1_;
 +        };
 +
 +        //////////////// build warping maps ////////////////////
 +        //! builds plane warping maps
 +        CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, const Mat &T, float scale, oclMat &map_x, oclMat &map_y);
 +        //! builds cylindrical warping maps
 +        CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y);
 +        //! builds spherical warping maps
 +        CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y);
 +        //! builds affine warping maps
 +        CV_EXPORTS void buildWarpAffineMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap);
 +
 +        //! builds perspective warping maps
 +        CV_EXPORTS void buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap);
 +
 +        ///////////////////////////////////// interpolate frames //////////////////////////////////////////////
 +        //! Interpolates frames (images) using the provided optical flow (displacement field).
 +        //! frame0   - frame 0 (32-bit floating point images, single channel)
 +        //! frame1   - frame 1 (the same type and size)
 +        //! fu       - forward horizontal displacement
 +        //! fv       - forward vertical displacement
 +        //! bu       - backward horizontal displacement
 +        //! bv       - backward vertical displacement
 +        //! pos      - new frame position
 +        //! newFrame - new frame
 +        //! buf      - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat:
 +        //!            occlusion masks            0, occlusion masks            1,
 +        //!            interpolated forward flow  0, interpolated forward flow  1,
 +        //!            interpolated backward flow 0, interpolated backward flow 1
 +        //!
 +        CV_EXPORTS void interpolateFrames(const oclMat &frame0, const oclMat &frame1,
 +                                          const oclMat &fu, const oclMat &fv,
 +                                          const oclMat &bu, const oclMat &bv,
 +                                          float pos, oclMat &newFrame, oclMat &buf);
 +
 +        //! computes moments of the rasterized shape or a vector of points
-         CV_EXPORTS Moments ocl_moments(InputArray _array, bool binaryImage);
++        //! contour should be a vector of points standing for the contour
++        CV_EXPORTS Moments ocl_moments(InputArray contour);
++        //! src should be a general image uploaded to the GPU.
++        //! the supported oclMat types are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1
++        //! to use the CV_64FC1 type, the GPU should support CV_64FC1
++        CV_EXPORTS Moments ocl_moments(oclMat& src, bool binary);
 +
 +        class CV_EXPORTS StereoBM_OCL
 +        {
 +        public:
 +            enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
 +
 +            enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
 +
 +            //! the default constructor
 +            StereoBM_OCL();
 +            //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be a multiple of 8.
 +            StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
 +
 +            //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
 +            //! Output disparity has CV_8U type.
 +            void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity);
 +
 +            //! A heuristic that tries to estimate
 +            // whether the current GPU will be faster than the CPU for this algorithm.
 +            // It queries the currently active device.
 +            static bool checkIfGpuCallReasonable();
 +
 +            int preset;
 +            int ndisp;
 +            int winSize;
 +
 +            // If avergeTexThreshold == 0 => post-processing is disabled
 +            // If avergeTexThreshold != 0 then disparity is set to 0 in each point (x,y) where for the left image
 +            // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
 +            // i.e. the input left image is low textured.
 +            float avergeTexThreshold;
 +        private:
 +            oclMat minSSD, leBuf, riBuf;
 +        };
 +
 +        class CV_EXPORTS StereoBeliefPropagation // declaration only: every member function is defined elsewhere
 +        {
 +        public:
 +            enum { DEFAULT_NDISP  = 64 };
 +            enum { DEFAULT_ITERS  = 5  };
 +            enum { DEFAULT_LEVELS = 5  };
 +            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels);
 +            explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
 +                                             int iters  = DEFAULT_ITERS,
 +                                             int levels = DEFAULT_LEVELS,
 +                                             int msg_type = CV_16S); // note: msg_type defaults to CV_16S here ...
 +            StereoBeliefPropagation(int ndisp, int iters, int levels,
 +                                    float max_data_term, float data_weight,
 +                                    float max_disc_term, float disc_single_jump,
 +                                    int msg_type = CV_32F); // ... but to CV_32F in this overload
 +            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
 +            void operator()(const oclMat &data, oclMat &disparity);
 +            int ndisp;
 +            int iters;
 +            int levels;
 +            float max_data_term;
 +            float data_weight;
 +            float max_disc_term;
 +            float disc_single_jump;
 +            int msg_type;
 +        private:
 +            oclMat u, d, l, r, u2, d2, l2, r2;
 +            std::vector<oclMat> datas;
 +            oclMat out;
 +        };
 +
 +        class CV_EXPORTS StereoConstantSpaceBP // stereo matcher functor; its tuning parameters are public fields
 +        {
 +        public:
 +            enum { DEFAULT_NDISP    = 128 }; // default for the ndisp constructor argument
 +            enum { DEFAULT_ITERS    = 8   }; // default for the iters constructor argument
 +            enum { DEFAULT_LEVELS   = 4   }; // default for the levels constructor argument
 +            enum { DEFAULT_NR_PLANE = 4   }; // default for the nr_plane constructor argument
 +            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane); // outputs go to the four reference arguments
 +            explicit StereoConstantSpaceBP(
 +                int ndisp    = DEFAULT_NDISP,
 +                int iters    = DEFAULT_ITERS,
 +                int levels   = DEFAULT_LEVELS,
 +                int nr_plane = DEFAULT_NR_PLANE,
 +                int msg_type = CV_32F);
 +            StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
 +                float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
 +                int min_disp_th = 0,
 +                int msg_type = CV_32F); // full constructor: also takes the cost-term parameters and min_disp_th
 +            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); // presumably computes disparity from a left/right image pair -- confirm
 +            int ndisp; // public parameters, named like the constructor arguments
 +            int iters;
 +            int levels;
 +            int nr_plane;
 +            float max_data_term;
 +            float data_weight;
 +            float max_disc_term;
 +            float disc_single_jump;
 +            int min_disp_th;
 +            int msg_type;
 +            bool use_local_init_data_cost; // has no counterpart in either constructor's parameter list
 +        private:
 +            oclMat u[2], d[2], l[2], r[2]; // internal oclMat state, kept in pairs
 +            oclMat disp_selected_pyr[2];
 +            oclMat data_cost;
 +            oclMat data_cost_selected;
 +            oclMat temp;
 +            oclMat out;
 +        };
 +
 +        // Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method.
 +        //
 +        // References:
 +        //   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
 +        //   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
 +        class CV_EXPORTS OpticalFlowDual_TVL1_OCL
 +        {
 +        public:
 +            OpticalFlowDual_TVL1_OCL();
 +
 +            void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy); // takes two images; results go to flowx/flowy
 +
 +            void collectGarbage(); // presumably releases the internal buffers declared below -- confirm
 +
 +            /**
 +            * Time step of the numerical scheme.
 +            */
 +            double tau;
 +
 +            /**
 +            * Weight parameter for the data term, attachment parameter.
 +            * This is the most relevant parameter, which determines the smoothness of the output.
 +            * The smaller this parameter is, the smoother the solutions we obtain.
 +            * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
 +            */
 +            double lambda;
 +
 +            /**
 +            * Weight parameter for (u - v)^2, tightness parameter.
 +            * It serves as a link between the attachment and the regularization terms.
 +            * In theory, it should have a small value in order to maintain both parts in correspondence.
 +            * The method is stable for a large range of values of this parameter.
 +            */
 +            double theta;
 +
 +            /**
 +            * Number of scales used to create the pyramid of images.
 +            */
 +            int nscales;
 +
 +            /**
 +            * Number of warpings per scale.
 +            * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
 +            * This is a parameter that assures the stability of the method.
 +            * It also affects the running time, so it is a compromise between speed and accuracy.
 +            */
 +            int warps;
 +
 +            /**
 +            * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
 +            * A small value will yield more accurate solutions at the expense of a slower convergence.
 +            */
 +            double epsilon;
 +
 +            /**
 +            * Stopping criterion: number of iterations used in the numerical scheme.
 +            */
 +            int iterations;
 +
 +            bool useInitialFlow; // presumably: when true, the incoming flowx/flowy are used as the starting estimate -- confirm
 +
 +        private:
 +            void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2); // helper working on a single pair of images (one scale, per its name)
 +
 +            std::vector<oclMat> I0s; // per-scale storage (presumably the image and flow pyramids -- confirm)
 +            std::vector<oclMat> I1s;
 +            std::vector<oclMat> u1s;
 +            std::vector<oclMat> u2s;
 +
 +            oclMat I1x_buf; // internal work buffers
 +            oclMat I1y_buf;
 +
 +            oclMat I1w_buf;
 +            oclMat I1wx_buf;
 +            oclMat I1wy_buf;
 +
 +            oclMat grad_buf;
 +            oclMat rho_c_buf;
 +
 +            oclMat p11_buf;
 +            oclMat p12_buf;
 +            oclMat p21_buf;
 +            oclMat p22_buf;
 +
 +            oclMat diff_buf;
 +            oclMat norm_buf;
 +        };
 +        // Currently supported sorting methods (values for the 'method' argument of sortByKey).
 +        enum
 +        {
 +            SORT_BITONIC,   // only supports power-of-2 buffer sizes
 +            SORT_SELECTION, // cannot sort duplicate keys
 +            SORT_MERGE,
 +            SORT_RADIX      // only supports signed int/float keys (CV_32S/CV_32F)
 +        };
 +        //! Sorts all the elements of the input according to their corresponding keys.
 +        //
 +        //  The element unit in the values to be sorted is determined from the data type,
 +        //  i.e., a CV_32FC2 input {a1a2, b1b2} will be considered as two elements, regardless of its
 +        //  matrix dimensions.
 +        //  Both keys and values are sorted in place.
 +        //  The keys must be a single-channel oclMat.
 +        //
 +        //  Example:
 +        //  input -
 +        //    keys   = {2,    3,   1}   (CV_8UC1)
 +        //    values = {10,5, 4,3, 6,2} (CV_8UC2)
 +        //  sortByKey(keys, values, SORT_SELECTION, false);
 +        //  output -
 +        //    keys   = {1,    2,   3}   (CV_8UC1)
 +        //    values = {6,2, 10,5, 4,3} (CV_8UC2)
 +        CV_EXPORTS void sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false);
 +        /*! Abstract base class for the MOG and MOG2 background subtractors !*/
 +        class CV_EXPORTS BackgroundSubtractor
 +        {
 +        public:
 +            //! the virtual destructor
 +            virtual ~BackgroundSubtractor();
 +            //! the update operator: takes the next video frame and writes the current foreground mask (8-bit binary image) to fgmask.
 +            virtual void operator()(const oclMat& image, oclMat& fgmask, float learningRate);
 +
 +            //! computes a background image; pure virtual, so every concrete subclass must implement it
 +            virtual void getBackgroundImage(oclMat& backgroundImage) const = 0;
 +        };
 +                /*!
 +        Gaussian Mixture-based Background/Foreground Segmentation Algorithm
 +
 +        The class implements the following algorithm:
 +        "An improved adaptive background mixture model for real-time tracking with shadow detection"
 +        P. KadewTraKuPong and R. Bowden,
 +        Proc. 2nd European Workshop on Advanced Video-Based Surveillance Systems, 2001.
 +        http://personal.ee.surrey.ac.uk/Personal/R.Bowden/publications/avbs01/avbs01.pdf
 +        */
 +        class CV_EXPORTS MOG: public cv::ocl::BackgroundSubtractor
 +        {
 +        public:
 +            //! the default constructor (nmixtures = -1 presumably selects a built-in default -- confirm)
 +            MOG(int nmixtures = -1);
 +
 +            //! re-initialization method
 +            void initialize(Size frameSize, int frameType);
 +
 +            //! the update operator
 +            void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f);
 +
 +            //! computes a background image which is the mean of all background gaussians
 +            void getBackgroundImage(oclMat& backgroundImage) const;
 +
 +            //! releases all inner buffers
 +            void release();
 +
 +            int history; // public tuning parameters
 +            float varThreshold;
 +            float backgroundRatio;
 +            float noiseSigma;
 +
 +        private:
 +            int nmixtures_;
 +
 +            Size frameSize_;
 +            int frameType_;
 +            int nframes_;
 +
 +            oclMat weight_; // inner oclMat buffers
 +            oclMat sortKey_;
 +            oclMat mean_;
 +            oclMat var_;
 +        };
 +
 +        /*!
 +        The class implements the following algorithm:
 +        "Improved adaptive Gaussian mixture model for background subtraction"
 +        Z.Zivkovic
 +        International Conference Pattern Recognition, UK, August, 2004.
 +        http://www.zoranz.net/Publications/zivkovic2004ICPR.pdf
 +        */
 +        class CV_EXPORTS MOG2: public cv::ocl::BackgroundSubtractor
 +        {
 +        public:
 +            //! the default constructor
 +            MOG2(int nmixtures = -1);
 +
 +            //! re-initialization method
 +            void initialize(Size frameSize, int frameType);
 +
 +            //! the update operator
 +            void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = -1.0f);
 +
 +            //! computes a background image which is the mean of all background gaussians
 +            void getBackgroundImage(oclMat& backgroundImage) const;
 +
 +            //! releases all inner buffers
 +            void release();
 +
 +            // parameters (in this section each comment FOLLOWS the field it describes)
 +            // you should call initialize after changing any of them
 +
 +            int history;
 +
 +            //! NOTE(review): this comment presumably describes the mixture count (nmixtures_), not varThreshold:
 +            //! it is the maximum allowed number of mixture components; the actual number is determined dynamically per pixel
 +            float varThreshold;
 +            // varThreshold: threshold on the squared Mahalanobis distance to decide if it is well described
 +            // by the background model or not. Related to Cthr from the paper.
 +            // This does not influence the update of the background. A typical value could be 4 sigma
 +            // and that is varThreshold=4*4=16; Corresponds to Tb in the paper.
 +
 +            /////////////////////////
 +            // less important parameters - things you might change but be careful
 +            ////////////////////////
 +
 +            float backgroundRatio;
 +            // backgroundRatio: corresponds to fTB=1-cf from the paper
 +            // TB - threshold when the component becomes significant enough to be included into
 +            // the background model. It is the TB=1-cf from the paper. So I use cf=0.1 => TB=0.9
 +            // For alpha=0.001 it means that the mode should exist for approximately 105 frames before
 +            // it is considered foreground
 +            // float noiseSigma;
 +            float varThresholdGen;
 +
 +            // varThresholdGen: corresponds to Tg - threshold on the squared Mahalan. dist. to decide
 +            //when a sample is close to the existing components. If it is not close
 +            //to any a new component will be generated. I use 3 sigma => Tg=3*3=9.
 +            //Smaller Tg leads to more generated components and higher Tg might
 +            //lead to small number of components but they can grow too large
 +            float fVarInit;
 +            float fVarMin;
 +            float fVarMax;
 +
 +            // fVarInit: initial variance for the newly generated components.
 +            //It will influence the speed of adaptation. A good guess should be made.
 +            //A simple way is to estimate the typical standard deviation from the images.
 +            //I used here 10 as a reasonable value
 +            // fVarMin and fVarMax can be used to further control the variance
 +            float fCT; //CT - complexity reduction prior
 +            //this is related to the number of samples needed to accept that a component
 +            //actually exists. We use CT=0.05 of all the samples. By setting CT=0 you get
 +            //the standard Stauffer&Grimson algorithm (maybe not exact but very similar)
 +
 +            //shadow detection parameters
 +            bool bShadowDetection; //default 1 - do shadow detection
 +            unsigned char nShadowDetection; //do shadow detection - insert this value as the detection result - 127 default value
 +            float fTau;
 +            // fTau: Tau - shadow threshold. The shadow is detected if the pixel is a darker
 +            //version of the background. Tau is a threshold on how much darker the shadow can be.
 +            //Tau= 0.5 means that if pixel is more than 2 times darker then it is not shadow
 +            //See: Prati,Mikic,Trivedi,Cucchiarra,"Detecting Moving Shadows...",IEEE PAMI,2003.
 +
 +        private:
 +            int nmixtures_;
 +
 +            Size frameSize_;
 +            int frameType_;
 +            int nframes_;
 +
 +            oclMat weight_;
 +            oclMat variance_;
 +            oclMat mean_;
 +
 +            oclMat bgmodelUsedModes_; //keep track of number of modes per pixel
 +        };
 +
 +        /*!***************Kalman Filter*************!*/
 +        class CV_EXPORTS KalmanFilter
 +        {
 +        public:
 +            KalmanFilter();
 +            //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector
 +            KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
 +            //! re-initializes Kalman filter. The previous content is destroyed.
 +            void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
 +
 +            const oclMat& predict(const oclMat& control=oclMat()); // control is optional (defaults to an empty oclMat)
 +            const oclMat& correct(const oclMat& measurement);
 +
 +            oclMat statePre;           //!< predicted state (x'(k)): x'(k)=A*x(k-1)+B*u(k)
 +            oclMat statePost;          //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
 +            oclMat transitionMatrix;   //!< state transition matrix (A)
 +            oclMat controlMatrix;      //!< control matrix (B) (not used if there is no control)
 +            oclMat measurementMatrix;  //!< measurement matrix (H)
 +            oclMat processNoiseCov;    //!< process noise covariance matrix (Q)
 +            oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R)
 +            oclMat errorCovPre;        //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q
 +            oclMat gain;               //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
 +            oclMat errorCovPost;       //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
 +        private:
 +            oclMat temp1; // private temporaries (presumably scratch matrices for predict/correct -- confirm)
 +            oclMat temp2;
 +            oclMat temp3;
 +            oclMat temp4;
 +            oclMat temp5;
 +        };
 +
 +        /*!***************K Nearest Neighbour*************!*/
 +        class CV_EXPORTS KNearestNeighbour: public CvKNearest
 +        {
 +        public:
 +            KNearestNeighbour();
 +            ~KNearestNeighbour();
 +
 +            bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)),
 +                bool isRegression = false, int max_k = 32, bool updateBase = false); // NOTE(review): sampleIdx is a non-const Mat& whose default is the result of setTo() on a temporary Mat()
 +
 +            void clear();
 +
 +            void find_nearest(const oclMat& samples, int k, oclMat& lables); // 'lables' (sic) is presumably the output labels matrix -- confirm
 +
 +        private:
 +            oclMat samples_ocl; // presumably the training samples kept as an oclMat -- confirm
 +        };
 +
 +        /*!***************  SVM  *************!*/
 +        class CV_EXPORTS CvSVM_OCL : public CvSVM // derives from CvSVM and declares its own predict() overloads
 +        {
 +        public:
 +            CvSVM_OCL();
 +
 +            CvSVM_OCL(const cv::Mat& trainData, const cv::Mat& responses,
 +                      const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
 +                      CvSVMParams params=CvSVMParams()); // varIdx/sampleIdx default to empty matrices, params to a default-constructed CvSVMParams
 +            CV_WRAP float predict( const int row_index, Mat& src, bool returnDFVal=false ) const;
 +            CV_WRAP void predict( cv::InputArray samples, cv::OutputArray results ) const;
 +            CV_WRAP float predict( const cv::Mat& sample, bool returnDFVal=false ) const;
 +            float predict( const CvMat* samples, CV_OUT CvMat* results ) const;
 +
 +        protected:
 +            float predict( const int row_index, int row_len, Mat& src, bool returnDFVal=false ) const; // protected overload with an extra row_len argument
 +            void create_kernel();
 +            void create_solver();
 +        };
 +
 +        /*!***************  END  *************!*/
 +    }
 +}
 +#if defined _MSC_VER && _MSC_VER >= 1200
 +#  pragma warning( push)
 +#  pragma warning( disable: 4267)
 +#endif
 +#include "opencv2/ocl/matrix_operations.hpp"
 +#if defined _MSC_VER && _MSC_VER >= 1200
 +#  pragma warning( pop)
 +#endif
 +
 +#endif /* __OPENCV_OCL_HPP__ */
diff --cc modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp
index beb3d27,beb3d27..e384544
--- a/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp
@@@ -21,7 -21,7 +21,7 @@@
  //
  //   * Redistribution's in binary form must reproduce the above copyright notice,
  //     this list of conditions and the following disclaimer in the documentation
--//     and/or other oclMaterials provided with the distribution.
++//     and/or other materials provided with the distribution.
  //
  //   * The name of the copyright holders may not be used to endorse or promote products
  //     derived from this software without specific prior written permission.
diff --cc modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp
index 70c45d3,70c45d3..08f980f
--- a/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp
@@@ -21,7 -21,7 +21,7 @@@
  //
  //   * Redistribution's in binary form must reproduce the above copyright notice,
  //     this list of conditions and the following disclaimer in the documentation
--//     and/or other oclMaterials provided with the distribution.
++//     and/or other materials provided with the distribution.
  //
  //   * The name of the copyright holders may not be used to endorse or promote products
  //     derived from this software without specific prior written permission.
diff --cc modules/ocl/include/opencv2/ocl/private/util.hpp
index 670b03c,88f603b..efb684c
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@@ -25,7 -25,7 +25,7 @@@
  //
  //   * Redistribution's in binary form must reproduce the above copyright notice,
  //     this list of conditions and the following disclaimer in the documentation
--//     and/or other oclMaterials provided with the distribution.
++//     and/or other materials provided with the distribution.
  //
  //   * The name of the copyright holders may not be used to endorse or promote products
  //     derived from this software without specific prior written permission.
@@@ -100,18 -100,22 +100,22 @@@ CV_EXPORTS void openCLFree(void *devPtr
  CV_EXPORTS cl_mem openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
  CV_EXPORTS void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
  CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
 -        const cv::ocl::ProgramEntry* source, std::string kernelName);
 +        const cv::ocl::ProgramEntry* source, String kernelName);
  CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
 -        const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options);
 +        const cv::ocl::ProgramEntry* source, String kernelName, const char *build_options);
+ CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source,
 -        string kernelName, int channels, int depth, const char *build_options);
++        String kernelName, int channels, int depth, const char *build_options);
  CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
+ CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
+                           size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args);
 -CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
 +CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, String kernelName, std::vector< std::pair<size_t, const void *> > &args,
          int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
 -CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName,
 +CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName,
          size_t globalThreads[3], size_t localThreads[3],
          std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
 -CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, size_t globalThreads[3],
 +CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
          size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
 -CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, size_t globalThreads[3],
 +CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
          size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
          int depth, const char *build_options);
  
diff --cc modules/ocl/perf/perf_hough.cpp
index f259bd1,0000000..e90356a
mode 100644,000000..100644
--- a/modules/ocl/perf/perf_hough.cpp
+++ b/modules/ocl/perf/perf_hough.cpp
@@@ -1,106 -1,0 +1,106 @@@
 +/*M///////////////////////////////////////////////////////////////////////////////////////
 +//
 +//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 +//
 +//  By downloading, copying, installing or using the software you agree to this license.
 +//  If you do not agree to this license, do not download, install,
 +//  copy or use the software.
 +//
 +//
 +//                           License Agreement
 +//                For Open Source Computer Vision Library
 +//
 +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 +// Third party copyrights are property of their respective owners.
 +//
 +// Redistribution and use in source and binary forms, with or without modification,
 +// are permitted provided that the following conditions are met:
 +//
 +//   * Redistribution's of source code must retain the above copyright notice,
 +//     this list of conditions and the following disclaimer.
 +//
 +//   * Redistribution's in binary form must reproduce the above copyright notice,
 +//     this list of conditions and the following disclaimer in the documentation
- //     and/or other oclMaterials provided with the distribution.
++//     and/or other materials provided with the distribution.
 +//
 +//   * The name of the copyright holders may not be used to endorse or promote products
 +//     derived from this software without specific prior written permission.
 +//
 +// This software is provided by the copyright holders and contributors as is and
 +// any express or implied warranties, including, but not limited to, the implied
 +// warranties of merchantability and fitness for a particular purpose are disclaimed.
 +// In no event shall the Intel Corporation or contributors be liable for any direct,
 +// indirect, incidental, special, exemplary, or consequential damages
 +// (including, but not limited to, procurement of substitute goods or services;
 +// loss of use, data, or profits; or business interruption) however caused
 +// and on any theory of liability, whether in contract, strict liability,
 +// or tort (including negligence or otherwise) arising in any way out of
 +// the use of this software, even if advised of the possibility of such damage.
 +//
 +//M*/
 +
 +#include "perf_precomp.hpp"
 +
 +#ifdef HAVE_OPENCL
 +
 +using namespace cv;
 +using namespace perf;
 +
 +//////////////////////////////////////////////////////////////////////
 +// HoughCircles
 +
 +typedef std::tr1::tuple<cv::Size, float, float> Size_Dp_MinDist_t; // test parameters: (image size, dp, minDist)
 +typedef perf::TestBaseWithParam<Size_Dp_MinDist_t> Size_Dp_MinDist;
 +
 +PERF_TEST_P(Size_Dp_MinDist, OCL_HoughCircles,
 +            testing::Combine(
 +                testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p),
 +                testing::Values(1.0f, 2.0f, 4.0f),
 +                testing::Values(1.0f, 10.0f)))
 +{
 +    const Size_Dp_MinDist_t params = GetParam();
 +    const cv::Size size = std::tr1::get<0>(params);
 +    const float dp      = std::tr1::get<1>(params);
 +    const float minDist = std::tr1::get<2>(params);
 +
 +    const int minRadius = 10;
 +    const int maxRadius = 30;
 +    const int cannyThreshold = 100;
 +    const int votesThreshold = 15;
 +
 +    cv::RNG rng(123456789); // fixed seed, so the generated input image is the same on every run
 +
 +    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0)), circles; // src starts as an all-zero 8-bit single-channel image
 +
 +    const int numCircles = rng.uniform(50, 100);
 +    for (int i = 0; i < numCircles; ++i)
 +    {
 +        cv::Point center(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
 +        const int radius = rng.uniform(minRadius, maxRadius + 1); // +1 because the upper bound of uniform() is exclusive
 +
 +        cv::circle(src, center, radius, cv::Scalar::all(255), -1); // negative thickness draws a filled circle
 +    }
 +
 +    declare.time(10.0).iterations(25);
 +
 +    if (RUN_OCL_IMPL)
 +    {
 +        cv::ocl::oclMat ocl_src(src), ocl_circles; // the oclMat is built from src before the timed cycle
 +
 +        OCL_TEST_CYCLE() cv::ocl::HoughCircles(ocl_src, ocl_circles, HOUGH_GRADIENT, dp, minDist,
 +                                               cannyThreshold, votesThreshold, minRadius, maxRadius);
 +    }
 +    else if (RUN_PLAIN_IMPL)
 +    {
 +        TEST_CYCLE() cv::HoughCircles(src, circles, HOUGH_GRADIENT, dp, minDist, cannyThreshold,
 +                                      votesThreshold, minRadius, maxRadius);
 +    }
 +    else
 +        OCL_PERF_ELSE
 +
 +    int value = 0; // constant placeholder: the detected circles themselves are not passed to the sanity check
 +    SANITY_CHECK(value);
 +}
 +
 +#endif // HAVE_OPENCL
diff --cc modules/ocl/src/arithm.cpp
index 6bfa733,9b24b16..5bcfbe1
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@@ -472,21 -472,25 +472,25 @@@ static void arithmetic_minMax_run(cons
      const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
      const char * const channelMap[] = { " ", " ", "2", "4", "4" };
  
 -    ostringstream stream;
 +    std::ostringstream stream;
      stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()];
-     stream << " -D MAX_VAL=" << (WT)std::numeric_limits<T>::max();
-     stream << " -D MIN_VAL=" << (std::numeric_limits<T>::is_integer ?
-                   (WT)std::numeric_limits<T>::min() : -(WT)(std::numeric_limits<T>::max()));
 -    if (numeric_limits<T>::is_integer)
++    if (std::numeric_limits<T>::is_integer)
+     {
 -        stream << " -D MAX_VAL=" << (WT)numeric_limits<T>::max();
 -        stream << " -D MIN_VAL=" << (WT)numeric_limits<T>::min();
++        stream << " -D MAX_VAL=" << (WT)std::numeric_limits<T>::max();
++        stream << " -D MIN_VAL=" << (WT)std::numeric_limits<T>::min();
+     }
+     else
+         stream << " -D DEPTH_" << src.depth();
      std::string buildOptions = stream.str();
  
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
 +    std::vector<std::pair<size_t , const void *> > args;
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
  
      int minvalid_cols = 0, moffset = 0;
      if (!mask.empty())
@@@ -693,83 -697,47 +697,47 @@@ double cv::ocl::norm(const oclMat &src1
  ////////////////////////////////// flip //////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////////
  
- static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, String kernelName)
- {
-     int channels = dst.oclchannels();
-     int depth = dst.depth();
+ enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
  
-     int vector_lengths[4][7] = {{4, 4, 4, 4, 1, 1, 1},
-         {4, 4, 4, 4, 1, 1, 1},
-         {4, 4, 4, 4, 1, 1, 1},
-         {4, 4, 4, 4, 1, 1, 1}
-     };
- 
-     size_t vector_length = vector_lengths[channels - 1][depth];
-     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
- 
-     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-     int rows = divUp(dst.rows, 2);
- 
-     size_t localThreads[3]  = { 64, 4, 1 };
-     size_t globalThreads[3] = { cols, rows, 1 };
- 
-     int dst_step1 = dst.cols * dst.elemSize();
-     std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
- 
-     openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args, -1, depth);
- }
- 
- static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, String kernelName, bool isVertical)
 -static void arithmetic_flip_run(const oclMat &src, oclMat &dst, string kernelName, int flipType)
++static void arithmetic_flip_run(const oclMat &src, oclMat &dst, String kernelName, int flipType)
  {
-     int channels = dst.oclchannels();
-     int depth = dst.depth();
+     int cols = dst.cols, rows = dst.rows;
+     if ((cols == 1 && flipType == FLIP_COLS) ||
+             (rows == 1 && flipType == FLIP_ROWS) ||
+             (rows == 1 && cols == 1 && flipType == FLIP_BOTH))
+     {
+         src.copyTo(dst);
+         return;
+     }
  
-     int vector_lengths[4][7] = {{1, 1, 1, 1, 1, 1, 1},
-         {1, 1, 1, 1, 1, 1, 1},
-         {1, 1, 1, 1, 1, 1, 1},
-         {1, 1, 1, 1, 1, 1, 1}
-     };
+     cols = flipType == FLIP_COLS ? divUp(cols, 2) : cols;
+     rows = flipType & FLIP_ROWS ? divUp(rows, 2) : rows;
  
-     size_t vector_length = vector_lengths[channels - 1][depth];
-     int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
-     int cols = divUp(dst.cols + offset_cols, vector_length);
-     cols = isVertical ? cols : divUp(cols, 2);
-     int rows = isVertical ?  divUp(dst.rows, 2) : dst.rows;
+     const char * const channelMap[] = { "", "", "2", "4", "4" };
+     const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+     std::string buildOptions = format("-D T=%s%s", typeMap[dst.depth()], channelMap[dst.oclchannels()]);
  
      size_t localThreads[3]  = { 64, 4, 1 };
      size_t globalThreads[3] = { cols, rows, 1 };
  
-     int dst_step1 = dst.cols * dst.elemSize();
+     int elemSize = src.elemSize();
+     int src_step = src.step / elemSize, src_offset = src.offset / elemSize;
+     int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize;
+ 
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&rows ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
 +    std::vector<std::pair<size_t , const void *> > args;
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
  
-     if (isVertical)
-         args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
-     else
-         args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
- 
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
- 
-     const cv::ocl::ProgramEntry* source = isVertical ? &arithm_flip_rc : &arithm_flip;
- 
-     openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth);
+     openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args,
+                         -1, -1, buildOptions.c_str());
  }
  
  void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
diff --cc modules/ocl/src/blend.cpp
index c9bba13,a2b70f0..39f09c4
--- a/modules/ocl/src/blend.cpp
+++ b/modules/ocl/src/blend.cpp
@@@ -49,35 -49,51 +49,51 @@@
  using namespace cv;
  using namespace cv::ocl;
  
- void cv::ocl::blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2,
-                           oclMat &result)
+ void cv::ocl::blendLinear(const oclMat &src1, const oclMat &src2, const oclMat &weights1, const oclMat &weights2,
+                           oclMat &dst)
  {
-     cv::ocl::Context *ctx = img1.clCxt;
-     CV_Assert(ctx == img2.clCxt && ctx == weights1.clCxt && ctx == weights2.clCxt);
-     int channels = img1.oclchannels();
-     int depth = img1.depth();
-     int rows = img1.rows;
-     int cols = img1.cols;
-     int istep = img1.step1();
-     int wstep = weights1.step1();
-     size_t globalSize[] = {cols * channels / 4, rows, 1};
-     size_t localSize[] = {256, 1, 1};
+     CV_Assert(src1.depth() <= CV_32F);
+     CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+     CV_Assert(weights1.size() == weights2.size() && weights1.size() == src1.size() &&
+               weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1);
+ 
+     dst.create(src1.size(), src1.type());
+ 
+     size_t globalSize[] = { dst.cols, dst.rows, 1};
+     size_t localSize[] = { 16, 16, 1 };
+ 
+     int depth = dst.depth(), ocn = dst.oclchannels();
+     int src1_step = src1.step / src1.elemSize(), src1_offset = src1.offset / src1.elemSize();
+     int src2_step = src2.step / src2.elemSize(), src2_offset = src2.offset / src2.elemSize();
+     int weight1_step = weights1.step / weights1.elemSize(), weight1_offset = weights1.offset / weights1.elemSize();
+     int weight2_step = weights2.step / weights2.elemSize(), weight2_offset = weights2.offset / weights2.elemSize();
+     int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
+ 
+     const char * const channelMap[] = { "", "", "2", "4", "4" };
+     const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+     std::string buildOptions = format("-D T=%s%s -D convertToT=convert_%s%s%s -D FT=float%s -D convertToFT=convert_float%s",
+                                       typeMap[depth], channelMap[ocn], typeMap[depth], channelMap[ocn],
+                                       depth >= CV_32S ? "" : "_sat_rte", channelMap[ocn], channelMap[ocn]);
  
 -    vector< pair<size_t, const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&weights1.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&weight1_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&weight1_step ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&weights2.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&weight2_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&weight2_step ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
 +    std::vector< std::pair<size_t, const void *> > args;
-     result.create(img1.size(), CV_MAKE_TYPE(depth,img1.channels()));
-     if(globalSize[0] != 0)
-     {
-         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data ));
-         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img1.data ));
-         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img2.data ));
-         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data ));
-         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data ));
-         args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
-         args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-         args.push_back( std::make_pair( sizeof(cl_int), (void *)&istep ));
-         args.push_back( std::make_pair( sizeof(cl_int), (void *)&wstep ));
-         String kernelName = "BlendLinear";
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_step ));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_step ));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
  
-         openCLExecuteKernel(ctx, &blend_linear, kernelName, globalSize, localSize, args, channels, depth);
-     }
+     openCLExecuteKernel(src1.clCxt, &blend_linear, "blendLinear", globalSize, localSize, args,
+                         -1, -1, buildOptions.c_str());
  }
diff --cc modules/ocl/src/canny.cpp
index 3f5de52,e0d788b..8c68d8b
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@@@ -78,20 -78,10 +78,11 @@@ void cv::ocl::CannyBuf::create(const Si
              filterDY = createDerivFilter_GPU(CV_8U, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
          }
      }
 -    ensureSizeIsEnough(2 * (image_size.height + 2), image_size.width + 2, CV_32FC1, edgeBuf);
 +    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, magBuf);
 +    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, mapBuf);
  
-     ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
-     ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
- 
-     int counter_i [1] = { 0 };
-     int err = 0;
-     if(counter)
-     {
-         openCLFree(counter);
-     }
-     counter = clCreateBuffer( *((cl_context*)getClContextPtr()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
-     openCLSafeCall(err);
+     ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1);
+     ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2);
  }
  
  void cv::ocl::CannyBuf::release()
@@@ -100,15 -90,9 +91,10 @@@
      dy.release();
      dx_buf.release();
      dy_buf.release();
 -    edgeBuf.release();
 +    magBuf.release();
 +    mapBuf.release();
      trackBuf1.release();
      trackBuf2.release();
-     if(counter)
-     {
-         openCLFree(counter);
-         counter = NULL;
-     }
  }
  
  namespace cv
@@@ -320,54 -312,61 +306,61 @@@ void canny::calcMap_gpu(oclMat &dx, ocl
      openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
  }
  
- void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols)
+ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols)
  {
      Context *clCxt = map.clCxt;
-     String kernelName = "edgesHysteresisLocal";
 -    vector< pair<size_t, const void *> > args;
 +    std::vector< std::pair<size_t, const void *> > args;
  
+     Mat counterMat(counter.rows, counter.cols, counter.type());
+     counterMat.at<int>(0, 0) = 0;
+     counter.upload(counterMat);
+ 
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
+     cl_int stepBytes = map.step;
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&stepBytes));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&stepBytes));
+     cl_int offsetBytes = map.offset;
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&offsetBytes));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&offsetBytes));
  
      size_t globalThreads[3] = {cols, rows, 1};
      size_t localThreads[3]  = {16, 16, 1};
  
-     openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+     openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisLocal", globalThreads, localThreads, args, -1, -1);
  }
  
- void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
+ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols)
  {
-     unsigned int count;
-     openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
      Context *clCxt = map.clCxt;
-     String kernelName = "edgesHysteresisGlobal";
 -    vector< pair<size_t, const void *> > args;
 +    std::vector< std::pair<size_t, const void *> > args;
      size_t localThreads[3]  = {128, 1, 1};
  
-     int count_i[1] = {0};
-     while(count > 0)
+     while(1 > 0)
      {
-         openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
+         Mat counterMat; counter.download(counterMat);
+         int count = counterMat.at<int>(0, 0);
+         CV_Assert(count >= 0);
+         if (count == 0)
+             break;
+ 
+         counterMat.at<int>(0, 0) = 0;
+         counter.upload(counterMat);
  
          args.clear();
-         size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1};
+         size_t globalThreads[3] = {std::min((unsigned)count, 65535u) * 128, divUp(count, 65535), 1};
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data));
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
 -        args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
 -        args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
 -        args.push_back( make_pair( sizeof(cl_int), (void *)&count));
 -        args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
 -        args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
 +        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
 +        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
 +        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st2.data));
-         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter));
++        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data));
 +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
 +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
 +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&count));
 +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
 +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
  
-         openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
-         openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
+         openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisGlobal", globalThreads, localThreads, args, -1, -1);
          std::swap(st1, st2);
      }
  }
diff --cc modules/ocl/src/cl_operations.cpp
index f83220d,d344689..5910d05
--- a/modules/ocl/src/cl_operations.cpp
+++ b/modules/ocl/src/cl_operations.cpp
@@@ -174,10 -224,62 +224,62 @@@ void openCLCopyBuffer2D(Context *ctx, v
  
  void openCLFree(void *devPtr)
  {
+ #ifdef CHECK_MEMORY_CORRUPTION
+     bool failBefore = false, failAfter = false;
+     CheckBuffers data;
+     std::map<cl_mem, CheckBuffers>::iterator i = __check_buffers.find((cl_mem)devPtr);
+     if (i != __check_buffers.end())
+     {
+         data = i->second;
+         Context* ctx = Context::getContext();
+         std::vector<uchar> checkBefore(__memory_corruption_check_bytes);
+         std::vector<uchar> checkAfter(__memory_corruption_check_bytes);
+         openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx),
+                 data.mainBuffer, CL_TRUE, 0, __memory_corruption_check_bytes, &checkBefore[0],
+                 0, NULL, NULL));
+         openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx),
+                 data.mainBuffer, CL_TRUE, __memory_corruption_check_bytes + data.size, __memory_corruption_check_bytes, &checkAfter[0],
+                 0, NULL, NULL));
+ 
+         std::vector<int> tmp(__memory_corruption_check_bytes / sizeof(int),
+                 __memory_corruption_check_pattern);
+ 
+         if (memcmp(&checkBefore[0], &tmp[0], __memory_corruption_check_bytes) != 0)
+         {
+             failBefore = true;
+         }
+         if (memcmp(&checkAfter[0], &tmp[0], __memory_corruption_check_bytes) != 0)
+         {
+             failAfter = true;
+         }
+         openCLSafeCall(clReleaseMemObject(data.mainBuffer));
+         __check_buffers.erase(i);
+     }
+ #endif
      openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
+ #ifdef CHECK_MEMORY_CORRUPTION
+     if (failBefore)
+     {
+ #ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR
+         std::cerr << "ERROR: Memory corruption detected: before buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl;
+ #endif
+ #ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR
+         CV_Error(CV_StsInternal, "Memory corruption detected: before buffer");
+ #endif
+     }
+     if (failAfter)
+     {
+ #ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR
+         std::cerr << "ERROR: Memory corruption detected: after buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl;
+ #endif
+ #ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR
+         CV_Error(CV_StsInternal, "Memory corruption detected: after buffer");
+ #endif
+     }
+ #endif
  }
  
 -cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName)
 +cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName)
  {
      return openCLGetKernelFromSource(ctx, source, kernelName, NULL);
  }
@@@ -234,8 -336,7 +336,7 @@@ static std::string removeDuplicatedWhit
      return opt;
  }
  
- void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
-                           size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
 -cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, int channels,
++cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, int channels,
                            int depth, const char *build_options)
  {
      //construct kernel name
@@@ -246,12 -347,16 +347,16 @@@
          idxStr << "_C" << channels;
      if(depth != -1)
          idxStr << "_D" << depth;
 -    kernelName += idxStr.str();
 +    kernelName = kernelName + idxStr.str();
  
-     cl_kernel kernel;
      std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
-     kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
+     cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
+     return kernel;
+ }
  
+ void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
 -                          size_t localThreads[3],  vector< pair<size_t, const void *> > &args)
++                          size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args)
+ {
      if ( localThreads != NULL)
      {
          globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
@@@ -297,9 -402,18 +402,18 @@@
      openCLSafeCall(clReleaseKernel(kernel));
  }
  
 -void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
 -                          size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
++void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
++                          size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
+                           int depth, const char *build_options)
+ {
+     cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options);
+ 
+     openCLExecuteKernel(ctx, kernel, globalThreads, localThreads, args);
+ }
+ 
 -void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
 +void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
                           size_t globalThreads[3], size_t localThreads[3],
 -                         vector< pair<size_t, const void *> > &args, int channels, int depth)
 +                         std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
  {
      openCLExecuteKernel(ctx, source, kernelName, globalThreads, localThreads, args,
                          channels, depth, NULL);
diff --cc modules/ocl/src/filtering.cpp
index 816988d,59146c1..305c723
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@@ -452,7 -423,7 +424,8 @@@ void morphOp(int op, const oclMat &src
      else
          kernel = _kernel;
  
-     Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations);
 -    Ptr<MorphologyFilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations);
++    Ptr<MorphologyFilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations)
++            .staticCast<MorphologyFilterEngine_GPU>();
  
      f->apply(src, dst);
  }
@@@ -550,99 -547,165 +549,165 @@@ static void GPUFilter2D(const oclMat &s
      CV_Assert(src.clCxt == dst.clCxt);
      CV_Assert((src.cols == dst.cols) &&
                (src.rows == dst.rows));
-     CV_Assert((src.oclchannels() == dst.oclchannels()));
-     CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
-     CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
-     CV_Assert(ksize.width == ksize.height);
-     Context *clCxt = src.clCxt;
- 
-     int filterWidth = ksize.width;
-     bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4 && src.type() != CV_32FC3; // CV_32FC4 is not tuned up with filter2d_3x3 kernel
+     CV_Assert(src.oclchannels() == dst.oclchannels());
  
-     String kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D";
+     CV_Assert(kernel.cols == ksize.width && kernel.rows == ksize.height);
+     CV_Assert(kernel.channels() == 1);
  
-     size_t src_offset_x = (src.offset % src.step) / src.elemSize();
-     size_t src_offset_y = src.offset / src.step;
+     CV_Assert(anchor.x >= 0 && anchor.x < kernel.cols);
+     CV_Assert(anchor.y >= 0 && anchor.y < kernel.rows);
  
-     size_t dst_offset_x = (dst.offset % dst.step) / dst.elemSize();
-     size_t dst_offset_y = dst.offset / dst.step;
+     bool useDouble = src.depth() == CV_64F;
  
-     int paddingPixels = filterWidth & (-2);
+     std::vector<float> kernelDataFloat;
+     std::vector<double> kernelDataDouble;
+     int kernel_size_y2_aligned = useDouble ?
+             _prepareKernelFilter2D<double>(kernelDataDouble, kernel)
+             : _prepareKernelFilter2D<float>(kernelDataFloat, kernel);
+     oclMat oclKernelParameter;
+     if (useDouble)
+     {
+         oclKernelParameter.createEx(1, kernelDataDouble.size(), CV_64FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT);
+         openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataDouble.size()*sizeof(double),
+                 &kernelDataDouble[0], kernelDataDouble.size()*sizeof(double),
+                 kernelDataDouble.size()*sizeof(double), 1, clMemcpyHostToDevice);
+     }
+     else
+     {
+         oclKernelParameter.createEx(1, kernelDataFloat.size(), CV_32FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT);
+         openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataFloat.size()*sizeof(float),
+                 &kernelDataFloat[0], kernelDataFloat.size()*sizeof(float),
+                 kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice);
+     }
  
-     size_t localThreads[3]  = {ksize_3x3 ? 256 : 16, ksize_3x3 ? 1 : 16, 1};
-     size_t globalThreads[3] = {src.wholecols, src.wholerows, 1};
+     size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
+     do {
+         size_t BLOCK_SIZE = tryWorkItems;
+         while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
+             BLOCK_SIZE /= 2;
+ #if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
+         size_t BLOCK_SIZE_Y = 1;
+ #else
+         size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
+         while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
+             BLOCK_SIZE_Y *= 2;
+ #endif
+ 
+         CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
+ 
+         bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
+ 
 -        vector<pair<size_t , const void *> > args;
++        std::vector<std::pair<size_t , const void *> > args;
+ 
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
++        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
+         cl_uint stepBytes = src.step;
 -        args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
++        args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes));
+         int offsetXBytes = src.offset % src.step;
+         int offsetX = offsetXBytes / src.elemSize();
+         CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
+         int offsetY = src.offset / src.step;
+         int endX = (offsetX + src.cols);
+         int endY = (offsetY + src.rows);
+         cl_int rect[4] = {offsetX, offsetY, endX, endY};
+         if (!isIsolatedBorder)
+         {
+             rect[2] = src.wholecols;
+             rect[3] = src.wholerows;
+         }
 -        args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
++        args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
+ 
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
++        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
+         cl_uint _stepBytes = dst.step;
 -        args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
++        args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes));
+         int _offsetXBytes = dst.offset % dst.step;
+         int _offsetX = _offsetXBytes / dst.elemSize();
+         CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
+         int _offsetY = dst.offset / dst.step;
+         int _endX = (_offsetX + dst.cols);
+         int _endY = (_offsetY + dst.rows);
+         cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
 -        args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
++        args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
+ 
+         float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+         double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+         if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
+         {
+             if (useDouble)
 -                args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
++                args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
+             else
 -                args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
++                args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
+         }
  
-     int cn =  src.oclchannels();
-     int src_step = (int)(src.step/src.elemSize());
-     int dst_step = (int)(dst.step/src.elemSize());
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
++        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
  
-     int localWidth = localThreads[0] + paddingPixels;
-     int localHeight = localThreads[1] + paddingPixels;
+         const char* btype = NULL;
  
-     size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize();
+         switch (borderType & ~BORDER_ISOLATED)
+         {
+         case BORDER_CONSTANT:
+             btype = "BORDER_CONSTANT";
+             break;
+         case BORDER_REPLICATE:
+             btype = "BORDER_REPLICATE";
+             break;
+         case BORDER_REFLECT:
+             btype = "BORDER_REFLECT";
+             break;
+         case BORDER_WRAP:
+             CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
+             return;
+         case BORDER_REFLECT101:
+             btype = "BORDER_REFLECT_101";
+             break;
+         }
  
-     int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
-     {4, 4, 1, 1, 1, 1, 1},
-     {1, 1, 1, 1, 1, 1, 1},
-     {4, 4, 4, 4, 1, 1, 4}
-     };
-     int cols = dst.cols + ((dst_offset_x) & (vector_lengths[cn - 1][src.depth()] - 1));
+         int requiredTop = anchor.y;
+         int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
+         int requiredBottom = ksize.height - 1 - anchor.y;
+         int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
+         int h = isIsolatedBorder ? src.rows : src.wholerows;
+         int w = isIsolatedBorder ? src.cols : src.wholecols;
+         bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
+ 
+         char build_options[1024];
+         sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
+                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
+                 "-D %s -D %s -D %s",
+                 (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
+                 src.depth(), src.oclchannels(), useDouble ? 1 : 0,
+                 anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
+                 btype,
+                 extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+                 isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+ 
+         size_t lt[3] = {BLOCK_SIZE, 1, 1};
+         size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
+ 
+         cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options);
+ 
+         size_t kernelWorkGroupSize;
+         openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
+                                                 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
+         if (lt[0] > kernelWorkGroupSize)
+         {
+             clReleaseKernel(kernel);
+             CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
+             tryWorkItems = kernelWorkGroupSize;
+             continue;
+         }
  
-     std::vector< std::pair<size_t, const void *> > args;
-     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_step));
-     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
-     args.push_back(std::make_pair(localMemSize,   (void *)NULL));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_x));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_y));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_x));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_y));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&cols));
-     char btype[30];
-     switch (borderType)
-     {
-     case 0:
-         sprintf(btype, "BORDER_CONSTANT");
-         break;
-     case 1:
-         sprintf(btype, "BORDER_REPLICATE");
-         break;
-     case 2:
-         sprintf(btype, "BORDER_REFLECT");
-         break;
-     case 3:
-         CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-         return;
-     case 4:
-         sprintf(btype, "BORDER_REFLECT_101");
-         break;
-     }
-     int type = src.depth();
-     char build_options[150];
-     sprintf(build_options, "-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d", btype, cn, type, cn, ksize.width);
-     openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+         openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
+     } while (false);
  }
  
- Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
+ Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize,
          const Point &anchor, int borderType)
  {
-     static const GPUFilter2D_t GPUFilter2D_callers[] = {0, GPUFilter2D, 0, GPUFilter2D, GPUFilter2D};
- 
-     CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType);
- 
-     oclMat gpu_krnl;
      Point norm_archor = anchor;
-     normalizeKernel(kernel, gpu_krnl, CV_32FC1);
      normalizeAnchor(norm_archor, ksize);
  
-     return makePtr<LinearFilter_GPU>(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)],
-         borderType);
+     return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, norm_archor, kernel, GPUFilter2D,
+                                borderType));
  }
  
  Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor,
@@@ -711,15 -776,10 +778,10 @@@ public
  Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
          const Ptr<BaseColumnFilter_GPU> &columnFilter)
  {
 -    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter));
 +    return makePtr<SeparableFilterEngine_GPU>(rowFilter, columnFilter);
  }
  
- /*
- **data type supported: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4
- **support four border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101
- */
- 
- static void GPUFilterBox_8u_C1R(const oclMat &src, oclMat &dst,
+ static void GPUFilterBox(const oclMat &src, oclMat &dst,
                           Size &ksize, const Point anchor, const int borderType)
  {
      //Normalize the result by default
@@@ -728,262 -788,137 +790,137 @@@
      CV_Assert(src.clCxt == dst.clCxt);
      CV_Assert((src.cols == dst.cols) &&
                (src.rows == dst.rows));
-     Context *clCxt = src.clCxt;
- 
-     String kernelName = "boxFilter_C1_D0";
- 
-     char btype[30];
- 
-     switch (borderType)
-     {
-     case 0:
-         sprintf(btype, "BORDER_CONSTANT");
-         break;
-     case 1:
-         sprintf(btype, "BORDER_REPLICATE");
-         break;
-     case 2:
-         sprintf(btype, "BORDER_REFLECT");
-         break;
-     case 3:
-         CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-         return;
-     case 4:
-         sprintf(btype, "BORDER_REFLECT_101");
-         break;
-     }
- 
-     char build_options[150];
-     sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype);
- 
-     size_t blockSizeX = 256, blockSizeY = 1;
-     size_t gSize = blockSizeX - (ksize.width - 1);
-     size_t threads = (dst.offset % dst.step % 4 + dst.cols + 3) / 4;
-     size_t globalSizeX = threads % gSize == 0 ? threads / gSize * blockSizeX : (threads / gSize + 1) * blockSizeX;
-     size_t globalSizeY = ((dst.rows + 1) / 2) % blockSizeY == 0 ? ((dst.rows + 1) / 2) : (((dst.rows + 1) / 2) / blockSizeY + 1) * blockSizeY;
- 
-     size_t globalThreads[3] = { globalSizeX, globalSizeY, 1 };
-     size_t localThreads[3]  = { blockSizeX, blockSizeY, 1 };
- 
-     std::vector<std::pair<size_t , const void *> > args;
-     args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
-     args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
-     args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
- 
-     openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
- }
- 
- static void GPUFilterBox_8u_C4R(const oclMat &src, oclMat &dst,
-                          Size &ksize, const Point anchor, const int borderType)
- {
-     //Normalize the result by default
-     float alpha = ksize.height * ksize.width;
- 
-     CV_Assert(src.clCxt == dst.clCxt);
-     CV_Assert((src.cols == dst.cols) &&
-               (src.rows == dst.rows));
-     Context *clCxt = src.clCxt;
- 
-     String kernelName = "boxFilter_C4_D0";
- 
-     char btype[30];
- 
-     switch (borderType)
-     {
-     case 0:
-         sprintf(btype, "BORDER_CONSTANT");
-         break;
-     case 1:
-         sprintf(btype, "BORDER_REPLICATE");
-         break;
-     case 2:
-         sprintf(btype, "BORDER_REFLECT");
-         break;
-     case 3:
-         CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-         return;
-     case 4:
-         sprintf(btype, "BORDER_REFLECT_101");
-         break;
-     }
- 
-     char build_options[150];
-     sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype);
- 
-     size_t blockSizeX = 256, blockSizeY = 1;
-     size_t gSize = blockSizeX - ksize.width / 2 * 2;
-     size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX;
-     size_t rows_per_thread = 2;
-     size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
- 
-     size_t globalThreads[3] = { globalSizeX, globalSizeY, 1};
-     size_t localThreads[3]  = { blockSizeX, blockSizeY, 1};
- 
-     std::vector<std::pair<size_t , const void *> > args;
-     args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
-     args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
-     args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
- 
-     openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
- }
- 
- static void GPUFilterBox_32F_C1R(const oclMat &src, oclMat &dst,
-                           Size &ksize, const Point anchor, const int borderType)
- {
-     //Normalize the result by default
-     float alpha = ksize.height * ksize.width;
- 
-     CV_Assert(src.clCxt == dst.clCxt);
-     CV_Assert((src.cols == dst.cols) &&
-               (src.rows == dst.rows));
-     Context *clCxt = src.clCxt;
- 
-     String kernelName = "boxFilter_C1_D5";
- 
-     char btype[30];
- 
-     switch (borderType)
-     {
-     case 0:
-         sprintf(btype, "BORDER_CONSTANT");
-         break;
-     case 1:
-         sprintf(btype, "BORDER_REPLICATE");
-         break;
-     case 2:
-         sprintf(btype, "BORDER_REFLECT");
-         break;
-     case 3:
-         CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-         return;
-     case 4:
-         sprintf(btype, "BORDER_REFLECT_101");
-         break;
-     }
- 
-     char build_options[150];
-     sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype);
- 
-     size_t blockSizeX = 256, blockSizeY = 1;
-     size_t gSize = blockSizeX - ksize.width / 2 * 2;
-     size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX;
-     size_t rows_per_thread = 2;
-     size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
- 
- 
-     size_t globalThreads[3] = { globalSizeX, globalSizeY, 1};
-     size_t localThreads[3]  = { blockSizeX, blockSizeY, 1};
- 
-     std::vector<std::pair<size_t , const void *> > args;
-     args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
-     args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
-     args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
- 
-     openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
- }
- 
- static void GPUFilterBox_32F_C4R(const oclMat &src, oclMat &dst,
-                           Size &ksize, const Point anchor, const int borderType)
- {
-     //Normalize the result by default
-     float alpha = ksize.height * ksize.width;
- 
-     CV_Assert(src.clCxt == dst.clCxt);
-     CV_Assert((src.cols == dst.cols) &&
-               (src.rows == dst.rows));
-     Context *clCxt = src.clCxt;
- 
-     String kernelName = "boxFilter_C4_D5";
- 
-     char btype[30];
- 
-     switch (borderType)
-     {
-     case 0:
-         sprintf(btype, "BORDER_CONSTANT");
-         break;
-     case 1:
-         sprintf(btype, "BORDER_REPLICATE");
-         break;
-     case 2:
-         sprintf(btype, "BORDER_REFLECT");
-         break;
-     case 3:
-         CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-         return;
-     case 4:
-         sprintf(btype, "BORDER_REFLECT_101");
-         break;
-     }
+     CV_Assert(src.oclchannels() == dst.oclchannels());
  
-     char build_options[150];
-     sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype);
+     size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
+     do {
+         size_t BLOCK_SIZE = tryWorkItems;
+         while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
+             BLOCK_SIZE /= 2;
+         size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
+         while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
+             BLOCK_SIZE_Y *= 2;
+ 
+         CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
+ 
+         bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
+ 
 -        vector<pair<size_t , const void *> > args;
++        std::vector<std::pair<size_t , const void *> > args;
+ 
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
++        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
+         cl_uint stepBytes = src.step;
 -        args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
++        args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes));
+         int offsetXBytes = src.offset % src.step;
+         int offsetX = offsetXBytes / src.elemSize();
+         CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
+         int offsetY = src.offset / src.step;
+         int endX = (offsetX + src.cols);
+         int endY = (offsetY + src.rows);
+         cl_int rect[4] = {offsetX, offsetY, endX, endY};
+         if (!isIsolatedBorder)
+         {
+             rect[2] = src.wholecols;
+             rect[3] = src.wholerows;
+         }
 -        args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
++        args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
+ 
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
++        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
+         cl_uint _stepBytes = dst.step;
 -        args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
++        args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes));
+         int _offsetXBytes = dst.offset % dst.step;
+         int _offsetX = _offsetXBytes / dst.elemSize();
+         CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
+         int _offsetY = dst.offset / dst.step;
+         int _endX = (_offsetX + dst.cols);
+         int _endY = (_offsetY + dst.rows);
+         cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
 -        args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
++        args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
+ 
+         bool useDouble = src.depth() == CV_64F;
+ 
+         float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+         double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+         if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
+         {
+             if (useDouble)
 -                args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
++                args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
+             else
 -                args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
++                args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
+         }
  
-     size_t blockSizeX = 256, blockSizeY = 1;
-     size_t gSize = blockSizeX - ksize.width / 2 * 2;
-     size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX;
-     size_t rows_per_thread = 2;
-     size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
+         double alphaDouble = alpha; // DON'T move into 'if' body
+         if (useDouble)
 -            args.push_back( make_pair( sizeof(double), (void *)&alphaDouble));
++            args.push_back( std::make_pair( sizeof(double), (void *)&alphaDouble));
+         else
 -            args.push_back( make_pair( sizeof(float), (void *)&alpha));
++            args.push_back( std::make_pair( sizeof(float), (void *)&alpha));
  
+         const char* btype = NULL;
  
-     size_t globalThreads[3] = { globalSizeX, globalSizeY, 1};
-     size_t localThreads[3]  = { blockSizeX, blockSizeY, 1};
+         switch (borderType & ~BORDER_ISOLATED)
+         {
+         case BORDER_CONSTANT:
+             btype = "BORDER_CONSTANT";
+             break;
+         case BORDER_REPLICATE:
+             btype = "BORDER_REPLICATE";
+             break;
+         case BORDER_REFLECT:
+             btype = "BORDER_REFLECT";
+             break;
+         case BORDER_WRAP:
+             CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
+             return;
+         case BORDER_REFLECT101:
+             btype = "BORDER_REFLECT_101";
+             break;
+         }
  
-     std::vector<std::pair<size_t , const void *> > args;
-     args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
-     args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
-     args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
+         int requiredTop = anchor.y;
+         int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
+         int requiredBottom = ksize.height - 1 - anchor.y;
+         int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
+         int h = isIsolatedBorder ? src.rows : src.wholerows;
+         int w = isIsolatedBorder ? src.cols : src.wholecols;
+         bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
+ 
+         CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well
+ 
+         char build_options[1024];
+         sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
+                 (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
+                 src.depth(), src.oclchannels(), useDouble ? 1 : 0,
+                 anchor.x, anchor.y, ksize.width, ksize.height,
+                 btype,
+                 extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+                 isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+ 
+         size_t lt[3] = {BLOCK_SIZE, 1, 1};
+         size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
+ 
+         cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options);
+ 
+         size_t kernelWorkGroupSize;
+         openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
+                                                 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
+         if (lt[0] > kernelWorkGroupSize)
+         {
+             clReleaseKernel(kernel);
+             CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
+             tryWorkItems = kernelWorkGroupSize;
+             continue;
+         }
  
-     openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+         openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
+     } while (false);
  }
  
- 
- Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int srcType, int dstType,
+ Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,
          const Size &ksize, Point anchor, int borderType)
  {
-     static const FilterBox_t FilterBox_callers[2][5] = {{0, GPUFilterBox_8u_C1R, 0, GPUFilterBox_8u_C4R, GPUFilterBox_8u_C4R},
-         {0, GPUFilterBox_32F_C1R, 0, GPUFilterBox_32F_C4R, GPUFilterBox_32F_C4R}
-     };
-     //Remove this check if more data types need to be supported.
-     CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 ||
-                srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType);
- 
      normalizeAnchor(anchor, ksize);
  
-     return makePtr<GPUBoxFilter>(ksize, anchor,
-         borderType, FilterBox_callers[(CV_MAT_DEPTH(srcType) == CV_32F)][CV_MAT_CN(srcType)]);
+     return Ptr<BaseFilter_GPU>(new GPUBoxFilter(ksize, anchor,
+                                borderType, GPUFilterBox));
  }
  
  Ptr<FilterEngine_GPU> cv::ocl::createBoxFilter_GPU(int srcType, int dstType,
@@@ -1373,11 -1308,14 +1310,14 @@@ void cv::ocl::Scharr(const oclMat &src
      sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype);
  }
  
- void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale)
+ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale,
+         double delta, int borderType)
  {
+     CV_Assert(delta == 0);
+ 
      if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
      {
 -        CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
 +        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
          return;
      }
  
diff --cc modules/ocl/src/haar.cpp
index 8116496,31f6742..fd67daf
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@@ -831,34 -831,156 +831,156 @@@ void OclCascadeClassifier::detectMultiS
          pq.s[3] = gcascade->pq3;
          float correction = gcascade->inv_window_area;
  
 -        vector<pair<size_t, const void *> > args;
 -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
 -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
 -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
 -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
 -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
 -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
 -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep ));
 -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&loopcount ));
 -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
 -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
 -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
 -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
 -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode ));
 -        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p ));
 -        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
 -        args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
 +        std::vector<std::pair<size_t, const void *> > args;
 +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
 +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
 +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
 +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
 +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
 +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
 +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&pixelstep ));
 +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount ));
 +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
 +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
 +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
 +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
 +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitnode ));
 +        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&p ));
 +        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq ));
 +        args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));
  
-         const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+         if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
+         {
+             //setup local group size
+             localThreads[0] = 8;
+             localThreads[1] = 16;
+             localThreads[2] = 1;
+ 
+             //init maximal number of workgroups
+             int WGNumX = 1+(sizev[0].width /(localThreads[0]));
+             int WGNumY = 1+(sizev[0].height/(localThreads[1]));
+             int WGNumZ = loopcount;
+             int WGNum = 0; //accurate number of non -empty workgroups
+             oclMat      oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
+             {
+                 cl_int4*    pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
+                 openCLVerifyCall(status);
+                 for(int z=0;z<WGNumZ;++z)
+                 {
+                     int     Width  = (scaleinfo[z].width_height >> 16)&0xFFFF;
+                     int     Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF;
+                     for(int y=0;y<WGNumY;++y)
+                     {
+                         int     gy = y*localThreads[1];
+                         if(gy>=(Height-cascade->orig_window_size.height))
+                             continue; // no data to process
+                         for(int x=0;x<WGNumX;++x)
+                         {
+                             int     gx = x*localThreads[0];
+                             if(gx>=(Width-cascade->orig_window_size.width))
+                                 continue; // no data to process
+ 
+                             // save non-empty workgroup info into array
+                             pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
+                             pWGInfo[WGNum].s[1] = (gx << 16) | gy;
+                             pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
+                             memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
+                             WGNum++;
+                         }
+                     }
+                 }
+                 openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0));
+                 pWGInfo = NULL;
+             }
  
-         openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
+             // setup global sizes to have linear array of workgroups with WGNum size
+             globalThreads[0] = localThreads[0]*WGNum;
+             globalThreads[1] = localThreads[1];
+             globalThreads[2] = 1;
  
-         openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
+ #define NODE_SIZE 12
+             // pack node info to have less memory loads
+             oclMat  oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
+             {
+                 cl_int  status;
+                 cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE, 0, oclNodesPK.step, 0,0,0,&status);
+                 openCLVerifyCall(status);
+                 //use known local data stride to precalculate indexes
+                 int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width);
+                 // check that maximal value is less than maximal unsigned short
+                 assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < USHRT_MAX);
+                 for(int i = 0;i<nodenum;++i)
+                 {//process each node from classifier
+                     struct NodePK
+                     {
+                         unsigned short  slm_index[3][4];
+                         float           weight[3];
+                         float           threshold;
+                         float           alpha[2];
+                     };
+                     struct NodePK * pOut = (struct NodePK *)(pNodesPK + NODE_SIZE*i);
+                     for(int k=0;k<3;++k)
+                     {// calc 4 short indexes in shared local mem for each rectangle instead of 2 (x,y) pair.
+                         int* p = &(node[i].p[k][0]);
+                         pOut->slm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]);
+                         pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]);
+                         pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]);
+                         pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]);
+                     }
+                     //store the floating-point values used by each node
+                     pOut->weight[0] = node[i].weight[0];
+                     pOut->weight[1] = node[i].weight[1];
+                     pOut->weight[2] = node[i].weight[2];
+                     pOut->threshold = node[i].threshold;
+                     pOut->alpha[0] = node[i].alpha[0];
 -                    pOut->alpha[1] = node[i].alpha[1];
++                   pOut->alpha[1] = node[i].alpha[1];
+                 }
+                 openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0));
+                 pNodesPK = NULL;
+             }
+             // add 2 additional buffers (WGinfo and packed nodes) as 2 last args
 -            args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart ));
 -            args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart ));
++            args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart ));
++            args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart ));
+ 
+             //form build options for kernel
 -            string  options = "-D PACKED_CLASSIFIER";
 -            options += format(" -D NODE_SIZE=%d",NODE_SIZE);
 -            options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
 -            options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
 -            options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
 -            options += format(" -D LSx=%d",localThreads[0]);
 -            options += format(" -D LSy=%d",localThreads[1]);
 -            options += format(" -D SPLITNODE=%d",splitnode);
 -            options += format(" -D SPLITSTAGE=%d",splitstage);
 -            options += format(" -D OUTPUTSZ=%d",outputsz);
++            String  options = "-D PACKED_CLASSIFIER";
++            options = options + format(" -D NODE_SIZE=%d",NODE_SIZE);
++            options = options + format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
++            options = options + format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
++            options = options + format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
++            options = options + format(" -D LSx=%d",localThreads[0]);
++            options = options + format(" -D LSy=%d",localThreads[1]);
++            options = options + format(" -D SPLITNODE=%d",splitnode);
++            options = options + format(" -D SPLITSTAGE=%d",splitstage);
++            options = options + format(" -D OUTPUTSZ=%d",outputsz);
+ 
+             // init candidate global count to 0
+             int pattern = 0;
+             openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
+             // execute face detector
+             openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
+             //read candidate buffer back and put it into host list
+             openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
+             assert(candidate[0]<outputsz);
+             //printf("candidate[0]=%d\n",candidate[0]);
+             for(int i = 1; i <= candidate[0]; i++)
+             {
+                 allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],candidate[4 * i + 2], candidate[4 * i + 3]));
+             }
+         }
+         else
+         {
+             const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+ 
+             openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
  
-         for(int i = 0; i < outputsz; i++)
-             if(candidate[4 * i + 2] != 0)
-                 allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
-                 candidate[4 * i + 2], candidate[4 * i + 3]));
+             openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
+ 
+             for(int i = 0; i < outputsz; i++)
+                 if(candidate[4 * i + 2] != 0)
+                     allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
+                     candidate[4 * i + 2], candidate[4 * i + 3]));
+         }
  
          free(scaleinfo);
          free(candidate);
diff --cc modules/ocl/src/imgproc.cpp
index ed39868,3539dfa..96bdb91
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@@ -99,79 -98,85 +99,85 @@@ namespace c
          /////////////////////////////////////////////////////////////////////////////////////
          // threshold
  
-         typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type);
- 
-         static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
+         static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
          {
-             uchar thresh_uchar = cvFloor(thresh);
-             uchar max_val = cvRound(maxVal);
+             CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
  
-             size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16;
-             size_t bSizeX = 16, bSizeY = 16;
-             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
-             size_t gSizeY = dst.rows;
-             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
-             size_t localThreads[3] = {bSizeX, bSizeY, 1};
+             static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
+                                        sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
  
-             std::vector< std::pair<size_t, const void *> > args;
-             args.push_back( std::make_pair(sizeof(cl_mem), &src.data));
-             args.push_back( std::make_pair(sizeof(cl_mem), &dst.data));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.offset));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.step));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step));
-             args.push_back( std::make_pair(sizeof(cl_uchar), (void *)&thresh_uchar));
-             args.push_back( std::make_pair(sizeof(cl_uchar), (void *)&max_val));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&type));
-             openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth());
+             int elemSize1 = sizeMap[depth];
+             int bufSize = elemSize1 * ocn;
+             std::vector<uchar> _buf(bufSize);
+             uchar * buf = &_buf[0];
+             scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
+             memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
+ 
+             return _buf;
          }
  
-         static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
+         static void threshold_runner(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
          {
-             float thresh_f = thresh;
-             float max_val = maxVal;
-             int dst_offset = (dst.offset >> 2);
-             int dst_step = (dst.step >> 2);
-             int src_offset = (src.offset >> 2);
-             int src_step = (src.step >> 2);
- 
-             size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4;
-             size_t bSizeX = 16, bSizeY = 16;
-             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
-             size_t gSizeY = dst.rows;
-             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
-             size_t localThreads[3] = {bSizeX, bSizeY, 1};
+             bool ival = src.depth() < CV_32F;
+             int cn = src.channels(), vecSize = 4, depth = src.depth();
+             std::vector<uchar> thresholdValue = scalarToVector(cv::Scalar::all(ival ? cvFloor(thresh) : thresh), dst.depth(),
+                                                                dst.oclchannels(), dst.channels());
+             std::vector<uchar> maxValue = scalarToVector(cv::Scalar::all(maxVal), dst.depth(), dst.oclchannels(), dst.channels());
+ 
+             const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC",
+                                                   "THRESH_TOZERO", "THRESH_TOZERO_INV" };
+             const char * const channelMap[] = { "", "", "2", "4", "4" };
+             const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+             std::string buildOptions = format("-D T=%s%s -D %s", typeMap[depth], channelMap[cn], thresholdMap[thresholdType]);
+ 
+             int elemSize = src.elemSize();
+             int src_step = src.step / elemSize, src_offset = src.offset / elemSize;
+             int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize;
  
 -            vector< pair<size_t, const void *> > args;
 -            args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
 -            args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
 -            args.push_back( make_pair(thresholdValue.size(), (void *)&thresholdValue[0]));
 -            args.push_back( make_pair(maxValue.size(), (void *)&maxValue[0]));
 +            std::vector< std::pair<size_t, const void *> > args;
-             args.push_back( std::make_pair(sizeof(cl_mem), &src.data));
-             args.push_back( std::make_pair(sizeof(cl_mem), &dst.data));
++            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_offset));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_step));
++            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_offset));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_step));
-             args.push_back( std::make_pair(sizeof(cl_float), (void *)&thresh_f));
-             args.push_back( std::make_pair(sizeof(cl_float), (void *)&max_val));
-             args.push_back( std::make_pair(sizeof(cl_int), (void *)&type));
++            args.push_back( std::make_pair(thresholdValue.size(), (void *)&thresholdValue[0]));
++            args.push_back( std::make_pair(maxValue.size(), (void *)&maxValue[0]));
+ 
+             int max_index = dst.cols, cols = dst.cols;
+             if (cn == 1 && vecSize > 1)
+             {
+                 CV_Assert(((vecSize - 1) & vecSize) == 0 && vecSize <= 16);
+                 cols = divUp(cols, vecSize);
+                 buildOptions += format(" -D VECTORIZED -D VT=%s%d -D VLOADN=vload%d -D VECSIZE=%d -D VSTOREN=vstore%d",
+                                        typeMap[depth], vecSize, vecSize, vecSize, vecSize);
+ 
+                 int vecSizeBytes = vecSize * dst.elemSize1();
+                 if ((dst.offset % dst.step) % vecSizeBytes == 0 && dst.step % vecSizeBytes == 0)
+                     buildOptions += " -D DST_ALIGNED";
+                 if ((src.offset % src.step) % vecSizeBytes == 0 && src.step % vecSizeBytes == 0)
+                     buildOptions += " -D SRC_ALIGNED";
+ 
 -                args.push_back( make_pair(sizeof(cl_int), (void *)&max_index));
++                args.push_back( std::make_pair(sizeof(cl_int), (void *)&max_index));
+             }
  
-             openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth());
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
+ 
+             size_t localThreads[3] = { 16, 16, 1 };
+             size_t globalThreads[3] = { cols, dst.rows, 1 };
+ 
+             openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args,
+                                 -1, -1, buildOptions.c_str());
          }
  
-         // threshold: support 8UC1 and 32FC1 data type and five threshold type
-         double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
+         double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
          {
-             //TODO: These limitations shall be removed later.
-             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
-             CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC
-                       || type == THRESH_TOZERO || type == THRESH_TOZERO_INV );
+             CV_Assert(thresholdType == THRESH_BINARY || thresholdType == THRESH_BINARY_INV || thresholdType == THRESH_TRUNC
+                       || thresholdType == THRESH_TOZERO || thresholdType == THRESH_TOZERO_INV);
  
-             static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f};
- 
-             dst.create( src.size(), src.type() );
-             gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type);
+             dst.create(src.size(), src.type());
+             threshold_runner(src, dst, thresh, maxVal, thresholdType);
  
              return thresh;
          }
@@@ -891,8 -895,60 +897,60 @@@
  
              if (ksize > 0)
              {
-                 Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
-                 Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
+                 Context* clCxt = Context::getContext();
+                 if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 &&
+                     src.cols % 8 == 0 && src.rows % 8 == 0 &&
+                     ksize==3 &&
+                     (borderType ==cv::BORDER_REFLECT ||
+                      borderType == cv::BORDER_REPLICATE ||
+                      borderType ==cv::BORDER_REFLECT101 ||
+                      borderType ==cv::BORDER_WRAP))
+                 {
+                     Dx.create(src.size(), CV_32FC1);
+                     Dy.create(src.size(), CV_32FC1);
+ 
+                     const unsigned int block_x = 8;
+                     const unsigned int block_y = 8;
+ 
+                     unsigned int src_pitch = src.step;
+                     unsigned int dst_pitch = Dx.cols;
+ 
+                     float _scale = scale;
+ 
+                     std::vector<std::pair<size_t , const void *> > args;
+                     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
+                     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
+                     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
+                     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+                     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
+                     args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
+                     args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
+                     args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale ));
+                     size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1};
+ 
 -                    string option = "-D BLK_X=8 -D BLK_Y=8";
++                    String option = "-D BLK_X=8 -D BLK_Y=8";
+                     switch(borderType)
+                     {
+                     case cv::BORDER_REPLICATE:
 -                        option += " -D BORDER_REPLICATE";
++                        option = option + " -D BORDER_REPLICATE";
+                         break;
+                     case cv::BORDER_REFLECT:
 -                        option += " -D BORDER_REFLECT";
++                        option = option + " -D BORDER_REFLECT";
+                         break;
+                     case cv::BORDER_REFLECT101:
 -                        option += " -D BORDER_REFLECT101";
++                        option = option + " -D BORDER_REFLECT101";
+                         break;
+                     case cv::BORDER_WRAP:
 -                        option += " -D BORDER_WRAP";
++                        option = option + " -D BORDER_WRAP";
+                         break;
+                     }
+                     openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() );
+                 }
+                 else
+                 {
+                     Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
+                     Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
+                 }
              }
              else
              {
@@@ -937,23 -993,24 +995,24 @@@
  
              size_t gt[3] = { globalSizeX, globalSizeY, 1 };
              size_t lt[3]  = { blockSizeX, blockSizeY, 1 };
 -            vector<pair<size_t , const void *> > args;
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dy.data));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.offset ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholerows ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholecols ));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&Dx.step));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.offset ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholerows ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholecols ));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&Dy.step));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
 -            args.push_back( make_pair( sizeof(cl_float) , (void *)&k));
 +            std::vector<std::pair<size_t , const void *> > args;
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.offset ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.wholerows ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.wholecols ));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&Dx.step));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.offset ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.wholerows ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.wholecols ));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&Dy.step));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
 +            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step));
 +            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&k));
+ 
              openCLExecuteKernel(dst.clCxt, source, kernelName, gt, lt, args, -1, -1, buildOptions.c_str());
          }
  
@@@ -969,7 -1026,7 +1028,7 @@@
          {
              if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
              {
-                 CV_Error(Error::OpenCLDoubleNotSupported, "Select device doesn't support double");
 -                CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
++                CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
                  return;
              }
  
@@@ -991,7 -1048,7 +1050,7 @@@
          {
              if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
              {
-                 CV_Error(Error::OpenCLDoubleNotSupported, "select device don't support double");
 -                CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
++                CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
                  return;
              }
  
diff --cc modules/ocl/src/kmeans.cpp
index 5486aa4,58a68a7..52fe0eb
--- a/modules/ocl/src/kmeans.cpp
+++ b/modules/ocl/src/kmeans.cpp
@@@ -160,32 -160,61 +160,61 @@@ static void generateCentersPP(const Mat
      }
  }
  
- void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers)
+ void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType, const oclMat &indices)
  {
-     //if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
-     //{
-     //    CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-     //    return;
-     //}
- 
-     Context  *clCxt = src.clCxt;
-     int labels_step = (int)(labels.step/labels.elemSize());
+     CV_Assert(src.cols*src.oclchannels() == centers.cols*centers.oclchannels());
+     CV_Assert(src.depth() == CV_32F && centers.depth() == CV_32F);
+     bool is_label_row_major = false;
+     ensureSizeIsEnough(1, src.rows, CV_32FC1, dists);
+     if(labels.empty() || (!labels.empty() && labels.rows == src.rows && labels.cols == 1))
+     {
+         ensureSizeIsEnough(src.rows, 1, CV_32SC1, labels);
+         is_label_row_major = true;
+     }
+     CV_Assert(distType == NORM_L1 || distType == NORM_L2SQR);
+ 
+     std::stringstream build_opt_ss;
+     build_opt_ss
+         << (distType == NORM_L1 ? "-D L1_DIST" : "-D L2SQR_DIST")
+         << (indices.empty() ? "" : " -D USE_INDEX");
+ 
+     String build_opt = build_opt_ss.str();
+ 
+     const int src_step = (int)(src.oclchannels() * src.step / src.elemSize());
+     const int centers_step = (int)(centers.oclchannels() * centers.step / centers.elemSize());
+ 
+     const int colsNumb = centers.cols*centers.oclchannels();
+ 
+     const int label_step   = is_label_row_major ? (int)(labels.step / labels.elemSize()) : 1;
      String kernelname = "distanceToCenters";
-     int threadNum = src.rows > 256 ? 256 : src.rows;
-     size_t localThreads[3]  = {1, threadNum, 1};
-     size_t globalThreads[3] = {1, src.rows, 1};
+ 
+     const int number_of_input = indices.empty() ? src.rows : indices.size().area();
+ 
+     const int src_offset = (int)src.offset/src.elemSize();
+     const int centers_offset = (int)centers.offset/centers.elemSize();
+ 
+     size_t globalThreads[3] = {number_of_input, 1, 1};
  
 -    vector<pair<size_t, const void *> > args;
 -    args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
 -    args.push_back(make_pair(sizeof(cl_mem), (void *)&centers.data));
 +    std::vector<std::pair<size_t, const void *> > args;
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&labels_step));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.rows));
 +    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&labels.data));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.cols));
-     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
 +    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&centers.data));
-     args.push_back(std::make_pair(sizeof(cl_mem), (void*)&dists.data));
+     if(!indices.empty())
+     {
 -        args.push_back(make_pair(sizeof(cl_mem), (void *)&indices.data));
++        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&indices.data));
+     }
 -    args.push_back(make_pair(sizeof(cl_mem), (void *)&labels.data));
 -    args.push_back(make_pair(sizeof(cl_mem), (void *)&dists.data));
 -    args.push_back(make_pair(sizeof(cl_int), (void *)&colsNumb));
 -    args.push_back(make_pair(sizeof(cl_int), (void *)&src_step));
 -    args.push_back(make_pair(sizeof(cl_int), (void *)&centers_step));
 -    args.push_back(make_pair(sizeof(cl_int), (void *)&label_step));
 -    args.push_back(make_pair(sizeof(cl_int), (void *)&number_of_input));
 -    args.push_back(make_pair(sizeof(cl_int), (void *)&centers.rows));
 -    args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset));
 -    args.push_back(make_pair(sizeof(cl_int), (void *)&centers_offset));
++    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&labels.data));
++    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dists.data));
++    args.push_back(std::make_pair(sizeof(cl_int), (void *)&colsNumb));
++    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step));
++    args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers_step));
++    args.push_back(std::make_pair(sizeof(cl_int), (void *)&label_step));
++    args.push_back(std::make_pair(sizeof(cl_int), (void *)&number_of_input));
++    args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.rows));
++    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset));
++    args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers_offset));
  
-     openCLExecuteKernel(clCxt, &kmeans_kernel, kernelname, globalThreads, localThreads, args, -1, -1, NULL);
+     openCLExecuteKernel(Context::getContext(), &kmeans_kernel,
+         kernelname, globalThreads, NULL, args, -1, -1, build_opt.c_str());
  }
  ///////////////////////////////////k - means /////////////////////////////////////////////////////////
  double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels,
diff --cc modules/ocl/src/moments.cpp
index 6372364,f11d381..0ba6e8c
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@@@ -44,301 -44,344 +44,348 @@@
  //
  //M*/
  #include "precomp.hpp"
 +
 +#include "opencv2/imgproc/types_c.h"
 +#include "opencv2/imgproc/imgproc_c.h"
 +
  #include "opencl_kernels.hpp"
  
+ #if defined _MSC_VER
+ #define snprintf sprintf_s
+ #endif
  namespace cv
  {
- namespace ocl
- {
- // The function calculates center of gravity and the central second order moments
- static void icvCompleteMomentState( CvMoments* moments )
- {
-     double cx = 0, cy = 0;
-     double mu20, mu11, mu02;
- 
-     assert( moments != 0 );
-     moments->inv_sqrt_m00 = 0;
- 
-     if( fabs(moments->m00) > DBL_EPSILON )
-     {
-         double inv_m00 = 1. / moments->m00;
-         cx = moments->m10 * inv_m00;
-         cy = moments->m01 * inv_m00;
-         moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
-     }
- 
-     // mu20 = m20 - m10*cx
-     mu20 = moments->m20 - moments->m10 * cx;
-     // mu11 = m11 - m10*cy
-     mu11 = moments->m11 - moments->m10 * cy;
-     // mu02 = m02 - m01*cy
-     mu02 = moments->m02 - moments->m01 * cy;
- 
-     moments->mu20 = mu20;
-     moments->mu11 = mu11;
-     moments->mu02 = mu02;
- 
-     // mu30 = m30 - cx*(3*mu20 + cx*m10)
-     moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
-     mu11 += mu11;
-     // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
-     moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
-     // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
-     moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
-     // mu03 = m03 - cy*(3*mu02 + cy*m01)
-     moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
- }
- 
- 
- static void icvContourMoments( CvSeq* contour, CvMoments* mom )
- {
-     if( contour->total )
+     namespace ocl
      {
-         CvSeqReader reader;
-         int lpt = contour->total;
-         double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
- 
-         cvStartReadSeq( contour, &reader, 0 );
+         // The function calculates center of gravity and the central second order moments
+         static void icvCompleteMomentState( CvMoments* moments )
+         {
+             double cx = 0, cy = 0;
+             double mu20, mu11, mu02;
  
-         size_t reader_size = lpt << 1;
-         cv::Mat reader_mat(1,reader_size,CV_32FC1);
+             assert( moments != 0 );
+             moments->inv_sqrt_m00 = 0;
  
-         bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
+             if( fabs(moments->m00) > DBL_EPSILON )
+             {
+                 double inv_m00 = 1. / moments->m00;
+                 cx = moments->m10 * inv_m00;
+                 cy = moments->m01 * inv_m00;
+                 moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
+             }
  
-         if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float)
-         {
-             CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
+             // mu20 = m20 - m10*cx
+             mu20 = moments->m20 - moments->m10 * cx;
+             // mu11 = m11 - m10*cy
+             mu11 = moments->m11 - moments->m10 * cy;
+             // mu02 = m02 - m01*cy
+             mu02 = moments->m02 - moments->m01 * cy;
+ 
+             moments->mu20 = mu20;
+             moments->mu11 = mu11;
+             moments->mu02 = mu02;
+ 
+             // mu30 = m30 - cx*(3*mu20 + cx*m10)
+             moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
+             mu11 += mu11;
+             // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
+             moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
+             // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
+             moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
+             // mu03 = m03 - cy*(3*mu02 + cy*m01)
+             moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
          }
  
-         if( is_float )
+ 
+         static void icvContourMoments( CvSeq* contour, CvMoments* mom )
          {
-             for(size_t i = 0; i < reader_size; ++i)
+             if( contour->total )
              {
-                 reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x;
-                 reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y;
-                 CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
+                 CvSeqReader reader;
+                 int lpt = contour->total;
+                 double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
+ 
+                 cvStartReadSeq( contour, &reader, 0 );
+ 
+                 size_t reader_size = lpt << 1;
+                 cv::Mat reader_mat(1,reader_size,CV_32FC1);
+ 
+                 bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
+ 
+                 if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float)
+                 {
+                     CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
+                 }
+ 
+                 if( is_float )
+                 {
+                     for(size_t i = 0; i < reader_size; ++i)
+                     {
+                         reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x;
+                         reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y;
+                         CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
+                     }
+                 }
+                 else
+                 {
+                     for(size_t i = 0; i < reader_size; ++i)
+                     {
+                         reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x;
+                         reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
+                         CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
+                     }
+                 }
+ 
+                 cv::ocl::oclMat dst_a(10, lpt, CV_64FC1);
+                 cv::ocl::oclMat reader_oclmat(reader_mat);
+                 int llength = std::min(lpt,128);
+                 size_t localThreads[3]  = { llength, 1, 1};
+                 size_t globalThreads[3] = { lpt, 1, 1};
 -                vector<pair<size_t , const void *> > args;
 -                args.push_back( make_pair( sizeof(cl_int) , (void *)&contour->total ));
 -                args.push_back( make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
 -                args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a.data ));
++                std::vector<std::pair<size_t , const void *> > args;
++                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total ));
++                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
++                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data ));
+                 cl_int dst_step = (cl_int)dst_a.step;
 -                args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step ));
++                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step ));
+ 
+                 char builOption[128];
+                 snprintf(builOption, 128, "-D CV_8UC1");
+ 
+                 openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1, builOption);
+ 
+                 cv::Mat dst(dst_a);
+                 a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
+                 if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
+                 {
+                     for (int i = 0; i < contour->total; ++i)
+                     {
+                         a00 += dst.at<cl_long>(0, i);
+                         a10 += dst.at<cl_long>(1, i);
+                         a01 += dst.at<cl_long>(2, i);
+                         a20 += dst.at<cl_long>(3, i);
+                         a11 += dst.at<cl_long>(4, i);
+                         a02 += dst.at<cl_long>(5, i);
+                         a30 += dst.at<cl_long>(6, i);
+                         a21 += dst.at<cl_long>(7, i);
+                         a12 += dst.at<cl_long>(8, i);
+                         a03 += dst.at<cl_long>(9, i);
+                     }
+                 }
+                 else
+                 {
+                     a00 = cv::sum(dst.row(0))[0];
+                     a10 = cv::sum(dst.row(1))[0];
+                     a01 = cv::sum(dst.row(2))[0];
+                     a20 = cv::sum(dst.row(3))[0];
+                     a11 = cv::sum(dst.row(4))[0];
+                     a02 = cv::sum(dst.row(5))[0];
+                     a30 = cv::sum(dst.row(6))[0];
+                     a21 = cv::sum(dst.row(7))[0];
+                     a12 = cv::sum(dst.row(8))[0];
+                     a03 = cv::sum(dst.row(9))[0];
+                 }
+ 
+                 double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60;
+                 if( fabs(a00) > FLT_EPSILON )
+                 {
+                     if( a00 > 0 )
+                     {
+                         db1_2 = 0.5;
+                         db1_6 = 0.16666666666666666666666666666667;
+                         db1_12 = 0.083333333333333333333333333333333;
+                         db1_24 = 0.041666666666666666666666666666667;
+                         db1_20 = 0.05;
+                         db1_60 = 0.016666666666666666666666666666667;
+                     }
+                     else
+                     {
+                         db1_2 = -0.5;
+                         db1_6 = -0.16666666666666666666666666666667;
+                         db1_12 = -0.083333333333333333333333333333333;
+                         db1_24 = -0.041666666666666666666666666666667;
+                         db1_20 = -0.05;
+                         db1_60 = -0.016666666666666666666666666666667;
+                     }
+ 
+                     // spatial moments
+                     mom->m00 = a00 * db1_2;
+                     mom->m10 = a10 * db1_6;
+                     mom->m01 = a01 * db1_6;
+                     mom->m20 = a20 * db1_12;
+                     mom->m11 = a11 * db1_24;
+                     mom->m02 = a02 * db1_12;
+                     mom->m30 = a30 * db1_20;
+                     mom->m21 = a21 * db1_60;
+                     mom->m12 = a12 * db1_60;
+                     mom->m03 = a03 * db1_20;
+ 
+                     icvCompleteMomentState( mom );
+                 }
              }
          }
-         else
+ 
+         Moments ocl_moments(oclMat& src, bool binary) //for image
          {
-             for(size_t i = 0; i < reader_size; ++i)
+             CV_Assert(src.oclchannels() == 1);
+             if(src.type() == CV_64FC1 && !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
              {
-                 reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x;
-                 reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
-                 CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
+                 CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
              }
-         }
  
-         cv::ocl::oclMat dst_a(10, lpt, CV_64FC1);
-         cv::ocl::oclMat reader_oclmat(reader_mat);
-         int llength = std::min(lpt,128);
-         size_t localThreads[3]  = { llength, 1, 1};
-         size_t globalThreads[3] = { lpt, 1, 1};
-         std::vector<std::pair<size_t , const void *> > args;
-         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total ));
-         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
-         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data ));
-         cl_int dst_step = (cl_int)dst_a.step;
-         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step ));
- 
-         openCLExecuteKernel2(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1);
- 
-         cv::Mat dst(dst_a);
-         a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
-         if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-         {
-             for (int i = 0; i < contour->total; ++i)
+             if(binary)
              {
-                 a00 += dst.at<cl_long>(0, i);
-                 a10 += dst.at<cl_long>(1, i);
-                 a01 += dst.at<cl_long>(2, i);
-                 a20 += dst.at<cl_long>(3, i);
-                 a11 += dst.at<cl_long>(4, i);
-                 a02 += dst.at<cl_long>(5, i);
-                 a30 += dst.at<cl_long>(6, i);
-                 a21 += dst.at<cl_long>(7, i);
-                 a12 += dst.at<cl_long>(8, i);
-                 a03 += dst.at<cl_long>(9, i);
+                 oclMat mask;
+                 if(src.type() != CV_8UC1)
+                 {
+                     src.convertTo(mask, CV_8UC1);
+                 }
+                 oclMat src8u(src.size(), CV_8UC1);
+                 src8u.setTo(Scalar(255), mask);
+                 src = src8u;
              }
-         }
-         else
-         {
-             a00 = cv::sum(dst.row(0))[0];
-             a10 = cv::sum(dst.row(1))[0];
-             a01 = cv::sum(dst.row(2))[0];
-             a20 = cv::sum(dst.row(3))[0];
-             a11 = cv::sum(dst.row(4))[0];
-             a02 = cv::sum(dst.row(5))[0];
-             a30 = cv::sum(dst.row(6))[0];
-             a21 = cv::sum(dst.row(7))[0];
-             a12 = cv::sum(dst.row(8))[0];
-             a03 = cv::sum(dst.row(9))[0];
-         }
+             const int TILE_SIZE = 256;
  
-         double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60;
-         if( fabs(a00) > FLT_EPSILON )
-         {
-             if( a00 > 0 )
+             CvMoments mom;
+             memset(&mom, 0, sizeof(mom));
+ 
+             cv::Size size = src.size();
+             int blockx, blocky;
+             blockx = (size.width + TILE_SIZE - 1)/TILE_SIZE;
+             blocky = (size.height + TILE_SIZE - 1)/TILE_SIZE;
+ 
+             oclMat dst_m;
+             int tile_height = TILE_SIZE;
+ 
+             size_t localThreads[3]  = {1, tile_height, 1};
+             size_t globalThreads[3] = {blockx, size.height, 1};
+ 
+             if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
              {
-                 db1_2 = 0.5;
-                 db1_6 = 0.16666666666666666666666666666667;
-                 db1_12 = 0.083333333333333333333333333333333;
-                 db1_24 = 0.041666666666666666666666666666667;
-                 db1_20 = 0.05;
-                 db1_60 = 0.016666666666666666666666666666667;
+                 dst_m.create(blocky * 10, blockx, CV_64FC1);
+             }else
+             {
+                 dst_m.create(blocky * 10, blockx, CV_32FC1);
              }
+ 
+             int src_step = (int)(src.step/src.elemSize());
+             int dstm_step = (int)(dst_m.step/dst_m.elemSize());
+ 
 -            vector<pair<size_t , const void *> > args,args_sum;
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&dstm_step ));
++            std::vector<std::pair<size_t , const void *> > args,args_sum;
++            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
++            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
++            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
++            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step ));
++            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
++            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
++            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstm_step ));
+ 
+             int binary_;
+             if(binary)
+                 binary_ = 1;
              else
+                 binary_ = 0;
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&binary_));
++            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary_));
+ 
+             char builOption[128];
+             if(binary || src.type() == CV_8UC1)
+             {
+                 snprintf(builOption, 128, "-D CV_8UC1");
+             }else if(src.type() == CV_16UC1)
              {
-                 db1_2 = -0.5;
-                 db1_6 = -0.16666666666666666666666666666667;
-                 db1_12 = -0.083333333333333333333333333333333;
-                 db1_24 = -0.041666666666666666666666666666667;
-                 db1_20 = -0.05;
-                 db1_60 = -0.016666666666666666666666666666667;
+                 snprintf(builOption, 128, "-D CV_16UC1");
+             }else if(src.type() == CV_16SC1)
+             {
+                 snprintf(builOption, 128, "-D CV_16SC1");
+             }else if(src.type() == CV_32FC1)
+             {
+                 snprintf(builOption, 128, "-D CV_32FC1");
+             }else if(src.type() == CV_64FC1)
+             {
+                 snprintf(builOption, 128, "-D CV_64FC1");
+             }else
+             {
+                 CV_Error( CV_StsUnsupportedFormat, "" );
              }
  
-             // spatial moments
-             mom->m00 = a00 * db1_2;
-             mom->m10 = a10 * db1_6;
-             mom->m01 = a01 * db1_6;
-             mom->m20 = a20 * db1_12;
-             mom->m11 = a11 * db1_24;
-             mom->m02 = a02 * db1_12;
-             mom->m30 = a30 * db1_20;
-             mom->m21 = a21 * db1_60;
-             mom->m12 = a12 * db1_60;
-             mom->m03 = a03 * db1_20;
- 
-             icvCompleteMomentState( mom );
-         }
-     }
- }
- 
- static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
- {
-     const int TILE_SIZE = 256;
-     int type, depth, cn, coi = 0;
-     CvMat stub, *mat = (CvMat*)array;
-     CvContour contourHeader;
-     CvSeq* contour = 0;
-     CvSeqBlock block;
-     if( CV_IS_SEQ( array ))
-     {
-         contour = (CvSeq*)array;
-         if( !CV_IS_SEQ_POINT_SET( contour ))
-             CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
-     }
+             openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, -1, builOption);
  
-     if( !mom )
-         CV_Error( CV_StsNullPtr, "" );
+             Mat tmp(dst_m);
+             tmp.convertTo(tmp, CV_64FC1);
  
-     memset( mom, 0, sizeof(*mom));
+             double tmp_m[10] = {0};
  
-     if( !contour )
-     {
+             for(int j = 0; j < tmp.rows; j += 10)
+             {
+                 for(int i = 0; i < tmp.cols; i++)
+                 {
+                     tmp_m[0] += tmp.at<double>(j, i);
+                     tmp_m[1] += tmp.at<double>(j + 1, i);
+                     tmp_m[2] += tmp.at<double>(j + 2, i);
+                     tmp_m[3] += tmp.at<double>(j + 3, i);
+                     tmp_m[4] += tmp.at<double>(j + 4, i);
+                     tmp_m[5] += tmp.at<double>(j + 5, i);
+                     tmp_m[6] += tmp.at<double>(j + 6, i);
+                     tmp_m[7] += tmp.at<double>(j + 7, i);
+                     tmp_m[8] += tmp.at<double>(j + 8, i);
+                     tmp_m[9] += tmp.at<double>(j + 9, i);
+                 }
+             }
  
-         mat = cvGetMat( mat, &stub, &coi );
-         type = CV_MAT_TYPE( mat->type );
+             mom.m00 = tmp_m[0];
+             mom.m10 = tmp_m[1];
+             mom.m01 = tmp_m[2];
+             mom.m20 = tmp_m[3];
+             mom.m11 = tmp_m[4];
+             mom.m02 = tmp_m[5];
+             mom.m30 = tmp_m[6];
+             mom.m21 = tmp_m[7];
+             mom.m12 = tmp_m[8];
+             mom.m03 = tmp_m[9];
+             icvCompleteMomentState( &mom );
+             return mom;
+         }
  
-         if( type == CV_32SC2 || type == CV_32FC2 )
+         Moments ocl_moments(InputArray _contour) //for contour
          {
-             contour = cvPointSeqFromMat(
-                           CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
-                           mat, &contourHeader, &block );
-         }
-     }
-     if( contour )
-     {
-         icvContourMoments( contour, mom );
-         return;
-     }
+             CvMoments mom;
+             memset(&mom, 0, sizeof(mom));
  
-     type = CV_MAT_TYPE( mat->type );
-     depth = CV_MAT_DEPTH( type );
-     cn = CV_MAT_CN( type );
- 
-     cv::Size size = cvGetMatSize( mat );
-     if( cn > 1 && coi == 0 )
-         CV_Error( CV_StsBadArg, "Invalid image type" );
- 
-     if( size.width <= 0 || size.height <= 0 )
-         return;
- 
-     cv::Mat src0 = cv::cvarrToMat(mat);
-     cv::ocl::oclMat src(src0);
-     cv::Size tileSize;
-     int blockx,blocky;
-     if(size.width%TILE_SIZE == 0)
-         blockx = size.width/TILE_SIZE;
-     else
-         blockx = size.width/TILE_SIZE + 1;
-     if(size.height%TILE_SIZE == 0)
-         blocky = size.height/TILE_SIZE;
-     else
-         blocky = size.height/TILE_SIZE + 1;
-     oclMat dst_m(blocky * 10, blockx, CV_64FC1);
-     oclMat sum(1, 10, CV_64FC1);
-     int tile_width  = std::min(size.width,TILE_SIZE);
-     int tile_height = std::min(size.height,TILE_SIZE);
-     size_t localThreads[3]  = { tile_height, 1, 1};
-     size_t globalThreads[3] = { size.height, blockx, 1};
-     std::vector<std::pair<size_t , const void *> > args,args_sum;
-     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
-     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&blocky ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&depth ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cn ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&coi ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-     openCLExecuteKernel2(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
- 
-     size_t localThreadss[3]  = { 128, 1, 1};
-     size_t globalThreadss[3] = { 128, 1, 1};
-     args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-     args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
-     args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_height ));
-     args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_width ));
-     args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-     args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data ));
-     args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
-     args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
-     openCLExecuteKernel2(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
- 
-     Mat dstsum(sum);
-     mom->m00 = dstsum.at<double>(0, 0);
-     mom->m10 = dstsum.at<double>(0, 1);
-     mom->m01 = dstsum.at<double>(0, 2);
-     mom->m20 = dstsum.at<double>(0, 3);
-     mom->m11 = dstsum.at<double>(0, 4);
-     mom->m02 = dstsum.at<double>(0, 5);
-     mom->m30 = dstsum.at<double>(0, 6);
-     mom->m21 = dstsum.at<double>(0, 7);
-     mom->m12 = dstsum.at<double>(0, 8);
-     mom->m03 = dstsum.at<double>(0, 9);
- 
-     icvCompleteMomentState( mom );
- }
+             Mat arr = _contour.getMat();
+             CvMat c_array = arr;
  
+             const void* array = &c_array;
  
- Moments ocl_moments( InputArray _array, bool binaryImage )
- {
-     CvMoments om;
-     Mat arr = _array.getMat();
-     CvMat c_array = arr;
-     ocl_cvMoments(&c_array, &om, binaryImage);
-     return om;
- }
+             CvSeq* contour = 0;
+             if( CV_IS_SEQ( array ))
+             {
+                 contour = (CvSeq*)(array);
+                 if( !CV_IS_SEQ_POINT_SET( contour ))
+                     CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
+             }
  
- }
+             int type, coi = 0;
+ 
+             CvMat stub, *mat = (CvMat*)(array);
+             CvContour contourHeader;
+             CvSeqBlock block;
+ 
+             if( !contour )
+             {
+                 mat = cvGetMat( mat, &stub, &coi );
+                 type = CV_MAT_TYPE( mat->type );
+ 
+                 if( type == CV_32SC2 || type == CV_32FC2 )
+                 {
+                     contour = cvPointSeqFromMat(
+                         CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
+                         mat, &contourHeader, &block );
+                 }
+             }
+ 
+             CV_Assert(contour);
  
+             icvContourMoments(contour, &mom);
+             return mom;
+         }
+     }
 -}
 +}
diff --cc modules/ocl/src/safe_call.hpp
index 6bc73ef,f772e1b..bd409c8
--- a/modules/ocl/src/safe_call.hpp
+++ b/modules/ocl/src/safe_call.hpp
@@@ -65,8 -66,8 +65,8 @@@ namespace c
  
          static inline void ___openCLSafeCall(int err, const char *file, const int line, const char *func = "")
          {
-             if( CL_SUCCESS != err)
+             if (CL_SUCCESS != err)
 -                cv::ocl::error(getOpenCLErrorString(err), file, line, func);
 +                cv::error(Error::OpenCLApiCallError, getOpenCLErrorString(err), func, file, line);
          }
      }
  }
diff --cc modules/ocl/src/split_merge.cpp
index 990c91c,60a27a5..073a7a7
--- a/modules/ocl/src/split_merge.cpp
+++ b/modules/ocl/src/split_merge.cpp
@@@ -148,73 -149,112 +148,112 @@@ namespace c
                  mat_dst.create(size, CV_MAKETYPE(depth, total_channels));
                  merge_vector_run(mat_src, n, mat_dst);
              }
-             static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
+             static void split_vector_run(const oclMat &src, oclMat *dst)
              {
  
-                 if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F)
+                 if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
                  {
 -                    CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
 +                    CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
                      return;
                  }
  
-                 Context  *clCxt = mat_src.clCxt;
-                 int channels = mat_src.oclchannels();
-                 int depth = mat_src.depth();
+                 Context  *clCtx = src.clCxt;
+                 int channels = src.channels();
+                 int depth = src.depth();
+                 depth = (depth == CV_8S) ? CV_8U : depth;
+                 depth = (depth == CV_16S) ? CV_16U : depth;
  
 -                string kernelName = "split_vector";
 +                String kernelName = "split_vector";
  
-                 int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
-                     {4, 4, 2, 2, 1, 1, 1},
-                     {4, 4, 2, 2 , 1, 1, 1},
-                     {4, 4, 2, 2, 1, 1, 1}
-                 };
- 
-                 size_t vector_length = vector_lengths[channels - 1][mat_dst[0].depth()];
- 
-                 int max_offset_cols = 0;
-                 for(int i = 0; i < channels; i++)
-                 {
-                     int offset_cols = (mat_dst[i].offset / mat_dst[i].elemSize()) & (vector_length - 1);
-                     if(max_offset_cols < offset_cols)
-                         max_offset_cols = offset_cols;
-                 }
- 
-                 int cols =  vector_length == 1 ? divUp(mat_src.cols, vector_length)
-                             : divUp(mat_src.cols + max_offset_cols, vector_length);
- 
-                 size_t localThreads[3]  = { 64, 4, 1 };
-                 size_t globalThreads[3] = { cols, mat_src.rows, 1 };
+                 size_t VEC_SIZE = 4;
  
-                 int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize();
 -                vector<pair<size_t , const void *> > args;
 -                args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
 -                args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
 +                std::vector<std::pair<size_t , const void *> > args;
-                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data));
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.step));
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.offset));
-                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].offset));
-                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].offset));
-                 if(channels >= 3)
++                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
++                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step));
+                 int srcOffsetXBytes = src.offset % src.step;
+                 int srcOffsetY = src.offset / src.step;
+                 cl_int2 srcOffset = {{srcOffsetXBytes, srcOffsetY}};
 -                args.push_back( make_pair( sizeof(cl_int2), (void *)&srcOffset));
++                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&srcOffset));
+ 
+                 bool dst0Aligned = false, dst1Aligned = false, dst2Aligned = false, dst3Aligned = false;
+                 int alignSize = dst[0].elemSize1() * VEC_SIZE;
+                 int alignMask = alignSize - 1;
+ 
 -                args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[0].data));
 -                args.push_back( make_pair( sizeof(cl_int), (void *)&dst[0].step));
++                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[0].data));
++                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[0].step));
+                 int dst0OffsetXBytes = dst[0].offset % dst[0].step;
+                 int dst0OffsetY = dst[0].offset / dst[0].step;
+                 cl_int2 dst0Offset = {{dst0OffsetXBytes, dst0OffsetY}};
 -                args.push_back( make_pair( sizeof(cl_int2), (void *)&dst0Offset));
++                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst0Offset));
+                 if ((dst0OffsetXBytes & alignMask) == 0)
+                     dst0Aligned = true;
+ 
 -                args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[1].data));
 -                args.push_back( make_pair( sizeof(cl_int), (void *)&dst[1].step));
++                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[1].data));
++                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[1].step));
+                 int dst1OffsetXBytes = dst[1].offset % dst[1].step;
+                 int dst1OffsetY = dst[1].offset / dst[1].step;
+                 cl_int2 dst1Offset = {{dst1OffsetXBytes, dst1OffsetY}};
 -                args.push_back( make_pair( sizeof(cl_int2), (void *)&dst1Offset));
++                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst1Offset));
+                 if ((dst1OffsetXBytes & alignMask) == 0)
+                     dst1Aligned = true;
+ 
+                 // DON'T MOVE VARIABLES INTO 'IF' BODY
+                 int dst2OffsetXBytes, dst2OffsetY;
+                 cl_int2 dst2Offset;
+                 int dst3OffsetXBytes, dst3OffsetY;
+                 cl_int2 dst3Offset;
+                 if (channels >= 3)
                  {
- 
-                     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
-                     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
-                     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].offset));
 -                    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[2].data));
 -                    args.push_back( make_pair( sizeof(cl_int), (void *)&dst[2].step));
++                    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[2].data));
++                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[2].step));
+                     dst2OffsetXBytes = dst[2].offset % dst[2].step;
+                     dst2OffsetY = dst[2].offset / dst[2].step;
+                     dst2Offset.s[0] = dst2OffsetXBytes; dst2Offset.s[1] = dst2OffsetY;
 -                    args.push_back( make_pair( sizeof(cl_int2), (void *)&dst2Offset));
++                    args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst2Offset));
+                     if ((dst2OffsetXBytes & alignMask) == 0)
+                         dst2Aligned = true;
                  }
-                 if(channels >= 4)
+ 
+                 if (channels >= 4)
                  {
-                     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
-                     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
-                     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].offset));
 -                    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[3].data));
 -                    args.push_back( make_pair( sizeof(cl_int), (void *)&dst[3].step));
++                    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[3].data));
++                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[3].step));
+                     dst3OffsetXBytes = dst[3].offset % dst[3].step;
+                     dst3OffsetY = dst[3].offset / dst[3].step;
+                     dst3Offset.s[0] = dst3OffsetXBytes; dst3Offset.s[1] = dst3OffsetY;
 -                    args.push_back( make_pair( sizeof(cl_int2), (void *)&dst3Offset));
++                    args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst3Offset));
+                     if ((dst3OffsetXBytes & alignMask) == 0)
+                         dst3Aligned = true;
                  }
  
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.rows));
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1));
- 
-                 openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
+                 cl_int2 size = {{ src.cols, src.rows }};
 -                args.push_back( make_pair( sizeof(cl_int2), (void *)&size));
++                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&size));
+ 
 -                string build_options =
++                String build_options =
+                         cv::format("-D VEC_SIZE=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d",
+                                    (int)VEC_SIZE, depth, channels);
+ 
+                 if (dst0Aligned)
 -                    build_options += " -D DST0_ALIGNED";
++                    build_options = build_options + " -D DST0_ALIGNED";
+                 if (dst1Aligned)
 -                    build_options += " -D DST1_ALIGNED";
++                    build_options = build_options + " -D DST1_ALIGNED";
+                 if (dst2Aligned)
 -                    build_options += " -D DST2_ALIGNED";
++                    build_options = build_options + " -D DST2_ALIGNED";
+                 if (dst3Aligned)
 -                    build_options += " -D DST3_ALIGNED";
++                    build_options = build_options + " -D DST3_ALIGNED";
+ 
+                 const DeviceInfo& devInfo = clCtx->getDeviceInfo();
+ 
+                 // TODO Workaround for issues. Need to investigate a problem.
+                 if (channels == 2
+                         && devInfo.deviceType == CVCL_DEVICE_TYPE_CPU
+                         && devInfo.platform->platformVendor.find("Intel") != std::string::npos
+                         && (devInfo.deviceVersion.find("Build 56860") != std::string::npos
+                             || devInfo.deviceVersion.find("Build 76921") != std::string::npos))
 -                    build_options += " -D BYPASS_VSTORE=true";
++                    build_options = build_options + " -D BYPASS_VSTORE=true";
+ 
+                 size_t globalThreads[3] = { divUp(src.cols, VEC_SIZE), src.rows, 1 };
+                 openCLExecuteKernel(clCtx, &split_mat, kernelName, globalThreads, NULL, args, -1, -1, build_options.c_str());
              }
              static void split(const oclMat &mat_src, oclMat *mat_dst)
              {
@@@ -253,9 -292,9 +291,9 @@@ void cv::ocl::split(const oclMat &src, 
  {
      split_merge::split(src, dst);
  }
 -void cv::ocl::split(const oclMat &src, vector<oclMat> &dst)
 +void cv::ocl::split(const oclMat &src, std::vector<oclMat> &dst)
  {
-     dst.resize(src.oclchannels());
+     dst.resize(src.oclchannels()); // TODO Why oclchannels?
      if(src.oclchannels() > 0)
          split_merge::split(src, &dst[0]);
  }
diff --cc modules/ocl/test/test_kmeans.cpp
index 94263d8,6539c51..d583cc9
--- a/modules/ocl/test/test_kmeans.cpp
+++ b/modules/ocl/test/test_kmeans.cpp
@@@ -114,13 -113,11 +113,11 @@@ OCL_TEST_P(Kmeans, Mat)
      for(int j = 0; j < LOOP_TIMES; j++)
      {
          kmeans(src, K, labels,
 -            TermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 100, 0),
 +            TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0),
              1, flags, centers);
- 
          ocl::kmeans(d_src, K, d_labels,
 -            TermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 100, 0),
 +            TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0),
              1, flags, d_centers);
- 
          Mat dd_labels(d_labels);
          Mat dd_centers(d_centers);
          if(flags & KMEANS_USE_INITIAL_LABELS)
diff --cc modules/ocl/test/test_moments.cpp
index 7118609,788ac91..e978bb2
--- a/modules/ocl/test/test_moments.cpp
+++ b/modules/ocl/test/test_moments.cpp
@@@ -7,35 -8,35 +7,33 @@@ using namespace cv
  using namespace cv::ocl;
  using namespace cvtest;
  using namespace testing;
- PARAM_TEST_CASE(MomentsTest, MatType, bool)
 -using namespace std;
 -
+ PARAM_TEST_CASE(MomentsTest, MatType, bool, bool)
  {
      int type;
-     cv::Mat mat1;
+     cv::Mat mat;
      bool test_contours;
- 
+     bool binaryImage;
      virtual void SetUp()
      {
          type = GET_PARAM(0);
          test_contours = GET_PARAM(1);
-         cv::Size size(10*MWIDTH, 10*MHEIGHT);
-         mat1 = randomMat(size, type, 5, 16, false);
+         cv::Size size(10 * MWIDTH, 10 * MHEIGHT);
+         mat = randomMat(size, type, 0, 256, false);
+         binaryImage = GET_PARAM(2);
      }
  
 -    void Compare(Moments& cpu, Moments& gpu)
 +    void Compare(Moments& cpu_moments, Moments& gpu_moments)
      {
          Mat gpu_dst, cpu_dst;
 -        HuMoments(cpu, cpu_dst);
 -        HuMoments(gpu, gpu_dst);
 -        EXPECT_MAT_NEAR(gpu_dst,cpu_dst, 1e-3);
 +        HuMoments(cpu_moments, cpu_dst);
 +        HuMoments(gpu_moments, gpu_dst);
-         EXPECT_MAT_NEAR(gpu_dst, cpu_dst, .5);
++        EXPECT_MAT_NEAR(gpu_dst, cpu_dst, 1e-3);
      }
- 
  };
  
- 
  OCL_TEST_P(MomentsTest, Mat)
  {
-     bool binaryImage = 0;
- 
+     oclMat src_d(mat);
      for(int j = 0; j < LOOP_TIMES; j++)
      {
          if(test_contours)
@@@ -62,6 -62,5 +59,6 @@@
      }
  }
  INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MomentsTest, Combine(
-                             Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_64FC1), Values(true,false)));
+     Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1, CV_64FC1), Values(false, true), Values(false, true)));
 +
  #endif // HAVE_OPENCL
diff --cc modules/superres/perf/perf_superres_ocl.cpp
index 67bcf8c,9a8fab4..04a3f7e
--- a/modules/superres/perf/perf_superres_ocl.cpp
+++ b/modules/superres/perf/perf_superres_ocl.cpp
@@@ -42,9 -42,9 +42,9 @@@
  
  #include "perf_precomp.hpp"
  
- #ifdef HAVE_OPENCL
+ #ifdef HAVE_OPENCV_OCL
  
 -#include "opencv2/ocl/ocl.hpp"
 +#include "opencv2/ocl.hpp"
  using namespace std;
  using namespace testing;
  using namespace perf;
diff --cc samples/gpu/CMakeLists.txt
index 2591d32,732a917..64c25fc
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@@ -56,11 -48,8 +56,11 @@@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_
      if(HAVE_opencv_nonfree)
        target_link_libraries(${the_target} opencv_nonfree)
      endif()
 +    if(HAVE_opencv_cudacodec)
 +      target_link_libraries(${the_target} opencv_cudacodec)
 +    endif()
  
-     if(HAVE_OPENCL)
+     if(HAVE_opencv_ocl)
        target_link_libraries(${the_target} opencv_ocl)
      endif()