From: Roman Donchenko <roman.donchenko@itseez.com>
Date: Tue, 19 Nov 2013 12:21:09 +0000 (+0400)
Subject: Merge remote-tracking branch 'origin/2.4' into merge-2.4
X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~853^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=aacf188e837324b5e35dac9d2accaa332a83d346;p=profile%2Fivi%2Fopencv.git

Merge remote-tracking branch 'origin/2.4' into merge-2.4

Conflicts:
	modules/ocl/include/opencv2/ocl/ocl.hpp
	modules/ocl/src/arithm.cpp
	modules/ocl/src/build_warps.cpp
	modules/ocl/src/color.cpp
	modules/ocl/src/haar.cpp
	modules/ocl/src/imgproc.cpp
	modules/ocl/src/split_merge.cpp
	modules/ocl/test/test_color.cpp
	samples/cpp/3calibration.cpp
	samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
	samples/cpp/OpenEXRimages_HDR_Retina_toneMapping_video.cpp
	samples/cpp/Qt_sample/main.cpp
	samples/cpp/camshiftdemo.cpp
	samples/cpp/descriptor_extractor_matcher.cpp
	samples/cpp/distrans.cpp
	samples/cpp/generic_descriptor_match.cpp
	samples/cpp/grabcut.cpp
	samples/cpp/morphology2.cpp
	samples/cpp/segment_objects.cpp
	samples/cpp/stereo_calib.cpp
	samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp
	samples/cpp/tutorial_code/core/mat_mask_operations/mat_mask_operations.cpp
	samples/cpp/tutorial_code/introduction/display_image/display_image.cpp
	samples/cpp/tutorial_code/introduction/windows_visual_studio_Opencv/Test.cpp
	samples/cpp/tutorial_code/objectDetection/objectDetection.cpp
	samples/cpp/tutorial_code/objectDetection/objectDetection2.cpp
	samples/cpp/video_dmtx.cpp
---

aacf188e837324b5e35dac9d2accaa332a83d346
diff --cc modules/ocl/include/opencv2/ocl.hpp
index 925d612,0000000..19af00b
mode 100644,000000..100644
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@@ -1,2077 -1,0 +1,2077 @@@
 +/*M///////////////////////////////////////////////////////////////////////////////////////
 +//
 +//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 +//
 +//  By downloading, copying, installing or using the software you agree to this license.
 +//  If you do not agree to this license, do not download, install,
 +//  copy or use the software.
 +//
 +//
 +//                           License Agreement
 +//                For Open Source Computer Vision Library
 +//
 +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 +// Third party copyrights are property of their respective owners.
 +//
 +// Redistribution and use in source and binary forms, with or without modification,
 +// are permitted provided that the following conditions are met:
 +//
 +//   * Redistribution's of source code must retain the above copyright notice,
 +//     this list of conditions and the following disclaimer.
 +//
 +//   * Redistribution's in binary form must reproduce the above copyright notice,
 +//     this list of conditions and the following disclaimer in the documentation
 +//     and/or other materials provided with the distribution.
 +//
 +//   * The name of the copyright holders may not be used to endorse or promote products
 +//     derived from this software without specific prior written permission.
 +//
 +// This software is provided by the copyright holders and contributors "as is" and
 +// any express or implied warranties, including, but not limited to, the implied
 +// warranties of merchantability and fitness for a particular purpose are disclaimed.
 +// In no event shall the Intel Corporation or contributors be liable for any direct,
 +// indirect, incidental, special, exemplary, or consequential damages
 +// (including, but not limited to, procurement of substitute goods or services;
 +// loss of use, data, or profits; or business interruption) however caused
 +// and on any theory of liability, whether in contract, strict liability,
 +// or tort (including negligence or otherwise) arising in any way out of
 +// the use of this software, even if advised of the possibility of such damage.
 +//
 +//M*/
 +
 +#ifndef __OPENCV_OCL_HPP__
 +#define __OPENCV_OCL_HPP__
 +
 +#include <memory>
 +#include <vector>
 +
 +#include "opencv2/core.hpp"
 +#include "opencv2/imgproc.hpp"
 +#include "opencv2/objdetect.hpp"
 +#include "opencv2/ml.hpp"
 +
 +namespace cv
 +{
 +    namespace ocl
 +    {
 +        enum DeviceType // bit-flag values: each named device type below occupies its own single bit
 +        {
 +            CVCL_DEVICE_TYPE_DEFAULT     = (1 << 0),
 +            CVCL_DEVICE_TYPE_CPU         = (1 << 1),
 +            CVCL_DEVICE_TYPE_GPU         = (1 << 2), // default deviceType argument of getOpenCLDevices()
 +            CVCL_DEVICE_TYPE_ACCELERATOR = (1 << 3),
 +            //CVCL_DEVICE_TYPE_CUSTOM      = (1 << 4) // commented out: not an enumerator of this enum
 +            CVCL_DEVICE_TYPE_ALL         = 0xFFFFFFFF // all bits set, so it covers every flag above
 +        };
 +
 +        enum DevMemRW // type of the rw_type argument of oclMat::createEx()
 +        {
 +            DEVICE_MEM_R_W = 0, // explicitly 0; the enumerators below take 1 and 2
 +            DEVICE_MEM_R_ONLY,
 +            DEVICE_MEM_W_ONLY
 +        };
 +
 +        enum DevMemType // type of the mem_type argument of oclMat::createEx()
 +        {
 +            DEVICE_MEM_DEFAULT = 0,
 +            DEVICE_MEM_AHP,         // alloc host pointer
 +            DEVICE_MEM_UHP,         // use host pointer
 +            DEVICE_MEM_CHP,         // copy host pointer
 +            DEVICE_MEM_PM           // persistent memory
 +        };
 +
 +        // these classes contain OpenCL runtime information
 +
 +        struct PlatformInfo;
 +
 +        struct DeviceInfo // plain record of per-device OpenCL runtime information; every member is public
 +        {
 +        public: // redundant in a struct (members are public by default), kept as in the original
 +            int _id; // reserved, do not use it
 +
 +            DeviceType deviceType;
 +            std::string deviceProfile;
 +            std::string deviceVersion;
 +            std::string deviceName;
 +            std::string deviceVendor;
 +            int deviceVendorId;
 +            std::string deviceDriverVersion;
 +            std::string deviceExtensions;
 +
 +            size_t maxWorkGroupSize;
 +            std::vector<size_t> maxWorkItemSizes;
 +            int maxComputeUnits;
 +            size_t localMemorySize;
 +            size_t maxMemAllocSize;
 +
 +            int deviceVersionMajor;
 +            int deviceVersionMinor;
 +
 +            bool haveDoubleSupport;
 +            bool isUnifiedMemory; // true means integrated GPU, false otherwise
 +            bool isIntelDevice;
 +
 +            std::string compilationExtraOptions;
 +
 +            const PlatformInfo* platform; // PlatformInfo::devices in turn stores const DeviceInfo* entries
 +
 +            DeviceInfo(); // defined outside this header; initial member values are not visible here
 +        };
 +
 +        struct PlatformInfo // plain record of per-platform OpenCL runtime information
 +        {
 +            int _id; // reserved, do not use it
 +
 +            std::string platformProfile;
 +            std::string platformVersion;
 +            std::string platformName;
 +            std::string platformVendor;
 +            std::string platformExtensons; // sic: the misspelling is part of the public field name
 +
 +            int platformVersionMajor;
 +            int platformVersionMinor;
 +
 +            std::vector<const DeviceInfo*> devices; // stored as raw const pointers; ownership is not visible in this header
 +
 +            PlatformInfo(); // defined outside this header
 +        };
 +
 +        //////////////////////////////// Initialization & Info ////////////////////////
 +        typedef std::vector<const PlatformInfo*> PlatformsInfo;
 +
 +        CV_EXPORTS int getOpenCLPlatforms(PlatformsInfo& platforms);
 +
 +        typedef std::vector<const DeviceInfo*> DevicesInfo;
 +
 +        CV_EXPORTS int getOpenCLDevices(DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU,
 +                const PlatformInfo* platform = NULL);
 +
 +        // set device you want to use
 +        CV_EXPORTS void setDevice(const DeviceInfo* info);
 +
 +        enum FEATURE_TYPE // argument type of Context::supportsFeature() and cv::ocl::supportsFeature()
 +        {
 +            FEATURE_CL_DOUBLE = 1, // numbering starts at 1; the enumerators below take 2, 3 and 4
 +            FEATURE_CL_UNIFIED_MEM,
 +            FEATURE_CL_VER_1_2,
 +            FEATURE_CL_INTEL_DEVICE
 +        };
 +
 +        // Interface representing an OpenCL context; a pointer to one is obtained via Context::getContext()
 +        class CV_EXPORTS Context
 +        {
 +        protected: // constructor and destructor are protected: outside code cannot create or delete a Context directly
 +            Context() { }
 +            ~Context() { }
 +        public:
 +            static Context *getContext();
 +
 +            bool supportsFeature(FEATURE_TYPE featureType) const;
 +            const DeviceInfo& getDeviceInfo() const;
 +
 +            // The accessors below return untyped pointers.
 +            const void* getOpenCLContextPtr() const; // NOTE(review): presumably points at the cl_context handle - confirm
 +            const void* getOpenCLCommandQueuePtr() const; // NOTE(review): presumably points at the cl_command_queue handle - confirm
 +            const void* getOpenCLDeviceIDPtr() const; // NOTE(review): presumably points at the cl_device_id handle - confirm
 +        };
 +
 +        inline const void *getClContextPtr() // shorthand for Context::getContext()->getOpenCLContextPtr()
 +        {
 +            return Context::getContext()->getOpenCLContextPtr(); // the getContext() result is dereferenced without a null check
 +        }
 +
 +        inline const void *getClCommandQueuePtr() // shorthand for Context::getContext()->getOpenCLCommandQueuePtr()
 +        {
 +            return Context::getContext()->getOpenCLCommandQueuePtr(); // the getContext() result is dereferenced without a null check
 +        }
 +
 +        CV_EXPORTS bool supportsFeature(FEATURE_TYPE featureType);
 +
 +        CV_EXPORTS void finish();
 +
 +        enum BINARY_CACHE_MODE // flag values for the mode argument of setBinaryDiskCache()
 +        {
 +            CACHE_NONE    = 0,        // do not cache OpenCL binary
 +            CACHE_DEBUG   = 0x1 << 0, // cache OpenCL binary when built in debug mode
 +            CACHE_RELEASE = 0x1 << 1, // default behavior, only cache when built in release mode
 +            CACHE_ALL     = CACHE_DEBUG | CACHE_RELEASE, // cache OpenCL binary in both debug and release builds
 +        };
 +        //! Enable or disable OpenCL program binary caching onto local disk
 +        // After a program (*.cl files in opencl/ folder) is built at runtime, we allow the
 +        // compiled OpenCL program to be cached to the path automatically as "path/*.clb"
 +        // binary file, which will be reused when the OpenCV executable is started again.
 +        //
 +        // This feature is enabled by default.
 +        CV_EXPORTS void setBinaryDiskCache(int mode = CACHE_RELEASE, cv::String path = "./");
 +
 +        //! sets the path where the binary cache is saved
 +        CV_EXPORTS void setBinaryPath(const char *path);
 +
 +        struct ProgramSource // bundles three C-string pointers; the constructors store them as given, the strings are not copied
 +        {
 +            const char* name;
 +            const char* programStr;
 +            const char* programHash; // NULL when the two-argument constructor is used
 +
 +            // Cache in memory by name (should be unique). Caching on disk is disabled (programHash is set to NULL).
 +            inline ProgramSource(const char* _name, const char* _programStr)
 +                : name(_name), programStr(_programStr), programHash(NULL)
 +            {
 +            }
 +
 +            // Cache in memory by name (should be unique). Caching on disk uses the programHash mark.
 +            inline ProgramSource(const char* _name, const char* _programStr, const char* _programHash)
 +                : name(_name), programStr(_programStr), programHash(_programHash)
 +            {
 +            }
 +        };
 +
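 +        //! Calls an OpenCL kernel. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
 +        //! Deprecated, will be replaced. NOTE(review): this declaration has no cleanUp parameter, so the sentence above looks stale - confirm.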
 +        CV_EXPORTS void openCLExecuteKernelInterop(Context *clCxt,
 +                const cv::ocl::ProgramSource& source, String kernelName,
 +                size_t globalThreads[3], size_t localThreads[3],
 +                std::vector< std::pair<size_t, const void *> > &args,
 +                int channels, int depth, const char *build_options);
 +
 +        class CV_EXPORTS oclMatExpr;
 +        //////////////////////////////// oclMat ////////////////////////////////
 +        class CV_EXPORTS oclMat // matrix class whose data member points to an OCL memory object
 +        {
 +        public:
 +            //! default constructor
 +            oclMat();
 +            //! constructs oclMatrix of the specified size and type (type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
 +            oclMat(int rows, int cols, int type);
 +            oclMat(Size size, int type);
 +            //! constructs oclMatrix and fills it with the specified value s.
 +            oclMat(int rows, int cols, int type, const Scalar &s);
 +            oclMat(Size size, int type, const Scalar &s);
 +            //! copy constructor
 +            oclMat(const oclMat &m);
 +
 +            //! constructor for oclMatrix headers pointing to user-allocated data
 +            oclMat(int rows, int cols, int type, void *data, size_t step = Mat::AUTO_STEP);
 +            oclMat(Size size, int type, void *data, size_t step = Mat::AUTO_STEP);
 +
 +            //! creates a matrix header for a part of the bigger matrix
 +            oclMat(const oclMat &m, const Range &rowRange, const Range &colRange);
 +            oclMat(const oclMat &m, const Rect &roi);
 +
 +            //! builds oclMat from Mat. Performs a blocking upload to the device.
 +            explicit oclMat (const Mat &m);
 +
 +            //! destructor - calls release()
 +            ~oclMat();
 +
 +            //! assignment operators
 +            oclMat &operator = (const oclMat &m);
 +            //! assignment operator. Performs a blocking upload to the device.
 +            oclMat &operator = (const Mat &m);
 +            oclMat &operator = (const oclMatExpr& expr);
 +
 +            //! performs a blocking upload of data to oclMat.
 +            void upload(const cv::Mat &m);
 +
 +
 +            //! downloads data from device to host memory. Blocking calls.
 +            operator Mat() const;
 +            void download(cv::Mat &m) const;
 +
 +            //! convert to _InputArray
 +            operator _InputArray();
 +
 +            //! convert to _OutputArray
 +            operator _OutputArray();
 +
 +            //! returns a new oclMatrix header for the specified row
 +            oclMat row(int y) const;
 +            //! returns a new oclMatrix header for the specified column
 +            oclMat col(int x) const;
 +            //! ... for the specified row span
 +            oclMat rowRange(int startrow, int endrow) const;
 +            oclMat rowRange(const Range &r) const;
 +            //! ... for the specified column span
 +            oclMat colRange(int startcol, int endcol) const;
 +            oclMat colRange(const Range &r) const;
 +
 +            //! returns deep copy of the oclMatrix, i.e. the data is copied
 +            oclMat clone() const;
 +
 +            //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
 +            // It calls m.create(this->size(), this->type()).
 +            // It supports any data type
 +            void copyTo( oclMat &m, const oclMat &mask = oclMat()) const;
 +
 +            //! converts oclMatrix to another datatype with optional scaling. See cvConvertScale.
 +            void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
 +
 +            void assignTo( oclMat &m, int type = -1 ) const;
 +
 +            //! sets every oclMatrix element to s
 +            oclMat& operator = (const Scalar &s);
 +            //! sets some of the oclMatrix elements to s, according to the mask
 +            oclMat& setTo(const Scalar &s, const oclMat &mask = oclMat());
 +            //! creates alternative oclMatrix header for the same data, with different
 +            // number of channels and/or different number of rows. see cvReshape.
 +            oclMat reshape(int cn, int rows = 0) const;
 +
 +            //! allocates new oclMatrix data unless the oclMatrix already has specified size and type.
 +            // previous data is unreferenced if needed.
 +            void create(int rows, int cols, int type);
 +            void create(Size size, int type);
 +
 +            //! allocates new oclMatrix with specified device memory type.
 +            void createEx(int rows, int cols, int type,
 +                          DevMemRW rw_type, DevMemType mem_type);
 +            void createEx(Size size, int type, DevMemRW rw_type,
 +                          DevMemType mem_type);
 +
 +            //! decreases reference counter;
 +            // deallocate the data when reference counter reaches 0.
 +            void release();
 +
 +            //! swaps with another oclMat
 +            void swap(oclMat &mat);
 +
 +            //! locates oclMatrix header within a parent oclMatrix. See below
 +            void locateROI( Size &wholeSize, Point &ofs ) const;
 +            //! moves/resizes the current oclMatrix ROI inside the parent oclMatrix.
 +            oclMat& adjustROI( int dtop, int dbottom, int dleft, int dright );
 +            //! extracts a rectangular sub-oclMatrix
 +            // (this is a generalized form of row, rowRange etc.)
 +            oclMat operator()( Range rowRange, Range colRange ) const;
 +            oclMat operator()( const Rect &roi ) const;
 +
 +            oclMat& operator+=( const oclMat& m );
 +            oclMat& operator-=( const oclMat& m );
 +            oclMat& operator*=( const oclMat& m );
 +            oclMat& operator/=( const oclMat& m );
 +
 +            //! returns true if the oclMatrix data is continuous
 +            // (i.e. when there are no gaps between successive rows).
 +            // similar to CV_IS_MAT_CONT(cvMat->type)
 +            bool isContinuous() const;
 +            //! returns element size in bytes,
 +            // similar to CV_ELEM_SIZE(cvMat->type)
 +            size_t elemSize() const;
 +            //! returns the size of element channel in bytes.
 +            size_t elemSize1() const;
 +            //! returns element type, similar to CV_MAT_TYPE(cvMat->type)
 +            int type() const;
 +            //! returns the element type as used in ocl, e.g. 8UC3 returns 8UC4 because in ocl
 +            //! a 3-channel element actually uses 4-channel space
 +            int ocltype() const;
 +            //! returns element depth, similar to CV_MAT_DEPTH(cvMat->type)
 +            int depth() const;
 +            //! returns number of channels, similar to CV_MAT_CN(cvMat->type)
 +            int channels() const;
 +            //! returns number of channels as used in ocl: returns 4 for a 3-channel element,
 +            //! because a 3-channel element actually uses 4-channel space
 +            int oclchannels() const;
 +            //! returns step/elemSize1()
 +            size_t step1() const;
 +            //! returns oclMatrix size:
 +            // width == number of columns, height == number of rows
 +            Size size() const;
 +            //! returns true if oclMatrix data is NULL
 +            bool empty() const;
 +
 +            //! returns pointer to y-th row
 +            uchar* ptr(int y = 0);
 +            const uchar *ptr(int y = 0) const;
 +
 +            //! template version of the above method
 +            template<typename _Tp> _Tp *ptr(int y = 0);
 +            template<typename _Tp> const _Tp *ptr(int y = 0) const;
 +
 +            //! matrix transposition
 +            oclMat t() const;
 +
 +            /*! includes several bit-fields:
 +              - the magic signature
 +              - continuity flag
 +              - depth
 +              - number of channels
 +              */
 +            int flags;
 +            //! the number of rows and columns
 +            int rows, cols;
 +            //! a distance between successive rows in bytes; includes the gap if any
 +            size_t step;
 +            //! pointer to the data (OCL memory object)
 +            uchar *data;
 +
 +            //! pointer to the reference counter;
 +            // when oclMatrix points to user-allocated data, the pointer is NULL
 +            int *refcount;
 +
 +            //! helper fields used in locateROI and adjustROI
 +            // datastart and dataend are not used in the current version
 +            uchar *datastart;
 +            uchar *dataend;
 +
 +            //! OpenCL context associated with the oclMat object.
 +            Context *clCxt; // TODO clCtx
 +            // offset used to handle the ROI, calculated in bytes
 +            int offset;
 +            // wholerows and wholecols describe the whole matrix; datastart and dataend are no longer used
 +            int wholerows;
 +            int wholecols;
 +        };
 +
 +        // convert InputArray/OutputArray to oclMat references
 +        CV_EXPORTS oclMat& getOclMatRef(InputArray src);
 +        CV_EXPORTS oclMat& getOclMatRef(OutputArray src);
 +
 +        ///////////////////// mat split and merge /////////////////////////////////
 +        //! Compose a multi-channel array from several single-channel arrays
 +        // Support all types
 +        CV_EXPORTS void merge(const oclMat *src, size_t n, oclMat &dst);
 +        CV_EXPORTS void merge(const std::vector<oclMat> &src, oclMat &dst);
 +
 +        //! Divides multi-channel array into several single-channel arrays
 +        // Support all types
 +        CV_EXPORTS void split(const oclMat &src, oclMat *dst);
 +        CV_EXPORTS void split(const oclMat &src, std::vector<oclMat> &dst);
 +
 +        ////////////////////////////// Arithmetics ///////////////////////////////////
 +
 +        //! adds one matrix to another with scale (dst = src1 * alpha + src2 * beta + gama)
 +        // supports all data types
 +        CV_EXPORTS void addWeighted(const oclMat &src1, double  alpha, const oclMat &src2, double beta, double gama, oclMat &dst);
 +
 +        //! adds one matrix to another (dst = src1 + src2)
 +        // supports all data types
 +        CV_EXPORTS void add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        //! adds scalar to a matrix (dst = src1 + s)
 +        // supports all data types
 +        CV_EXPORTS void add(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! subtracts one matrix from another (dst = src1 - src2)
 +        // supports all data types
 +        CV_EXPORTS void subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        //! subtracts scalar from a matrix (dst = src1 - s)
 +        // supports all data types
 +        CV_EXPORTS void subtract(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! computes element-wise product of the two arrays (dst = src1 * scale * src2)
 +        // supports all data types
 +        CV_EXPORTS void multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
 +        //! multiplies a matrix by a number (dst = scalar * src)
 +        // supports all data types
 +        CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
 +
 +        //! computes element-wise quotient of the two arrays (dst = src1 * scale / src2)
 +        // supports all data types
 +        CV_EXPORTS void divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
 +        //! computes element-wise quotient of a scalar and an array (dst = scale / src1)
 +        // supports all data types
 +        CV_EXPORTS void divide(double scale, const oclMat &src1, oclMat &dst);
 +
 +        //! computes element-wise minimum of the two arrays (dst = min(src1, src2))
 +        // supports all data types
 +        CV_EXPORTS void min(const oclMat &src1, const oclMat &src2, oclMat &dst);
 +
 +        //! computes element-wise maximum of the two arrays (dst = max(src1, src2))
 +        // supports all data types
 +        CV_EXPORTS void max(const oclMat &src1, const oclMat &src2, oclMat &dst);
 +
 +        //! compares elements of two arrays (dst = src1 <cmpop> src2)
 +        // supports all data types
 +        CV_EXPORTS void compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop);
 +
 +        //! transposes the matrix
 +        // supports all data types
 +        CV_EXPORTS void transpose(const oclMat &src, oclMat &dst);
 +
 +        //! computes element-wise absolute values of an array (dst = abs(src))
 +        // supports all data types
 +        CV_EXPORTS void abs(const oclMat &src, oclMat &dst);
 +
 +        //! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
 +        // supports all data types
 +        CV_EXPORTS void absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst);
 +        //! computes element-wise absolute difference of array and scalar (dst = abs(src1 - s))
 +        // supports all data types
 +        CV_EXPORTS void absdiff(const oclMat &src1, const Scalar &s, oclMat &dst);
 +
 +        //! computes mean value and standard deviation of all array elements (this overload takes no mask)
 +        // supports all data types
 +        CV_EXPORTS void meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev);
 +
 +        //! computes norm of array
 +        // supports NORM_INF, NORM_L1, NORM_L2
 +        // supports all data types
 +        CV_EXPORTS double norm(const oclMat &src1, int normType = NORM_L2);
 +
 +        //! computes norm of the difference between two arrays
 +        // supports NORM_INF, NORM_L1, NORM_L2
 +        // supports all data types
 +        CV_EXPORTS double norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2);
 +
 +        //! reverses the order of the rows, columns or both in a matrix
 +        // supports all types
 +        CV_EXPORTS void flip(const oclMat &src, oclMat &dst, int flipCode);
 +
 +        //! computes sum of array elements
 +        // support all types
 +        CV_EXPORTS Scalar sum(const oclMat &m);
 +        CV_EXPORTS Scalar absSum(const oclMat &m);
 +        CV_EXPORTS Scalar sqrSum(const oclMat &m);
 +
 +        //! finds global minimum and maximum array elements and returns their values
 +        // support all C1 types
 +        CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
 +
 +        //! finds global minimum and maximum array elements and returns their values with locations
 +        // support all C1 types
 +        CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,
 +                                  const oclMat &mask = oclMat());
 +
 +        //! counts non-zero array elements
 +        // support all types
 +        CV_EXPORTS int countNonZero(const oclMat &src);
 +
 +        //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
 +        // destination array will have the depth type as lut and the same channels number as source
 +        // It supports 8UC1 and 8UC4 only
 +        CV_EXPORTS void LUT(const oclMat &src, const oclMat &lut, oclMat &dst);
 +
 +        //! only 8UC1 and 256 bins are supported now
 +        CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist);
 +        //! only 8UC1 and 256 bins are supported now
 +        CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst);
 +
 +        //! only 8UC1 is supported now
 +        CV_EXPORTS Ptr<cv::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
 +
 +        //! bilateralFilter
 +        // supports 8UC1 8UC4
 +        CV_EXPORTS void bilateralFilter(const oclMat& src, oclMat& dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT);
 +
 +        //! Applies an adaptive bilateral filter to the input image
 +        //  Unlike the usual bilateral filter that uses a fixed value for sigmaColor,
 +        //  the adaptive version calculates the local variance in the ksize neighborhood
 +        //  and uses this as sigmaColor for the value filtering. However, the local standard deviation is
 +        //  clamped to maxSigmaColor.
 +        //  supports 8UC1, 8UC3
 +        CV_EXPORTS void adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, double maxSigmaColor=20.0, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT);
 +
 +        //! computes exponent of each matrix element (dst = e**src)
 +        // supports only CV_32FC1, CV_64FC1 type
 +        CV_EXPORTS void exp(const oclMat &src, oclMat &dst);
 +
 +        //! computes natural logarithm of absolute value of each matrix element: dst = log(abs(src))
 +        // supports only CV_32FC1, CV_64FC1 type
 +        CV_EXPORTS void log(const oclMat &src, oclMat &dst);
 +
 +        //! computes magnitude of each (x(i), y(i)) vector
 +        // supports only CV_32F, CV_64F type
 +        CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
 +
 +        //! computes angle (angle(i)) of each (x(i), y(i)) vector
 +        // supports only CV_32F, CV_64F type
 +        CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);
 +
 +        //! the function raises every element of the input array to the power p
 +        // supports only CV_32F, CV_64F type
 +        CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);
 +
 +        //! converts Cartesian coordinates to polar
 +        // supports only CV_32F CV_64F type
 +        CV_EXPORTS void cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false);
 +
 +        //! converts polar coordinates to Cartesian
 +        // supports only CV_32F CV_64F type
 +        CV_EXPORTS void polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false);
 +
 +        //! performs per-element bit-wise inversion
 +        // supports all types
 +        CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
 +
 +        //! calculates per-element bit-wise disjunction of two arrays
 +        // supports all types
 +        CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! calculates per-element bit-wise conjunction of two arrays
 +        // supports all types
 +        CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! calculates per-element bit-wise "exclusive or" operation
 +        // supports all types
 +        CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
 +        CV_EXPORTS void bitwise_xor(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
 +
 +        //! Logical operators
 +        CV_EXPORTS oclMat operator ~ (const oclMat &);
 +        CV_EXPORTS oclMat operator | (const oclMat &, const oclMat &);
 +        CV_EXPORTS oclMat operator & (const oclMat &, const oclMat &);
 +        CV_EXPORTS oclMat operator ^ (const oclMat &, const oclMat &);
 +
 +
 +        //! Mathematics operators
 +        CV_EXPORTS oclMatExpr operator + (const oclMat &src1, const oclMat &src2);
 +        CV_EXPORTS oclMatExpr operator - (const oclMat &src1, const oclMat &src2);
 +        CV_EXPORTS oclMatExpr operator * (const oclMat &src1, const oclMat &src2);
 +        CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);
 +
 +        struct CV_EXPORTS ConvolveBuf // buffer object accepted by the convolve() overload that takes a ConvolveBuf&
 +        {
 +            Size result_size;
 +            Size block_size;
 +            Size user_block_size;
 +            Size dft_size;
 +
 +            oclMat image_spect, templ_spect, result_spect;
 +            oclMat image_block, templ_block, result_data;
 +
 +            void create(Size image_size, Size templ_size); // defined outside this header
 +            static Size estimateBlockSize(Size result_size, Size templ_size); // static: callable without a ConvolveBuf instance
 +        };
 +
 +        //! computes the convolution of two images; may use the discrete Fourier transform
 +        // supports only the CV_32FC1 type
 +        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false);
 +        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf); // overload with a caller-supplied ConvolveBuf; ccorr has no default here
 +
 +        //! Performs a per-element multiplication of two Fourier spectrums.
 +        //! Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now.
 +        //! In other words, only the CV_32FC2 type is supported.
 +        CV_EXPORTS void mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int flags, float scale, bool conjB = false);
 +
 +        CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code, int dcn = 0);
 +
 +        //! initializes a scaled identity matrix (the scale defaults to Scalar(1))
 +        CV_EXPORTS void setIdentity(oclMat& src, const Scalar & val = Scalar(1));
 +
 +        //! fills the output array with repeated copies of the input array
 +        CV_EXPORTS void repeat(const oclMat & src, int ny, int nx, oclMat & dst);
 +
 +        //////////////////////////////// Filter Engine ////////////////////////////////
 +
 +        /*!
 +          The Base Class for 1D or Row-wise Filters
 +
 +          This is the base class for linear or non-linear filters that process 1D data.
 +          In particular, such filters are used for the "horizontal" filtering parts in separable filters.
 +          */
 +        class CV_EXPORTS BaseRowFilter_GPU
 +        {
 +        public:
 +            BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {} // copies the three arguments into the public fields
 +            virtual ~BaseRowFilter_GPU() {} // virtual so derived filters can be destroyed through a base pointer
 +            virtual void operator()(const oclMat &src, oclMat &dst) = 0; // pure virtual: concrete filters implement the src -> dst pass
 +            int ksize, anchor, bordertype; // public and unchanged after construction unless a caller modifies them
 +        };
 +
 +        /*!
 +          The Base Class for Column-wise Filters
 +
 +          This is the base class for linear or non-linear filters that process columns of 2D arrays.
 +          Such filters are used for the "vertical" filtering parts in separable filters.
 +          */
 +        class CV_EXPORTS BaseColumnFilter_GPU
 +        {
 +        public:
 +            BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {} // copies the three arguments into the public fields
 +            virtual ~BaseColumnFilter_GPU() {} // virtual so derived filters can be destroyed through a base pointer
 +            virtual void operator()(const oclMat &src, oclMat &dst) = 0; // pure virtual: concrete filters implement the src -> dst pass
 +            int ksize, anchor, bordertype; // public and unchanged after construction unless a caller modifies them
 +        };
 +
 +        /*!
 +          The Base Class for Non-Separable 2D Filters.
 +
 +          This is the base class for linear or non-linear 2D filters.
 +          */
 +        class CV_EXPORTS BaseFilter_GPU
 +        {
 +        public:
 +            BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_) // borderType_ arrives by const reference but is stored by value
 +                : ksize(ksize_), anchor(anchor_), borderType(borderType_) {}
 +            virtual ~BaseFilter_GPU() {} // virtual so derived filters can be destroyed through a base pointer
 +            virtual void operator()(const oclMat &src, oclMat &dst) = 0; // pure virtual: concrete filters implement the src -> dst pass
 +            Size ksize; // 2D kernel size (a Size here, unlike the int ksize of the row/column bases)
 +            Point anchor;
 +            int borderType; // spelled borderType here; the row/column bases spell it bordertype
 +        };
 +
 +        /*!
 +          The Base Class for Filter Engine.
 +
 +          The class can be used to apply an arbitrary filtering operation to an image.
 +          It contains all the necessary intermediate buffers.
 +          */
 +        class CV_EXPORTS FilterEngine_GPU
 +        {
 +        public:
 +            virtual ~FilterEngine_GPU() {} // virtual so engines can be destroyed through a base pointer
 +            // NOTE(review): the default roi Rect(0, 0, -1, -1) presumably means "whole image" - confirm in the implementations
 +            virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0;
 +        };
 +
 +        //! returns the non-separable filter engine with the specified filter
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D); // NOTE(review): Ptr is taken by const value here, but by const reference in createSeparableFilter_GPU
 +
 +        //! returns the primitive row filter with the specified kernel
 +        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel,
 +                int anchor = -1, int bordertype = BORDER_DEFAULT);
 +
 +        //! returns the primitive column filter with the specified kernel
 +        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel,
 +                int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0);
 +
 +        //! returns the separable linear filter engine
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
 +                const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
 +
 +        //! returns the separable filter engine with the specified filters
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
 +                const Ptr<BaseColumnFilter_GPU> &columnFilter);
 +
 +        //! returns the Gaussian filter engine
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
 +
 +        //! returns the filter engine for the generalized Sobel operator
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
 +
 +        //! applies the Laplacian operator to the image
 +        // supports only ksize = 1 and ksize = 3
 +        CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1,
 +                double delta=0, int borderType=BORDER_DEFAULT);
 +
 +        //! returns 2D box filter
 +        // dst type must be the same as source type
 +        CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType,
 +                const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! returns box filter engine
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size &ksize,
 +                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! returns 2D filter with the specified kernel
 +        // restriction: dst type must be the same as source type
 +        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
 +                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! returns the non-separable linear filter engine
 +        // restriction: dst type must be the same as source type
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel,
 +                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! smooths the image using the normalized box filter
 +        CV_EXPORTS void boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize,
 +                                  Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 +
 +        //! returns 2D morphological filter
 +        //! only MORPH_ERODE and MORPH_DILATE are supported
 +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
 +        // kernel must have CV_8UC1 type, a single row, and cols == ksize.width * ksize.height
 +        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize,
 +                Point anchor = Point(-1, -1));
 +
 +        //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
 +        CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat &kernel,
 +                const Point &anchor = Point(-1, -1), int iterations = 1);
 +
 +        //! a synonym for the normalized box filter: forwards to boxFilter with ddepth = -1
 +        static inline void blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1),
 +                                int borderType = BORDER_CONSTANT) // NOTE(review): boxFilter itself defaults to BORDER_DEFAULT - confirm the different default is intended
 +        {
 +            boxFilter(src, dst, -1, ksize, anchor, borderType);
 +        }
 +
 +        //! applies a non-separable 2D linear filter to the image
 +        CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel,
 +                                 Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT);
 +
 +        //! applies a separable 2D linear filter to the image
 +        CV_EXPORTS void sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY,
 +                                    Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
 +
 +        //! applies generalized Sobel operator to the image
 +        // dst.type must equal src.type
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101
 +        CV_EXPORTS void Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
 +
 +        //! applies the vertical or horizontal Scharr operator to the image
 +        // dst.type must equal src.type
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101
 +        CV_EXPORTS void Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
 +
 +        //! smooths the image using Gaussian filter.
 +        // dst.type must equal src.type
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101
 +        CV_EXPORTS void GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
 +
 +        //! erodes the image (applies the local minimum operator)
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        CV_EXPORTS void erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
 +
 +                               int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
 +
 +
 +        //! dilates the image (applies the local maximum operator)
 +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
 +        CV_EXPORTS void dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
 +
 +                                int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
 +
 +
 +        //! applies an advanced morphological operation to the image
 +        CV_EXPORTS void morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
 +
 +                                      int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
 +
 +
 +        ////////////////////////////// Image processing //////////////////////////////
 +        //! Does mean shift filtering on GPU.
 +        CV_EXPORTS void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr,
 +                                           TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
 +
 +        //! Does mean shift procedure on GPU.
 +        CV_EXPORTS void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr,
 +                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
 +
 +        //! Does mean shift segmentation with elimination of small regions. Note that dst is a host Mat, not an oclMat.
 +        CV_EXPORTS void meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize,
 +                                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
 +
 +        //! applies fixed threshold to the image.
 +        // supports CV_8UC1 and CV_32FC1 data type
 +        // supports threshold type: THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV
 +        CV_EXPORTS double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type = THRESH_TRUNC);
 +
 +        //! resizes the image
 +        // supports INTER_NEAREST, INTER_LINEAR
 +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
 +        CV_EXPORTS void resize(const oclMat &src, oclMat &dst, Size dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
 +
 +        //! Applies a generic geometrical transformation to an image.
 +
 +        // Supports INTER_NEAREST, INTER_LINEAR.
 +        // Map1 supports CV_16SC2, CV_32FC2 types.
 +        // Src supports CV_8UC1, CV_8UC2, CV_8UC4.
 +        CV_EXPORTS void remap(const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int bordertype, const Scalar &value = Scalar());
 +
 +        //! copies 2D array to a larger destination array and pads borders with a user-specified constant
 +        // supports CV_8UC1, CV_8UC4, CV_32SC1 types
 +        CV_EXPORTS void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar()); // NOTE(review): 'boardtype' is presumably a misspelling of 'bordertype'
 +
 +        //! Smooths the image using a median filter
 +        // The source must be a 1- or 4-channel image. m should be 3 or 5, the image depth should be CV_8U or CV_32F.
 +        CV_EXPORTS void medianFilter(const oclMat &src, oclMat &dst, int m);
 +
 +        //! warps the image using affine transformation
 +        // supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
 +        CV_EXPORTS void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
 +
 +        //! warps the image using perspective transformation
 +        // supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
 +        CV_EXPORTS void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
 +
 +        //! computes the integral image and the integral of the squared image
-         // sum will have CV_32S type, sqsum - CV32F type
++        // sum supports CV_32S and CV_32F; sqsum supports CV_32F and CV_64F
 +        // supports only CV_8UC1 source type
-         CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum);
-         CV_EXPORTS void integral(const oclMat &src, oclMat &sum);
++        CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum, int sdepth=-1 );
++        CV_EXPORTS void integral(const oclMat &src, oclMat &sum, int sdepth=-1 );
 +        CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
 +        CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy, // the _dxdy variants additionally take Dx and Dy as non-const oclMat references
 +            int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
 +        CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
 +        CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
 +            int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
 +
 +
 +        /////////////////////////////////// ML ///////////////////////////////////////////
 +
 +        //! Computes the closest center for each line in source and labels it with that center's index
 +        // supports CV_32FC1/CV_32FC2/CV_32FC4 data type
 +        // supports NORM_L1 and NORM_L2 distType
 +        // if indices is provided, only the indexed rows will be calculated and their results are in the same order as indices
 +        // NOTE(review): this signature has no 'indices' parameter and its default distType is NORM_L2SQR, not NORM_L1/NORM_L2 - confirm
 +        CV_EXPORTS void distanceToCenters(const oclMat &src, const oclMat &centers, Mat &dists, Mat &labels, int distType = NORM_L2SQR);
 +
 +        //! Does k-means procedure on GPU
 +        // supports CV_32FC1/CV_32FC2/CV_32FC4 data type
 +        CV_EXPORTS double kmeans(const oclMat &src, int K, oclMat &bestLabels,
 +                                     TermCriteria criteria, int attemps, int flags, oclMat &centers);
 +
 +
 +        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 +        ///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
 +        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 +        class CV_EXPORTS OclCascadeClassifier : public  cv::CascadeClassifier // adds an oclMat-based detectMultiScale on top of cv::CascadeClassifier
 +        {
 +        public:
 +            void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces, // image is a non-const reference; results are written to faces
 +                double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
 +                Size minSize = Size(), Size maxSize = Size());
 +        };
 +
 +        /////////////////////////////// Pyramid /////////////////////////////////////
 +        CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
 +
 +        //! upsamples the source image and then smooths it
 +        CV_EXPORTS void pyrUp(const oclMat &src, oclMat &dst);
 +
 +        //! performs linear blending of two images
 +        //! to avoid accuracy errors, the sum of weights shouldn't be very close to zero
 +        // supports only CV_8UC1 source type
 +        CV_EXPORTS void blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2, oclMat &result);
 +
 +        //! computes vertical sum, supports only CV_32FC1 images
 +        CV_EXPORTS void columnSum(const oclMat &src, oclMat &sum);
 +
 +        ///////////////////////////////////////// match_template /////////////////////////////////////////////////////////////
 +        struct CV_EXPORTS MatchTemplateBuf // working state accepted by the matchTemplate() overload that takes a MatchTemplateBuf&
 +        {
 +            Size user_block_size;
 +            oclMat imagef, templf; // NOTE(review): the 'f' suffix presumably means float-converted copies - confirm
 +            std::vector<oclMat> images;
 +            std::vector<oclMat> image_sums;
 +            std::vector<oclMat> image_sqsums;
 +        };
 +
 +        //! computes the proximity map for the raster template and the image where the template is searched for
 +        // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
 +        // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
 +        CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method);
 +
 +        //! same proximity-map computation, with a caller-supplied MatchTemplateBuf as the last argument
 +        // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
 +        // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
 +        CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf);
 +
 +
 +
 +        ///////////////////////////////////////////// Canny /////////////////////////////////////////////
 +        struct CV_EXPORTS CannyBuf; // forward declaration so the overloads below can take a CannyBuf&
 +
 +        //! computes edges of the input image using the Canny operator
 +        // Supports CV_8UC1 only
 +        CV_EXPORTS void Canny(const oclMat &image, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
 +        CV_EXPORTS void Canny(const oclMat &image, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
 +        CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false); // dx/dy overloads take derivatives instead of an image and have no apperture_size
 +        CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
 +
 +        struct CV_EXPORTS CannyBuf // working state accepted by the Canny() overloads that take a CannyBuf&
 +        {
 +            CannyBuf() : counter(1, 1, CV_32S) { } // only counter is sized here (1x1, CV_32S); create() is not called
 +            ~CannyBuf()
 +            {
 +                release(); // the destructor always calls release()
 +            }
 +            explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(1, 1, CV_32S)
 +            {
 +                create(image_size, apperture_size); // sizes counter, then delegates to create()
 +            }
 +            CannyBuf(const oclMat &dx_, const oclMat &dy_); // defined out of line
 +            void create(const Size &image_size, int apperture_size = 3);
 +            void release();
 +
 +            oclMat dx, dy;
 +            oclMat dx_buf, dy_buf;
 +            oclMat magBuf, mapBuf;
 +            oclMat trackBuf1, trackBuf2;
 +            oclMat counter; // constructed as 1x1 CV_32S by both inline constructors above
 +            Ptr<FilterEngine_GPU> filterDX, filterDY;
 +        };
 +
 +        ///////////////////////////////////////// Hough Transform /////////////////////////////////////////
 +        //! HoughCircles working buffers; NOTE(review): declared without CV_EXPORTS, unlike the other *Buf structs in this header - confirm intended
 +        struct HoughCirclesBuf
 +        {
 +            oclMat edges;
 +            oclMat accum;
 +            oclMat srcPoints;
 +            oclMat centers;
 +            CannyBuf cannyBuf; // embedded by value, so its constructor and destructor run with this struct
 +        };
 +
 +        CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
 +        CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096); // overload with a caller-supplied HoughCirclesBuf
 +        CV_EXPORTS void HoughCirclesDownload(const oclMat& d_circles, OutputArray h_circles); // NOTE(review): presumably copies device results (d_) to the host array (h_) - confirm
 +
 +
 +        ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
 +        //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of a floating point matrix.
 +        //! Param dft_size is the size of DFT transform.
 +        //!
 +        //! For complex-to-real transform it is assumed that the source matrix is packed in CLFFT's format.
 +        // supports src type of CV_32FC1, CV_32FC2
 +        // supports flags: DFT_INVERSE, DFT_REAL_OUTPUT, DFT_COMPLEX_OUTPUT, DFT_ROWS
 +        // dft_size is the size of original input, which is used for transformation from complex to real.
 +        // dft_size must be powers of 2, 3 and 5
 +        // real to complex dft requires at least v1.8 clAmdFft
 +        // real to complex dft output is not the same as the CPU version's
 +        // real to complex and complex to real do not support DFT_ROWS
 +        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(), int flags = 0);
 +
 +        //! implements generalized matrix product algorithm GEMM from BLAS
 +        // The functionality requires clAmdBlas library
 +        // supports only the CV_32FC1 type
 +        // flag GEMM_3_T is not supported
 +        CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha,
 +                             const oclMat &src3, double beta, oclMat &dst, int flags = 0);
 +
 +        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
 +
 +        struct CV_EXPORTS HOGDescriptor
 +
 +        {
 +            // The first two enums supply the constructor's default win_sigma and nlevels values.
 +            enum { DEFAULT_WIN_SIGMA = -1 };
 +
 +            enum { DEFAULT_NLEVELS = 64 };
 +            // Layout selectors passed to getDescriptors() through descr_format.
 +            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
 +
 +
 +            // Every parameter has a default, so this also serves as the default constructor.
 +            HOGDescriptor(Size win_size = Size(64, 128), Size block_size = Size(16, 16),
 +
 +                          Size block_stride = Size(8, 8), Size cell_size = Size(8, 8),
 +
 +                          int nbins = 9, double win_sigma = DEFAULT_WIN_SIGMA,
 +
 +                          double threshold_L2hys = 0.2, bool gamma_correction = true,
 +
 +                          int nlevels = DEFAULT_NLEVELS);
 +
 +
 +
 +            size_t getDescriptorSize() const;
 +
 +            size_t getBlockHistogramSize() const;
 +
 +
 +
 +            void setSVMDetector(const std::vector<float> &detector);
 +
 +
 +            // Static factories returning detector coefficients as std::vector<float>, the type setSVMDetector() accepts.
 +            static std::vector<float> getDefaultPeopleDetector();
 +
 +            static std::vector<float> getPeopleDetector48x96();
 +
 +            static std::vector<float> getPeopleDetector64x128();
 +
 +
 +            // Reports found locations as Points.
 +            void detect(const oclMat &img, std::vector<Point> &found_locations,
 +
 +                        double hit_threshold = 0, Size win_stride = Size(),
 +
 +                        Size padding = Size());
 +
 +
 +            // Reports found locations as Rects; adds scale0 and group_threshold to detect()'s parameters.
 +            void detectMultiScale(const oclMat &img, std::vector<Rect> &found_locations,
 +
 +                                  double hit_threshold = 0, Size win_stride = Size(),
 +
 +                                  Size padding = Size(), double scale0 = 1.05,
 +
 +                                  int group_threshold = 2);
 +
 +
 +
 +            void getDescriptors(const oclMat &img, Size win_stride,
 +
 +                                oclMat &descriptors,
 +
 +                                int descr_format = DESCR_FORMAT_COL_BY_COL);
 +
 +
 +            // Public configuration fields; their names mirror the constructor parameters.
 +            Size win_size;
 +
 +            Size block_size;
 +
 +            Size block_stride;
 +
 +            Size cell_size;
 +
 +            int nbins;
 +
 +            double win_sigma;
 +
 +            double threshold_L2hys;
 +
 +            bool gamma_correction;
 +
 +            int nlevels;
 +
 +
 +
 +        protected:
 +
 +            // initialize buffers; only needs to be done once in case of multiscale detection
 +
 +            void init_buffer(const oclMat &img, Size win_stride);
 +
 +
 +
 +            void computeBlockHistograms(const oclMat &img);
 +
 +            void computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle);
 +
 +
 +
 +            double getWinSigma() const;
 +
 +            bool checkDetectorSize() const;
 +
 +
 +            // Two overloads: scalar (int) and per-dimension (Size).
 +            static int numPartsWithin(int size, int part_size, int stride);
 +
 +            static Size numPartsWithin(Size size, Size part_size, Size stride);
 +
 +
 +
 +            // Coefficients of the separating plane
 +
 +            float free_coef;
 +
 +            oclMat detector;
 +
 +
 +
 +            // Results of the last classification step
 +
 +            oclMat labels;
 +
 +            Mat labels_host;
 +
 +
 +
 +            // Results of the last histogram evaluation step
 +
 +            oclMat block_hists;
 +
 +
 +
 +            // Gradient computation results
 +
 +            oclMat grad, qangle;
 +
 +
 +
 +            // scaled image
 +
 +            oclMat image_scale;
 +
 +
 +
 +            // effective size of the input image (might be different from the original size after scaling)
 +
 +            Size effect_size;
 +
 +        };
 +
 +
 +        ////////////////////////feature2d_ocl/////////////////
 +        /****************************************************************************************\
 +        *                                      Distance                                          *
 +        \****************************************************************************************/
 +        template<typename T>
 +        struct CV_EXPORTS Accumulator // primary template: the accumulator type is T itself
 +        {
 +            typedef T Type;
 +        };
 +        template<> struct Accumulator<unsigned char> // the four specializations below all map to float
 +        {
 +            typedef float Type;
 +        };
 +        template<> struct Accumulator<unsigned short>
 +        {
 +            typedef float Type;
 +        };
 +        template<> struct Accumulator<char> // plain char; signed char has no specialization and falls back to the primary template
 +        {
 +            typedef float Type;
 +        };
 +        template<> struct Accumulator<short>
 +        {
 +            typedef float Type;
 +        };
 +
 +        /*
 +         * Manhattan distance (city block distance) functor; the result type comes from Accumulator<T>
 +         */
 +        template<class T>
 +        struct CV_EXPORTS L1
 +        {
 +            enum { normType = NORM_L1 };
 +            typedef T ValueType;
 +            typedef typename Accumulator<T>::Type ResultType;
 +
 +            ResultType operator()( const T *a, const T *b, int size ) const
 +            {
 +                return normL1<ValueType, ResultType>(a, b, size); // forwards to normL1 over the first `size` elements of a and b
 +            }
 +        };
 +
 +        /*
 +         * Euclidean distance functor; the result type comes from Accumulator<T>
 +         */
 +        template<class T>
 +        struct CV_EXPORTS L2
 +        {
 +            enum { normType = NORM_L2 };
 +            typedef T ValueType;
 +            typedef typename Accumulator<T>::Type ResultType;
 +
 +            ResultType operator()( const T *a, const T *b, int size ) const
 +            {
 +                return (ResultType)std::sqrt((double)normL2Sqr<ValueType, ResultType>(a, b, size)); // square root of normL2Sqr, taken in double and cast back to ResultType
 +            }
 +        };
 +
 +        /*
 +         * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
 +         * i.e. the bit count of A XOR B
 +         */
 +        struct CV_EXPORTS Hamming
 +        {
 +            enum { normType = NORM_HAMMING };
 +            typedef unsigned char ValueType;
 +            typedef int ResultType;
 +
 +            /** counts the bits set in a ^ b by forwarding to normHamming
 +             */
 +            ResultType operator()( const unsigned char *a, const unsigned char *b, int size ) const
 +            {
 +                return normHamming(a, b, size);
 +            }
 +        };
 +
 +        ////////////////////////////////// BruteForceMatcher //////////////////////////////////
 +
 +        class CV_EXPORTS BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            enum DistType {L1Dist = 0, L2Dist, HammingDist};
 +            explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist);
 +
 +            // Add descriptors to train descriptor collection
 +            void add(const std::vector<oclMat> &descCollection);
 +
 +            // Get train descriptors collection
 +            const std::vector<oclMat> &getTrainDescriptors() const;
 +
 +            // Clear train descriptors collection
 +            void clear();
 +
 +            // Return true if there are not train descriptors in collection
 +            bool empty() const;
 +
 +            // Return true if the matcher supports mask in match methods
 +            bool isMaskSupported() const;
 +
 +            // Find one best match for each query descriptor
 +            void matchSingle(const oclMat &query, const oclMat &train,
 +                             oclMat &trainIdx, oclMat &distance,
 +                             const oclMat &mask = oclMat());
 +
 +            // Download trainIdx and distance and convert it to CPU vector with DMatch
 +            static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches);
 +            // Convert trainIdx and distance to vector with DMatch
 +            static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector<DMatch> &matches);
 +
 +            // Find one best match for each query descriptor
 +            void match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask = oclMat());
 +
 +            // Make gpu collection of trains and masks in suitable format for matchCollection function
 +            void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector<oclMat> &masks = std::vector<oclMat>());
 +
 +            // Find one best match from train collection for each query descriptor
 +            void matchCollection(const oclMat &query, const oclMat &trainCollection,
 +                                 oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
 +                                 const oclMat &masks = oclMat());
 +
 +            // Download trainIdx, imgIdx and distance and convert it to vector with DMatch
 +            static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches);
 +            // Convert trainIdx, imgIdx and distance to vector with DMatch
 +            static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector<DMatch> &matches);
 +
 +            // Find one best match from train collection for each query descriptor.
 +            void match(const oclMat &query, std::vector<DMatch> &matches, const std::vector<oclMat> &masks = std::vector<oclMat>());
 +
 +            // Find k best matches for each query descriptor (in increasing order of distances)
 +            void knnMatchSingle(const oclMat &query, const oclMat &train,
 +                                oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k,
 +                                const oclMat &mask = oclMat());
 +
 +            // Download trainIdx and distance and convert it to vector with DMatch
 +            // compactResult is used when mask is not empty. If compactResult is false matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance,
 +                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +            // Convert trainIdx and distance to vector with DMatch
 +            static void knnMatchConvert(const Mat &trainIdx, const Mat &distance,
 +                                        std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +
 +            // Find k best matches for each query descriptor (in increasing order of distances).
 +            // compactResult is used when mask is not empty. If compactResult is false matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            void knnMatch(const oclMat &query, const oclMat &train,
 +                          std::vector< std::vector<DMatch> > &matches, int k, const oclMat &mask = oclMat(),
 +                          bool compactResult = false);
 +
 +            // Find k best matches from train collection for each query descriptor (in increasing order of distances)
 +            void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
 +                                     oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
 +                                     const oclMat &maskCollection = oclMat());
 +
 +            // Download trainIdx, imgIdx and distance and convert it to vector with DMatch
 +            // compactResult is used when mask is not empty. If compactResult is false matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance,
 +                                          std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +            // Convert trainIdx, imgIdx and distance to vector with DMatch
 +            static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance,
 +                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +
 +            // Find k best matches  for each query descriptor (in increasing order of distances).
 +            // compactResult is used when mask is not empty. If compactResult is false matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            void knnMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, int k,
 +                          const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
 +
 +            // Find best matches for each query descriptor which have distance less than maxDistance.
 +            // nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
 +            // Careful: nMatches can be greater than trainIdx.cols - it means that the matcher didn't find all matches,
 +            // because it didn't have enough memory.
 +            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
 +            // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
 +            // Matches are not sorted.
 +            void radiusMatchSingle(const oclMat &query, const oclMat &train,
 +                                   oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
 +                                   const oclMat &mask = oclMat());
 +
 +            // Download trainIdx, nMatches and distance and convert it to vector with DMatch.
 +            // matches will be sorted in increasing order of distances.
 +            // compactResult is used when mask is not empty. If compactResult is false matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
 +                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +            // Convert trainIdx, nMatches and distance to vector with DMatch.
 +            static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches,
 +                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +
 +            // Find best matches for each query descriptor which have distance less than maxDistance
 +            // (in increasing order of distances).
 +            void radiusMatch(const oclMat &query, const oclMat &train,
 +                             std::vector< std::vector<DMatch> > &matches, float maxDistance,
 +                             const oclMat &mask = oclMat(), bool compactResult = false);
 +
 +            // Find best matches for each query descriptor which have distance less than maxDistance.
 +            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
 +            // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
 +            // Matches are not sorted.
 +            void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
 +                                       const std::vector<oclMat> &masks = std::vector<oclMat>());
 +
 +            // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
 +            // matches will be sorted in increasing order of distances.
 +            // compactResult is used when mask is not empty. If compactResult is false matches
 +            // vector will have the same size as queryDescriptors rows. If compactResult is true
 +            // matches vector will not contain matches for fully masked out query descriptors.
 +            static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches,
 +                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +            // Convert trainIdx, imgIdx, nMatches and distance to vector with DMatch.
 +            static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches,
 +                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
 +
 +            // Find best matches from train collection for each query descriptor which have distance less than
 +            // maxDistance (in increasing order of distances).
 +            void radiusMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, float maxDistance,
 +                             const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
 +
 +            DistType distType;
 +
 +        private:
 +            std::vector<oclMat> trainDescCollection;
 +        };
 +
 +        template <class Distance>
 +        class CV_EXPORTS BruteForceMatcher_OCL;
 +
 +        //! Partial specialization for L1<T>: both constructors hand L1Dist to the base matcher.
 +        template <typename T>
 +        class CV_EXPORTS BruteForceMatcher_OCL< L1<T> > : public BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            explicit BruteForceMatcher_OCL()
 +                : BruteForceMatcher_OCL_base(L1Dist)
 +            {
 +            }
 +
 +            //! The functor argument is unnamed and therefore unused; only its type matters.
 +            explicit BruteForceMatcher_OCL(L1<T> /*d*/)
 +                : BruteForceMatcher_OCL_base(L1Dist)
 +            {
 +            }
 +        };
 +        //! Partial specialization for L2<T>: both constructors hand L2Dist to the base matcher.
 +        template <typename T>
 +        class CV_EXPORTS BruteForceMatcher_OCL< L2<T> > : public BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            explicit BruteForceMatcher_OCL()
 +                : BruteForceMatcher_OCL_base(L2Dist)
 +            {
 +            }
 +
 +            //! The functor argument is unnamed and therefore unused; only its type matters.
 +            explicit BruteForceMatcher_OCL(L2<T> /*d*/)
 +                : BruteForceMatcher_OCL_base(L2Dist)
 +            {
 +            }
 +        };
 +        //! Full specialization for Hamming: both constructors hand HammingDist to the base matcher.
 +        template <>
 +        class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            explicit BruteForceMatcher_OCL()
 +                : BruteForceMatcher_OCL_base(HammingDist)
 +            {
 +            }
 +
 +            //! The functor argument is unnamed and therefore unused; only its type matters.
 +            explicit BruteForceMatcher_OCL(Hamming /*d*/)
 +                : BruteForceMatcher_OCL_base(HammingDist)
 +            {
 +            }
 +        };
 +
 +        //! Matcher whose distance type is selected at run time from a norm code.
 +        class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base
 +        {
 +        public:
 +            //! NORM_L1 selects L1Dist, NORM_L2 selects L2Dist; every other value selects HammingDist.
 +            explicit BFMatcher_OCL(int norm = NORM_L2)
 +                : BruteForceMatcher_OCL_base((norm == NORM_L1) ? L1Dist
 +                                             : ((norm == NORM_L2) ? L2Dist : HammingDist))
 +            {
 +            }
 +        };
 +
 +        class CV_EXPORTS GoodFeaturesToTrackDetector_OCL // corner detector functor: public parameters plus private work buffers
 +        {
 +        public:
 +            explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
 +                int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
 +
 +            //! runs the detector on image; corners is returned as a single-row matrix of type CV_32FC2
 +            void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
 +            //! downloads points of type Point2f into points_v; the previous content of the vector is erased
 +            void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
 +            // public parameters; the constructor copies its same-named arguments into them
 +            int maxCorners;
 +            double qualityLevel;
 +            double minDistance;
 +
 +            int blockSize;
 +            bool useHarrisDetector;
 +            double harrisK;
 +            void releaseMemory() // releases all five private oclMat buffers declared below
 +            {
 +                Dx_.release();
 +                Dy_.release();
 +                eig_.release();
 +                minMaxbuf_.release();
 +                tmpCorners_.release();
 +            }
 +        private:
 +            oclMat Dx_; // NOTE(review): buffer roles are inferred from the names only - confirm in the implementation
 +            oclMat Dy_;
 +            oclMat eig_;
 +            oclMat minMaxbuf_;
 +            oclMat tmpCorners_;
 +        };
 +
 +        //! Copies every argument into the public member of the same name (without the trailing underscore);
 +        //! the private oclMat buffers are left default-constructed.
 +        inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
 +            int blockSize_, bool useHarrisDetector_, double harrisK_)
 +            : maxCorners(maxCorners_),
 +              qualityLevel(qualityLevel_),
 +              minDistance(minDistance_),
 +              blockSize(blockSize_),
 +              useHarrisDetector(useHarrisDetector_),
 +              harrisK(harrisK_)
 +        {
 +        }
 +
 +        /////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
 +
 +        //! Pyramidal Lucas-Kanade optical flow: public tuning parameters plus cached work buffers.
 +        class CV_EXPORTS PyrLKOpticalFlow
 +        {
 +        public:
 +            //! Sets the defaults: 21x21 window, maxLevel 3, 30 iterations, derivLambda 0.5,
 +            //! minEigThreshold 1e-4 and every boolean switch off.
 +            //! Members are initialised in declaration order.
 +            PyrLKOpticalFlow()
 +                : winSize(21, 21),
 +                  maxLevel(3),
 +                  iters(30),
 +                  derivLambda(0.5),
 +                  useInitialFlow(false),
 +                  minEigThreshold(1e-4f),
 +                  getMinEigenVals(false),
 +                  isDeviceArch11_(false)
 +            {
 +            }
 +
 +            //! Sparse variant: takes two images and the points prevPts, writes nextPts and status.
 +            //! err is optional; pass 0 (the default) when it is not needed.
 +            void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts,
 +                        oclMat &status, oclMat *err = 0);
 +
 +            //! Dense variant: takes two images, writes the flow components u and v.
 +            //! err is optional; pass 0 (the default) when it is not needed.
 +            void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0);
 +
 +            // Public tuning parameters; see the constructor for their default values.
 +            Size winSize;
 +            int maxLevel;
 +            int iters;
 +            double derivLambda;
 +            bool useInitialFlow;
 +            float minEigThreshold;
 +            bool getMinEigenVals;
 +
 +            //! Releases every oclMat buffer held by this object and empties both image pyramids.
 +            void releaseMemory()
 +            {
 +                dx_calcBuf_.release();
 +                dy_calcBuf_.release();
 +
 +                prevPyr_.clear();
 +                nextPyr_.clear();
 +
 +                dx_buf_.release();
 +                dy_buf_.release();
 +
 +                // uPyr_/vPyr_ are oclMat buffers as well; without these calls they would
 +                // stay allocated after releaseMemory() returns.
 +                uPyr_[0].release();
 +                uPyr_[1].release();
 +                vPyr_[0].release();
 +                vPyr_[1].release();
 +            }
 +
 +        private:
 +            void calcSharrDeriv(const oclMat &src, oclMat &dx, oclMat &dy);
 +
 +            void buildImagePyramid(const oclMat &img0, std::vector<oclMat> &pyr, bool withBorder);
 +
 +            oclMat dx_calcBuf_;
 +            oclMat dy_calcBuf_;
 +
 +            std::vector<oclMat> prevPyr_;
 +            std::vector<oclMat> nextPyr_;
 +
 +            oclMat dx_buf_;
 +            oclMat dy_buf_;
 +
 +            oclMat uPyr_[2];
 +            oclMat vPyr_[2];
 +
 +            bool isDeviceArch11_;
 +        };
 +
 +        class CV_EXPORTS FarnebackOpticalFlow // optical flow functor: operator() takes two frames and fills flowx/flowy
 +        {
 +        public:
 +            FarnebackOpticalFlow(); // defined out of line; the default parameter values are not visible in this header
 +            // public tuning parameters
 +            int numLevels;
 +            double pyrScale;
 +            bool fastPyramids;
 +            int winSize;
 +            int numIters;
 +            int polyN;
 +            double polySigma;
 +            int flags;
 +
 +            void operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy);
 +
 +            void releaseMemory(); // NOTE(review): presumably frees the private oclMat members below - defined out of line, confirm
 +
 +        private:
 +            void prepareGaussian(
 +                int n, double sigma, float *g, float *xg, float *xxg,
 +                double &ig11, double &ig03, double &ig33, double &ig55);
 +
 +            void setPolynomialExpansionConsts(int n, double sigma);
 +
 +            void updateFlow_boxFilter(
 +                const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat &flowy,
 +                oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices);
 +
 +            void updateFlow_gaussianBlur(
 +                const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat& flowy,
 +                oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices);
 +            // work buffers kept between calls
 +            oclMat frames_[2];
 +            oclMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
 +            std::vector<oclMat> pyramid0_, pyramid1_;
 +        };
 +
 +        //////////////// build warping maps ////////////////////
 +        //! builds plane warping maps
 +        CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, const Mat &T, float scale, oclMat &map_x, oclMat &map_y);
 +        //! builds cylindrical warping maps
 +        CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y);
 +        //! builds spherical warping maps
 +        CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y);
 +        //! builds Affine warping maps
 +        CV_EXPORTS void buildWarpAffineMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap);
 +
 +        //! builds Perspective warping maps
 +        CV_EXPORTS void buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap);
 +
 +        ///////////////////////////////////// interpolate frames //////////////////////////////////////////////
 +        //! Interpolate frames (images) using provided optical flow (displacement field).
 +        //! frame0   - frame 0 (32-bit floating point images, single channel)
 +        //! frame1   - frame 1 (the same type and size)
 +        //! fu       - forward horizontal displacement
 +        //! fv       - forward vertical displacement
 +        //! bu       - backward horizontal displacement
 +        //! bv       - backward vertical displacement
 +        //! pos      - new frame position
 +        //! newFrame - new frame
 +        //! buf      - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat;
 +        //!            occlusion masks            0, occlusion masks            1,
 +        //!            interpolated forward flow  0, interpolated forward flow  1,
 +        //!            interpolated backward flow 0, interpolated backward flow 1
 +        //!
 +        CV_EXPORTS void interpolateFrames(const oclMat &frame0, const oclMat &frame1,
 +                                          const oclMat &fu, const oclMat &fv,
 +                                          const oclMat &bu, const oclMat &bv,
 +                                          float pos, oclMat &newFrame, oclMat &buf);
 +
 +        //! computes moments of the rasterized shape or a vector of points
 +        //! contour should be a vector of points standing for the contour
 +        CV_EXPORTS Moments ocl_moments(InputArray contour);
 +        //! src should be a general image uploaded to the GPU.
 +        //! the supported oclMat type are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1
 +        //! to use type of CV_64FC1, the GPU should support CV_64FC1
 +        CV_EXPORTS Moments ocl_moments(oclMat& src, bool binary);
 +
 +        class CV_EXPORTS StereoBM_OCL
 +        {
 +        public:
 +            enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 }; // presumably the accepted values of preset - confirm
 +
 +            enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 }; // defaults of the full constructor's ndisparities and winSize
 +
 +            //! the default constructor
 +            StereoBM_OCL();
 +            //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be a multiple of 8.
 +            StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
 +
 +            //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair.
 +            //! Output disparity has CV_8U type.
 +            void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity);
 +
 +            //! Heuristic that tries to estimate
 +            // whether the current GPU will be faster than the CPU for this algorithm.
 +            // It queries the currently active device.
 +            static bool checkIfGpuCallReasonable();
 +
 +            int preset;
 +            int ndisp;
 +            int winSize;
 +
 +            // If avergeTexThreshold == 0 => post-processing is disabled.
 +            // If avergeTexThreshold != 0 then disparity is set to 0 in each point (x,y) where for the left image
 +            // SumOfHorizontalGradientsInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
 +            // i.e. the input left image is low textured.
 +            float avergeTexThreshold;
 +        private:
 +            oclMat minSSD, leBuf, riBuf;
 +        };
 +
 +        class CV_EXPORTS StereoBeliefPropagation // stereo matcher functor: public parameters plus private oclMat buffers
 +        {
 +        public:
 +            enum { DEFAULT_NDISP  = 64 }; // default of the ndisp constructor argument
 +            enum { DEFAULT_ITERS  = 5  }; // default of the iters constructor argument
 +            enum { DEFAULT_LEVELS = 5  }; // default of the levels constructor argument
 +            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels); // results are returned through the three reference arguments
 +            explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
 +                                             int iters  = DEFAULT_ITERS,
 +                                             int levels = DEFAULT_LEVELS,
 +                                             int msg_type = CV_16S); // NOTE: this overload defaults msg_type to CV_16S, the full one below to CV_32F
 +            StereoBeliefPropagation(int ndisp, int iters, int levels,
 +                                    float max_data_term, float data_weight,
 +                                    float max_disc_term, float disc_single_jump,
 +                                    int msg_type = CV_32F);
 +            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); // stereo pair in, disparity out
 +            void operator()(const oclMat &data, oclMat &disparity); // NOTE(review): presumably takes a precomputed data cost instead of an image pair - confirm
 +            int ndisp; // public parameters; the names mirror the constructor arguments
 +            int iters;
 +            int levels;
 +            float max_data_term;
 +            float data_weight;
 +            float max_disc_term;
 +            float disc_single_jump;
 +            int msg_type;
 +        private:
 +            oclMat u, d, l, r, u2, d2, l2, r2;
 +            std::vector<oclMat> datas;
 +            oclMat out;
 +        };
 +
 +        class CV_EXPORTS StereoConstantSpaceBP // stereo matcher functor: public parameters plus private oclMat buffers
 +        {
 +        public:
 +            enum { DEFAULT_NDISP    = 128 }; // default of the ndisp constructor argument
 +            enum { DEFAULT_ITERS    = 8   }; // default of the iters constructor argument
 +            enum { DEFAULT_LEVELS   = 4   }; // default of the levels constructor argument
 +            enum { DEFAULT_NR_PLANE = 4   }; // default of the nr_plane constructor argument
 +            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane); // results are returned through the four reference arguments
 +            explicit StereoConstantSpaceBP(
 +                int ndisp    = DEFAULT_NDISP,
 +                int iters    = DEFAULT_ITERS,
 +                int levels   = DEFAULT_LEVELS,
 +                int nr_plane = DEFAULT_NR_PLANE,
 +                int msg_type = CV_32F);
 +            StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
 +                float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
 +                int min_disp_th = 0,
 +                int msg_type = CV_32F);
 +            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); // stereo pair in, disparity out
 +            int ndisp; // public parameters; the names mirror the constructor arguments
 +            int iters;
 +            int levels;
 +            int nr_plane;
 +            float max_data_term;
 +            float data_weight;
 +            float max_disc_term;
 +            float disc_single_jump;
 +            int min_disp_th;
 +            int msg_type;
 +            bool use_local_init_data_cost; // NOTE: has no matching constructor argument in this header
 +        private:
 +            oclMat u[2], d[2], l[2], r[2];
 +            oclMat disp_selected_pyr[2];
 +            oclMat data_cost;
 +            oclMat data_cost_selected;
 +            oclMat temp;
 +            oclMat out;
 +        };
 +
 +        // Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
 +        //
 +        // see reference:
 +        //   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
 +        //   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
 +        class CV_EXPORTS OpticalFlowDual_TVL1_OCL
 +        {
 +        public:
 +            OpticalFlowDual_TVL1_OCL(); // defined out of line; the default parameter values are not visible in this header
 +
 +            void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy); // takes the image pair I0/I1 and fills flowx/flowy
 +
 +            void collectGarbage(); // NOTE(review): presumably releases the private buffers below - defined out of line, confirm
 +
 +            /**
 +            * Time step of the numerical scheme.
 +            */
 +            double tau;
 +
 +            /**
 +            * Weight parameter for the data term, attachment parameter.
 +            * This is the most relevant parameter, which determines the smoothness of the output.
 +            * The smaller this parameter is, the smoother the solutions we obtain.
 +            * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
 +            */
 +            double lambda;
 +
 +            /**
 +            * Weight parameter for (u - v)^2, tightness parameter.
 +            * It serves as a link between the attachment and the regularization terms.
 +            * In theory, it should have a small value in order to maintain both parts in correspondence.
 +            * The method is stable for a large range of values of this parameter.
 +            */
 +            double theta;
 +
 +            /**
 +            * Number of scales used to create the pyramid of images.
 +            */
 +            int nscales;
 +
 +            /**
 +            * Number of warpings per scale.
 +            * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
 +            * This is a parameter that assures the stability of the method.
 +            * It also affects the running time, so it is a compromise between speed and accuracy.
 +            */
 +            int warps;
 +
 +            /**
 +            * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
 +            * A small value will yield more accurate solutions at the expense of a slower convergence.
 +            */
 +            double epsilon;
 +
 +            /**
 +            * Stopping criterion iterations number used in the numerical scheme.
 +            */
 +            int iterations;
 +
 +            bool useInitialFlow; // NOTE(review): presumably makes operator() start from the incoming flowx/flowy - confirm
 +
 +        private:
 +            void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2);
 +            // per-scale containers; presumably one entry per pyramid scale (see nscales) - confirm
 +            std::vector<oclMat> I0s;
 +            std::vector<oclMat> I1s;
 +            std::vector<oclMat> u1s;
 +            std::vector<oclMat> u2s;
 +            // oclMat work buffers
 +            oclMat I1x_buf;
 +            oclMat I1y_buf;
 +
 +            oclMat I1w_buf;
 +            oclMat I1wx_buf;
 +            oclMat I1wy_buf;
 +
 +            oclMat grad_buf;
 +            oclMat rho_c_buf;
 +
 +            oclMat p11_buf;
 +            oclMat p12_buf;
 +            oclMat p21_buf;
 +            oclMat p22_buf;
 +
 +            oclMat diff_buf;
 +            oclMat norm_buf;
 +        };
 +        // current supported sorting methods
 +        enum
 +        {
 +            SORT_BITONIC,   // only support power-of-2 buffer size
 +            SORT_SELECTION, // cannot sort duplicate keys
 +            SORT_MERGE,
 +            SORT_RADIX      // only support signed int/float keys(CV_32S/CV_32F)
 +        };
 +        //! Returns the sorted result of all the elements in input based on equivalent keys.
 +        //
 +        //  The element unit in the values to be sorted is determined from the data type,
 +        //  i.e., a CV_32FC2 input {a1a2, b1b2} will be considered as two elements, regardless its
 +        //  matrix dimension.
 +        //  both keys and values will be sorted inplace
 +        //  Key needs to be single channel oclMat.
 +        //
 +        //  Example:
 +        //  input -
 +        //    keys   = {2,    3,   1}   (CV_8UC1)
 +        //    values = {10,5, 4,3, 6,2} (CV_8UC2)
 +        //  sortByKey(keys, values, SORT_SELECTION, false);
 +        //  output -
 +        //    keys   = {1,    2,   3}   (CV_8UC1)
 +        //    values = {6,2, 10,5, 4,3} (CV_8UC2)
 +        CV_EXPORTS void sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false);
 +        /*!Base class for MOG and MOG2!*/
 +        class CV_EXPORTS BackgroundSubtractor
 +        {
 +        public:
 +            //! the virtual destructor
 +            virtual ~BackgroundSubtractor();
 +            //! the update operator that takes the next video frame and returns the current foreground mask as an 8-bit binary image.
 +            virtual void operator()(const oclMat& image, oclMat& fgmask, float learningRate); // virtual but not pure, unlike getBackgroundImage
 +
 +            //! computes a background image (pure virtual: every concrete subtractor must implement it)
 +            virtual void getBackgroundImage(oclMat& backgroundImage) const = 0;
 +        };
 +        /*!
 +        Gaussian Mixture-based Background/Foreground Segmentation Algorithm
 +
 +        The class implements the following algorithm:
 +        "An improved adaptive background mixture model for real-time tracking with shadow detection"
 +        P. KaewTraKulPong and R. Bowden,
 +        Proc. 2nd European Workshop on Advanced Video-Based Surveillance Systems, 2001.
 +        http://personal.ee.surrey.ac.uk/Personal/R.Bowden/publications/avbs01/avbs01.pdf
 +        */
 +        class CV_EXPORTS MOG: public cv::ocl::BackgroundSubtractor
 +        {
 +        public:
 +            //! the default constructor (nmixtures = -1 presumably selects a built-in default - confirm in the implementation)
 +            MOG(int nmixtures = -1);
 +
 +            //! re-initialization method
 +            void initialize(Size frameSize, int frameType);
 +
 +            //! the update operator
 +            void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f);
 +
 +            //! computes a background image which is the mean of all background gaussians
 +            void getBackgroundImage(oclMat& backgroundImage) const;
 +
 +            //! releases all inner buffers
 +            void release();
 +            // public parameters
 +            int history;
 +            float varThreshold;
 +            float backgroundRatio;
 +            float noiseSigma;
 +
 +        private:
 +            int nmixtures_;
 +
 +            Size frameSize_;
 +            int frameType_;
 +            int nframes_;
 +
 +            oclMat weight_;
 +            oclMat sortKey_;
 +            oclMat mean_;
 +            oclMat var_;
 +        };
 +
 +        /*!
 +        The class implements the following algorithm:
 +        "Improved adaptive Gaussian mixture model for background subtraction"
 +        Z.Zivkovic
 +        International Conference Pattern Recognition, UK, August, 2004.
 +        http://www.zoranz.net/Publications/zivkovic2004ICPR.pdf
 +        */
 +        class CV_EXPORTS MOG2: public cv::ocl::BackgroundSubtractor
 +        {
 +        public:
 +            //! the default constructor
 +            MOG2(int nmixtures = -1);
 +
 +            //! re-initiaization method
 +            void initialize(Size frameSize, int frameType);
 +
 +            //! the update operator
 +            void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = -1.0f);
 +
 +            //! computes a background image which are the mean of all background gaussians
 +            void getBackgroundImage(oclMat& backgroundImage) const;
 +
 +            //! releases all inner buffers
 +            void release();
 +
 +            // parameters
 +            // you should call initialize after parameters changes
 +
 +            int history;
 +
 +            //! here it is the maximum allowed number of mixture components.
 +            //! Actual number is determined dynamically per pixel
 +            float varThreshold;
 +            // threshold on the squared Mahalanobis distance to decide if it is well described
 +            // by the background model or not. Related to Cthr from the paper.
 +            // This does not influence the update of the background. A typical value could be 4 sigma
 +            // and that is varThreshold=4*4=16; Corresponds to Tb in the paper.
 +
 +            /////////////////////////
 +            // less important parameters - things you might change but be carefull
 +            ////////////////////////
 +
 +            float backgroundRatio;
 +            // corresponds to fTB=1-cf from the paper
 +            // TB - threshold when the component becomes significant enough to be included into
 +            // the background model. It is the TB=1-cf from the paper. So I use cf=0.1 => TB=0.
 +            // For alpha=0.001 it means that the mode should exist for approximately 105 frames before
 +            // it is considered foreground
 +            // float noiseSigma;
 +            float varThresholdGen;
 +
 +            //correspondts to Tg - threshold on the squared Mahalan. dist. to decide
 +            //when a sample is close to the existing components. If it is not close
 +            //to any a new component will be generated. I use 3 sigma => Tg=3*3=9.
 +            //Smaller Tg leads to more generated components and higher Tg might make
 +            //lead to small number of components but they can grow too large
 +            float fVarInit;
 +            float fVarMin;
 +            float fVarMax;
 +
 +            //initial variance  for the newly generated components.
 +            //It will will influence the speed of adaptation. A good guess should be made.
 +            //A simple way is to estimate the typical standard deviation from the images.
 +            //I used here 10 as a reasonable value
 +            // min and max can be used to further control the variance
 +            float fCT; //CT - complexity reduction prior
 +            //this is related to the number of samples needed to accept that a component
 +            //actually exists. We use CT=0.05 of all the samples. By setting CT=0 you get
 +            //the standard Stauffer&Grimson algorithm (maybe not exact but very similar)
 +
 +            //shadow detection parameters
 +            bool bShadowDetection; //default 1 - do shadow detection
 +            unsigned char nShadowDetection; //do shadow detection - insert this value as the detection result - 127 default value
 +            float fTau;
 +            // Tau - shadow threshold. The shadow is detected if the pixel is a darker
 +            //version of the background. Tau is a threshold on how much darker the shadow can be.
 +            //Tau= 0.5 means that if the pixel is more than 2 times darker, then it is not a shadow
 +            //See: Prati,Mikic,Trivedi,Cucchiarra,"Detecting Moving Shadows...",IEEE PAMI,2003.
 +
 +        private:
 +            int nmixtures_;
 +
 +            Size frameSize_;
 +            int frameType_;
 +            int nframes_;
 +
 +            oclMat weight_;
 +            oclMat variance_;
 +            oclMat mean_;
 +
 +            oclMat bgmodelUsedModes_; //keep track of number of modes per pixel
 +        };
 +
 +        /*!***************Kalman Filter*************!*/
 +        class CV_EXPORTS KalmanFilter
 +        {
 +        public:
 +            KalmanFilter();
 +            //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector
 +            KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
 +            //! re-initializes Kalman filter. The previous content is destroyed.
 +            void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
 +
 +            const oclMat& predict(const oclMat& control=oclMat());
 +            const oclMat& correct(const oclMat& measurement);
 +
 +            oclMat statePre;           //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
 +            oclMat statePost;          //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
 +            oclMat transitionMatrix;   //!< state transition matrix (A)
 +            oclMat controlMatrix;      //!< control matrix (B) (not used if there is no control)
 +            oclMat measurementMatrix;  //!< measurement matrix (H)
 +            oclMat processNoiseCov;    //!< process noise covariance matrix (Q)
 +            oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R)
 +            oclMat errorCovPre;        //!< a priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q
 +            oclMat gain;               //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
 +            oclMat errorCovPost;       //!< a posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
 +        private:
 +            oclMat temp1;
 +            oclMat temp2;
 +            oclMat temp3;
 +            oclMat temp4;
 +            oclMat temp5;
 +        };
 +
 +        /*!***************K Nearest Neighbour*************!*/
 +        class CV_EXPORTS KNearestNeighbour: public CvKNearest
 +        {
 +        public:
 +            KNearestNeighbour();
 +            ~KNearestNeighbour();
 +
 +            bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)),
 +                bool isRegression = false, int max_k = 32, bool updateBase = false);
 +
 +            void clear();
 +
 +            void find_nearest(const oclMat& samples, int k, oclMat& lables);
 +
 +        private:
 +            oclMat samples_ocl;
 +        };
 +
 +        /*!***************  SVM  *************!*/
 +        class CV_EXPORTS CvSVM_OCL : public CvSVM
 +        {
 +        public:
 +            CvSVM_OCL();
 +
 +            CvSVM_OCL(const cv::Mat& trainData, const cv::Mat& responses,
 +                      const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
 +                      CvSVMParams params=CvSVMParams());
 +            CV_WRAP float predict( const int row_index, Mat& src, bool returnDFVal=false ) const;
 +            CV_WRAP void predict( cv::InputArray samples, cv::OutputArray results ) const;
 +            CV_WRAP float predict( const cv::Mat& sample, bool returnDFVal=false ) const;
 +            float predict( const CvMat* samples, CV_OUT CvMat* results ) const;
 +
 +        protected:
 +            float predict( const int row_index, int row_len, Mat& src, bool returnDFVal=false ) const;
 +            void create_kernel();
 +            void create_solver();
 +        };
 +
 +        /*!***************  END  *************!*/
 +    }
 +}
 +#if defined _MSC_VER && _MSC_VER >= 1200
 +#  pragma warning( push)
 +#  pragma warning( disable: 4267)
 +#endif
 +#include "opencv2/ocl/matrix_operations.hpp"
 +#if defined _MSC_VER && _MSC_VER >= 1200
 +#  pragma warning( pop)
 +#endif
 +
 +#endif /* __OPENCV_OCL_HPP__ */
diff --cc modules/ocl/perf/perf_match_template.cpp
index 68192cf,3ee038a..9c9829c
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@@@ -112,9 -112,9 +112,9 @@@ PERF_TEST_P(CV_TM_CCORR_NORMEDFixture, 
      }
      else if (RUN_PLAIN_IMPL)
      {
 -        TEST_CYCLE() cv::matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
 +        TEST_CYCLE() cv::matchTemplate(src, templ, dst, TM_CCORR_NORMED);
  
-         SANITY_CHECK(dst, 2e-2);
+         SANITY_CHECK(dst, 3e-2);
      }
      else
          OCL_PERF_ELSE
diff --cc modules/ocl/src/arithm.cpp
index cd7aa0c,68c5269..0acb4c2
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@@ -865,32 -865,34 +865,34 @@@ void cv::ocl::log(const oclMat &src, oc
  ////////////////////////////// magnitude phase ///////////////////////////////
  //////////////////////////////////////////////////////////////////////////////
  
 -static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName)
 +static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName)
  {
-     int channels = dst.oclchannels();
      int depth = dst.depth();
  
-     size_t vector_length = 1;
-     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
-     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
- 
      size_t localThreads[3]  = { 64, 4, 1 };
-     size_t globalThreads[3] = { cols, dst.rows, 1 };
+     size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
+ 
+     int src1_step = src1.step / src1.elemSize(), src1_offset = src1.offset / src1.elemSize();
+     int src2_step = src2.step / src2.elemSize(), src2_offset = src2.offset / src2.elemSize();
+     int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
  
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
 +    std::vector<std::pair<size_t , const void *> > args;
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
  
-     openCLExecuteKernel(src1.clCxt, &arithm_magnitude, kernelName, globalThreads, localThreads, args, -1, depth);
+     const char * const channelMap[] = { "", "", "2", "4", "4" };
+     std::string buildOptions = format("-D T=%s%s", depth == CV_32F ? "float" : "double", channelMap[dst.channels()]);
+ 
+     openCLExecuteKernel(src1.clCxt, &arithm_magnitude, kernelName, globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
  }
  
  void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
@@@ -964,25 -966,29 +966,29 @@@ static void arithmetic_cartToPolar_run(
      size_t localThreads[3]  = { 64, 4, 1 };
      size_t globalThreads[3] = { cols, src1.rows, 1 };
  
-     int tmp = angleInDegrees ? 1 : 0;
+     int src1_step = src1.step / src1.elemSize1(), src1_offset = src1.offset / src1.elemSize1();
+     int src2_step = src2.step / src2.elemSize1(), src2_offset = src2.offset / src2.elemSize1();
+     int dst_mag_step = dst_mag.step / dst_mag.elemSize1(), dst_mag_offset = dst_mag.offset / dst_mag.elemSize1();
+     int dst_cart_step = dst_cart.step / dst_cart.elemSize1(), dst_cart_offset = dst_cart.offset / dst_cart.elemSize1();
+ 
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst_mag.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_mag_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_mag_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst_cart.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_cart_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_cart_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
 +    std::vector<std::pair<size_t , const void *> > args;
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst_mag.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_mag.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_mag.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_mag_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_mag_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst_cart.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&tmp ));
  
-     openCLExecuteKernel(src1.clCxt, &arithm_cartToPolar, kernelName, globalThreads, localThreads, args, -1, depth);
+     openCLExecuteKernel(src1.clCxt, &arithm_cartToPolar, kernelName, globalThreads, localThreads, args,
+                         -1, depth, angleInDegrees ? "-D DEGREE" : "-D RADIAN");
  }
  
  void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat &angle, bool angleInDegrees)
@@@ -1006,39 -1012,40 +1012,40 @@@
  //////////////////////////////////////////////////////////////////////////////
  
  static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
 -                        string kernelName)
 +                        String kernelName)
  {
-     int channels = src2.oclchannels();
-     int depth = src2.depth();
- 
-     int cols = src2.cols * channels;
-     int rows = src2.rows;
+     int channels = src2.oclchannels(), depth = src2.depth();
+     int cols = src2.cols * channels, rows = src2.rows;
  
      size_t localThreads[3]  = { 64, 4, 1 };
      size_t globalThreads[3] = { cols, rows, 1 };
  
-     int tmp = angleInDegrees ? 1 : 0;
+     int src1_step = src1.step / src1.elemSize1(), src1_offset = src1.offset / src1.elemSize1();
+     int src2_step = src2.step / src2.elemSize1(), src2_offset = src2.offset / src2.elemSize1();
+     int dst1_step = dst1.step / dst1.elemSize1(), dst1_offset = dst1.offset / dst1.elemSize1();
+     int dst2_step = dst2.step / dst2.elemSize1(), dst2_offset = dst2.offset / dst2.elemSize1();
+ 
 -    vector<pair<size_t , const void *> > args;
 +    std::vector<std::pair<size_t , const void *> > args;
      if (src1.data)
      {
 -        args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
 -        args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step ));
 -        args.push_back( make_pair( sizeof(cl_int), (void *)&src1_offset ));
 +        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-         args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-         args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
++        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
++        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
      }
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst1.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst1_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst1_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst2.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst2_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst2_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&rows ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst1.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst1.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst1.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst1_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst1_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst2.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst2.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst2.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst2_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst2_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&tmp ));
  
-     openCLExecuteKernel(src1.clCxt, &arithm_polarToCart, kernelName, globalThreads, localThreads, args, -1, depth);
+     openCLExecuteKernel(src1.clCxt, &arithm_polarToCart, kernelName, globalThreads, localThreads,
+                         args, -1, depth, angleInDegrees ? "-D DEGREE" : "-D RADIAN");
  }
  
  void cv::ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees)
@@@ -1623,38 -1630,38 +1630,38 @@@ void cv::ocl::addWeighted(const oclMat 
  /////////////////////////////////// Pow //////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////////
  
- static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
 -static void arithmetic_pow_run(const oclMat &src, double p, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
++static void arithmetic_pow_run(const oclMat &src, double p, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
  {
      int channels = dst.oclchannels();
      int depth = dst.depth();
  
-     size_t vector_length = 1;
-     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
-     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-     int rows = dst.rows;
- 
      size_t localThreads[3]  = { 64, 4, 1 };
-     size_t globalThreads[3] = { cols, rows, 1 };
+     size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
+ 
+     const char * const typeStr = depth == CV_32F ? "float" : "double";
+     const char * const channelMap[] = { "", "", "2", "4", "4" };
+     std::string buildOptions = format("-D VT=%s%s -D T=%s", typeStr, channelMap[channels], typeStr);
+ 
+     int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
+     int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
  
-     int dst_step1 = dst.cols * dst.elemSize();
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset ));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
 +    std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
  
      float pf = static_cast<float>(p);
-     if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
+     if(src.depth() == CV_32F)
 -        args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
 +        args.push_back( std::make_pair( sizeof(cl_float), (void *)&pf ));
      else
 -        args.push_back( make_pair( sizeof(cl_double), (void *)&p ));
 +        args.push_back( std::make_pair( sizeof(cl_double), (void *)&p ));
  
-     openCLExecuteKernel(src1.clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
+     openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
  }
  
  void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
diff --cc modules/ocl/src/build_warps.cpp
index bc24f5e,40c082b..f0a3203
--- a/modules/ocl/src/build_warps.cpp
+++ b/modules/ocl/src/build_warps.cpp
@@@ -74,24 -74,27 +74,27 @@@ void cv::ocl::buildWarpPlaneMaps(Size /
      int tl_u = dst_roi.tl().x;
      int tl_v = dst_roi.tl().y;
  
-     Context *clCxt = Context::getContext();
-     String kernelName = "buildWarpPlaneMaps";
-     std::vector< std::pair<size_t, const void *> > args;
+     int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
+     int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
  
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map_x.data));
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map_y.data));
 -    vector< pair<size_t, const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&xmap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&ymap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&KRT_mat.data));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&tl_u));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&tl_v));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.cols));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.rows));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_offset));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_offset));
 -    args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
++    std::vector< std::pair<size_t, const void *> > args;
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&KRT_mat.data));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_u));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_v));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.cols));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.rows));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.step));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_y.step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
 +    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
  
-     size_t globalThreads[3] = {map_x.cols, map_x.rows, 1};
-     size_t localThreads[3]  = {32, 8, 1};
-     openCLExecuteKernel(clCxt, &build_warps, kernelName, globalThreads, localThreads, args, -1, -1);
+     size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
+     size_t localThreads[3]  = { 32, 8, 1 };
+ 
+     openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpPlaneMaps", globalThreads, localThreads, args, -1, -1);
  }
  
  //////////////////////////////////////////////////////////////////////////////
@@@ -114,24 -117,27 +117,27 @@@ void cv::ocl::buildWarpCylindricalMaps(
      int tl_u = dst_roi.tl().x;
      int tl_v = dst_roi.tl().y;
  
-     Context *clCxt = Context::getContext();
-     String kernelName = "buildWarpCylindricalMaps";
-     std::vector< std::pair<size_t, const void *> > args;
+     int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
+     int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
  
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map_x.data));
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map_y.data));
 -    vector< pair<size_t, const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&xmap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&ymap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&KR_oclMat.data));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&tl_u));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&tl_v));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.cols));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.rows));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_offset));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_offset));
 -    args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
++    std::vector< std::pair<size_t, const void *> > args;
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&KR_oclMat.data));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_u));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_v));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.cols));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.rows));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.step));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_y.step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
 +    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
  
-     size_t globalThreads[3] = {map_x.cols, map_x.rows, 1};
-     size_t localThreads[3]  = {32, 8, 1};
-     openCLExecuteKernel(clCxt, &build_warps, kernelName, globalThreads, localThreads, args, -1, -1);
+     size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
+     size_t localThreads[3]  = { 32, 8, 1 };
+ 
+     openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpCylindricalMaps", globalThreads, localThreads, args, -1, -1);
  }
  
  //////////////////////////////////////////////////////////////////////////////
@@@ -153,26 -160,30 +160,30 @@@ void cv::ocl::buildWarpSphericalMaps(Si
      int tl_u = dst_roi.tl().x;
      int tl_v = dst_roi.tl().y;
  
-     Context *clCxt = Context::getContext();
-     String kernelName = "buildWarpSphericalMaps";
-     std::vector< std::pair<size_t, const void *> > args;
+     int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
+     int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
  
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map_x.data));
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map_y.data));
 -    vector< pair<size_t, const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&xmap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&ymap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&KR_oclMat.data));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&tl_u));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&tl_v));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.cols));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.rows));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_offset));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_offset));
 -    args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
++    std::vector< std::pair<size_t, const void *> > args;
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
++    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&KR_oclMat.data));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_u));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_v));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.cols));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.rows));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_x.step));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&map_y.step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
 +    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
  
-     size_t globalThreads[3] = {map_x.cols, map_x.rows, 1};
-     size_t localThreads[3]  = {32, 8, 1};
-     openCLExecuteKernel(clCxt, &build_warps, kernelName, globalThreads, localThreads, args, -1, -1);
+     size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
+     size_t localThreads[3]  = { 32, 8, 1 };
+     openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpSphericalMaps", globalThreads, localThreads, args, -1, -1);
  }
  
+ //////////////////////////////////////////////////////////////////////////////
+ // buildWarpAffineMaps
  
  void cv::ocl::buildWarpAffineMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap)
  {
@@@ -194,29 -205,34 +205,34 @@@
          iM.convertTo(coeffsMat, coeffsMat.type());
      }
  
+     int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
+     int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
+ 
      oclMat coeffsOclMat(coeffsMat.reshape(1, 1));
  
-     Context *clCxt = Context::getContext();
-     String kernelName = "buildWarpAffineMaps";
 -    vector< pair<size_t, const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&xmap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&ymap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&coeffsOclMat.data));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.cols));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.rows));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_offset));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_offset));
 +    std::vector< std::pair<size_t, const void *> > args;
- 
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&coeffsOclMat.data));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.step));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap.step));
- 
-     size_t globalThreads[3] = {xmap.cols, xmap.rows, 1};
-     size_t localThreads[3]  = {32, 8, 1};
-     openCLExecuteKernel(clCxt, &build_warps, kernelName, globalThreads, localThreads, args, -1, -1);
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
+ 
+     size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
+     size_t localThreads[3]  = { 32, 8, 1 };
+     openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpAffineMaps", globalThreads, localThreads, args, -1, -1);
  }
  
+ //////////////////////////////////////////////////////////////////////////////
+ // buildWarpPerspectiveMaps
+ 
  void cv::ocl::buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap)
  {
- 
      CV_Assert(M.rows == 3 && M.cols == 3);
+     CV_Assert(dsize.area() > 0);
  
      xmap.create(dsize, CV_32FC1);
      ymap.create(dsize, CV_32FC1);
@@@ -235,19 -251,21 +251,21 @@@
  
      oclMat coeffsOclMat(coeffsMat.reshape(1, 1));
  
-     Context *clCxt = Context::getContext();
-     String kernelName = "buildWarpPerspectiveMaps";
-     std::vector< std::pair<size_t, const void *> > args;
+     int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
+     int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
  
 -    vector< pair<size_t, const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&xmap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&ymap.data));
 -    args.push_back( make_pair( sizeof(cl_mem), (void *)&coeffsOclMat.data));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.cols));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap.rows));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_step));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&xmap_offset));
 -    args.push_back( make_pair( sizeof(cl_int), (void *)&ymap_offset));
++    std::vector< std::pair<size_t, const void *> > args;
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&coeffsOclMat.data));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
 +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.step));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap.step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
++    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
+ 
+     size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
  
-     size_t globalThreads[3] = {xmap.cols, xmap.rows, 1};
-     size_t localThreads[3]  = {32, 8, 1};
-     openCLExecuteKernel(clCxt, &build_warps, kernelName, globalThreads, localThreads, args, -1, -1);
+     openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpPerspectiveMaps", globalThreads, NULL, args, -1, -1);
  }
diff --cc modules/ocl/src/color.cpp
index eec103a,6e2f403..c93f4bf
--- a/modules/ocl/src/color.cpp
+++ b/modules/ocl/src/color.cpp
@@@ -66,238 -57,423 +57,423 @@@ static void fromRGB_caller(const oclMa
      int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
      int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
  
 -    std::string build_options = format("-D DEPTH_%d", src.depth());
 +    String build_options = format("-D DEPTH_%d", src.depth());
+     if (!additionalOptions.empty())
 -        build_options += additionalOptions;
 -
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
++        build_options = build_options + additionalOptions;
 +
 +    std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&channels));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
  
-     size_t gt[3] = {src.cols, src.rows, 1}, lt[3] = {16, 16, 1};
-     openCLExecuteKernel(src.clCxt, &cvt_color, "RGB2Gray", gt, lt, args, -1, -1, build_options.c_str());
- }
+     if (!data1.empty())
 -        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data1.data ));
++        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data1.data ));
+     if (!data2.empty())
 -        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data ));
++        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data ));
  
- void Gray2RGB_caller(const oclMat &src, oclMat &dst)
- {
-     String build_options = format("-D DEPTH_%d", src.depth());
-     int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
-     int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
- 
-     std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
- 
-     size_t gt[3] = {src.cols, src.rows, 1}, lt[3] = {16, 16, 1};
-     openCLExecuteKernel(src.clCxt, &cvt_color, "Gray2RGB", gt, lt, args, -1, -1, build_options.c_str());
+     size_t gt[3] = { dst.cols, dst.rows, 1 }, lt[3] = { 16, 16, 1 };
+     openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
  }
  
- void RGB2YUV_caller(const oclMat &src, oclMat &dst, int bidx)
+ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
+                          const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
  {
-     int channels = src.oclchannels();
-     String build_options = format("-D DEPTH_%d", src.depth());
 -    std::string build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels());
++    String build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels());
+     if (!additionalOptions.empty())
 -        build_options += additionalOptions;
++        build_options = build_options + additionalOptions;
+ 
      int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
      int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
  
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
 +    std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&channels));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
  
-     size_t gt[3] = {src.cols, src.rows, 1}, lt[3] = {16, 16, 1};
-     openCLExecuteKernel(src.clCxt, &cvt_color, "RGB2YUV", gt, lt, args, -1, -1, build_options.c_str());
+     if (!data.empty())
 -        args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data ));
++        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data ));
+ 
+     size_t gt[3] = { dst.cols, dst.rows, 1 }, lt[3] = { 16, 16, 1 };
+     openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
  }
  
- void YUV2RGB_caller(const oclMat &src, oclMat &dst, int bidx)
+ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
  {
-     int channels = src.oclchannels();
 -    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(),
 -                                       dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
++    String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(),
++                                  dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
      int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
      int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
  
-     String buildOptions = format("-D DEPTH_%d", src.depth());
- 
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
 +    std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&channels));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
  
-     size_t gt[3] = {src.cols, src.rows, 1}, lt[3] = {16, 16, 1};
-     openCLExecuteKernel(src.clCxt, &cvt_color, "YUV2RGB", gt, lt, args, -1, -1, buildOptions.c_str());
+     size_t gt[3] = { dst.cols, dst.rows, 1 }, lt[3] = { 16, 16, 1 };
+     openCLExecuteKernel(src.clCxt, &cvt_color, "RGB", gt, lt, args, -1, -1, build_options.c_str());
  }
  
- void YUV2RGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx)
+ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
  {
-     String build_options = format("-D DEPTH_%d", src.depth());
-     int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
-     int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
 -    std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d",
 -                                       src.depth(), greenbits, dst.channels());
++    String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d",
++                                  src.depth(), greenbits, dst.channels());
+     int src_offset = src.offset >> 1, src_step = src.step >> 1;
+     int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1();
  
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
 +    std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
  
-     size_t gt[3] = {dst.cols / 2, dst.rows / 2, 1}, lt[3] = {16, 16, 1};
-     openCLExecuteKernel(src.clCxt, &cvt_color, "YUV2RGBA_NV12", gt, lt, args, -1, -1, build_options.c_str());
+     size_t gt[3] = { dst.cols, dst.rows, 1 }, lt[3] = { 16, 16, 1 };
+     openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
  }
  
- void RGB2YCrCb_caller(const oclMat &src, oclMat &dst, int bidx)
+ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
  {
-     int channels = src.oclchannels();
-     String build_options = format("-D DEPTH_%d", src.depth());
-     int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
-     int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
 -    std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d",
 -                                       src.depth(), greenbits, src.channels());
++    String build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d",
++                                  src.depth(), greenbits, src.channels());
+     int src_offset = (int)src.offset, src_step = (int)src.step;
+     int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1;
  
 -    vector<pair<size_t , const void *> > args;
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
 -    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
 -    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset ));
 +    std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
++    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&channels));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
 +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
 +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
  
-     size_t gt[3] = {src.cols, src.rows, 1}, lt[3] = {16, 16, 1};
-     openCLExecuteKernel(src.clCxt, &cvt_color, "RGB2YCrCb", gt, lt, args, -1, -1, build_options.c_str());
+     size_t gt[3] = { dst.cols, dst.rows, 1 }, lt[3] = { 16, 16, 1 };
+     openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
  }
  
- void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
+ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
  {
      Size sz = src.size();
-     int scn = src.oclchannels(), depth = src.depth(), bidx;
+     int scn = src.channels(), depth = src.depth(), bidx;
  
      CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F);
  
      switch (code)
      {
-         /*
-         case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
-         case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
-         case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
-         case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
-         case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
-         case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
-         */
-     case COLOR_BGR2GRAY:
-     case COLOR_BGRA2GRAY:
-     case COLOR_RGB2GRAY:
-     case COLOR_RGBA2GRAY:
 -    case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
 -    case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
++    case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
++    case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
+     {
+         CV_Assert(scn == 3 || scn == 4);
 -        dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;
 -        bool reverse = !(code == CV_BGR2BGRA || code == CV_BGRA2BGR);
++        dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
++        bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
+         dst.create(sz, CV_MAKE_TYPE(depth, dcn));
+         RGB_caller(src, dst, reverse);
+         break;
+     }
 -    case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
 -    case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
++    case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
++    case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
+     {
+         CV_Assert((scn == 3 || scn == 4) && depth == CV_8U );
 -        bidx = code == CV_BGR2BGR565 || code == CV_BGR2BGR555 ||
 -            code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2;
 -        int greenbits = code == CV_BGR2BGR565 || code == CV_RGB2BGR565 ||
 -            code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5;
++        bidx = code == COLOR_BGR2BGR565 || code == COLOR_BGR2BGR555 ||
++            code == COLOR_BGRA2BGR565 || code == COLOR_BGRA2BGR555 ? 0 : 2;
++        int greenbits = code == COLOR_BGR2BGR565 || code == COLOR_RGB2BGR565 ||
++            code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
+         dst.create(sz, CV_8UC2);
+         toRGB5x5_caller(src, dst, bidx, greenbits, "RGB2RGB5x5");
+         break;
+     }
 -    case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
 -    case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
++    case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
++    case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
+     {
 -        dcn = code == CV_BGR5652BGRA || code == CV_BGR5552BGRA || code == CV_BGR5652RGBA || code == CV_BGR5552RGBA ? 4 : 3;
++        dcn = code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA || code == COLOR_BGR5652RGBA || code == COLOR_BGR5552RGBA ? 4 : 3;
+         CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);
 -        bidx = code == CV_BGR5652BGR || code == CV_BGR5552BGR ||
 -            code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2;
 -        int greenbits = code == CV_BGR5652BGR || code == CV_BGR5652RGB ||
 -            code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5;
++        bidx = code == COLOR_BGR5652BGR || code == COLOR_BGR5552BGR ||
++            code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA ? 0 : 2;
++        int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
++            code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
+         dst.create(sz, CV_MAKETYPE(depth, dcn));
+         fromRGB5x5_caller(src, dst, bidx, greenbits, "RGB5x52RGB");
+         break;
+     }
 -    case CV_BGR5652GRAY: case CV_BGR5552GRAY:
++    case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
+     {
+         CV_Assert(scn == 2 && depth == CV_8U);
+         dst.create(sz, CV_8UC1);
 -        int greenbits = code == CV_BGR5652GRAY ? 6 : 5;
++        int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
+         fromRGB5x5_caller(src, dst, -1, greenbits, "BGR5x52Gray");
+         break;
+     }
 -    case CV_GRAY2BGR565: case CV_GRAY2BGR555:
++    case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
+     {
+         CV_Assert(scn == 1 && depth == CV_8U);
+         dst.create(sz, CV_8UC2);
 -        int greenbits = code == CV_GRAY2BGR565 ? 6 : 5;
++        int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
+         toRGB5x5_caller(src, dst, -1, greenbits, "Gray2BGR5x5");
+         break;
+     }
 -    case CV_RGB2GRAY: case CV_BGR2GRAY: case CV_RGBA2GRAY: case CV_BGRA2GRAY:
++    case COLOR_RGB2GRAY: case COLOR_BGR2GRAY: case COLOR_RGBA2GRAY: case COLOR_BGRA2GRAY:
      {
          CV_Assert(scn == 3 || scn == 4);
 -        bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
 +        bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
          dst.create(sz, CV_MAKETYPE(depth, 1));
-         RGB2Gray_caller(src, dst, bidx);
+         fromRGB_caller(src, dst, bidx, "RGB2Gray");
          break;
      }
-     case COLOR_GRAY2BGR:
-     case COLOR_GRAY2BGRA:
 -    case CV_GRAY2BGR: case CV_GRAY2BGRA:
++    case COLOR_GRAY2BGR: case COLOR_GRAY2BGRA:
      {
          CV_Assert(scn == 1);
 -        dcn  = code == CV_GRAY2BGRA ? 4 : 3;
 +        dcn  = code == COLOR_GRAY2BGRA ? 4 : 3;
          dst.create(sz, CV_MAKETYPE(depth, dcn));
-         Gray2RGB_caller(src, dst);
+         toRGB_caller(src, dst, 0, "Gray2RGB");
          break;
      }
-     case COLOR_BGR2YUV:
-     case COLOR_RGB2YUV:
 -    case CV_BGR2YUV: case CV_RGB2YUV:
++    case COLOR_BGR2YUV: case COLOR_RGB2YUV:
      {
          CV_Assert(scn == 3 || scn == 4);
-         bidx = code == COLOR_RGB2YUV ? 0 : 2;
 -        bidx = code == CV_BGR2YUV ? 0 : 2;
++        bidx = code == COLOR_BGR2YUV ? 0 : 2;
          dst.create(sz, CV_MAKETYPE(depth, 3));
-         RGB2YUV_caller(src, dst, bidx);
+         fromRGB_caller(src, dst, bidx, "RGB2YUV");
          break;
      }
-     case COLOR_YUV2BGR:
-     case COLOR_YUV2RGB:
 -    case CV_YUV2BGR: case CV_YUV2RGB:
++    case COLOR_YUV2BGR: case COLOR_YUV2RGB:
      {
-         CV_Assert(scn == 3 || scn == 4);
-         bidx = code == COLOR_YUV2RGB ? 0 : 2;
-         dst.create(sz, CV_MAKETYPE(depth, 3));
-         YUV2RGB_caller(src, dst, bidx);
+         if( dcn <= 0 )
+             dcn = 3;
+         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
 -        bidx = code == CV_YUV2BGR ? 0 : 2;
++        bidx = code == COLOR_YUV2BGR ? 0 : 2;
+         dst.create(sz, CV_MAKETYPE(depth, dcn));
+         toRGB_caller(src, dst, bidx, "YUV2RGB");
          break;
      }
-     case COLOR_YUV2RGB_NV12:
-     case COLOR_YUV2BGR_NV12:
-     case COLOR_YUV2RGBA_NV12:
-     case COLOR_YUV2BGRA_NV12:
 -    case CV_YUV2RGB_NV12: case CV_YUV2BGR_NV12:
 -    case CV_YUV2RGBA_NV12: case CV_YUV2BGRA_NV12:
++    case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12:
++    case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12:
      {
          CV_Assert(scn == 1);
          CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
-         dcn  = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3;
 -        dcn = code == CV_YUV2BGRA_NV12 || code == CV_YUV2RGBA_NV12 ? 4 : 3;
 -        bidx = code == CV_YUV2BGRA_NV12 || code == CV_YUV2BGR_NV12 ? 0 : 2;
++        dcn = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3;
 +        bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ? 0 : 2;
  
          Size dstSz(sz.width, sz.height * 2 / 3);
          dst.create(dstSz, CV_MAKETYPE(depth, dcn));
-         YUV2RGB_NV12_caller(src, dst, bidx);
+         toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12");
          break;
      }
-     case COLOR_BGR2YCrCb:
-     case COLOR_RGB2YCrCb:
 -    case CV_BGR2YCrCb: case CV_RGB2YCrCb:
++    case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
      {
          CV_Assert(scn == 3 || scn == 4);
 -        bidx = code == CV_BGR2YCrCb ? 0 : 2;
 +        bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
          dst.create(sz, CV_MAKETYPE(depth, 3));
-         RGB2YCrCb_caller(src, dst, bidx);
+         fromRGB_caller(src, dst, bidx, "RGB2YCrCb");
          break;
      }
-     case COLOR_YCrCb2BGR:
-     case COLOR_YCrCb2RGB:
 -    case CV_YCrCb2BGR: case CV_YCrCb2RGB:
++    case COLOR_YCrCb2BGR: case COLOR_YCrCb2RGB:
      {
+         if( dcn <= 0 )
+             dcn = 3;
+         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
 -        bidx = code == CV_YCrCb2BGR ? 0 : 2;
++        bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
+         dst.create(sz, CV_MAKETYPE(depth, dcn));
+         toRGB_caller(src, dst, bidx, "YCrCb2RGB");
          break;
      }
-     /*
-     case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
-     case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
-     case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
 -    case CV_BGR2XYZ: case CV_RGB2XYZ:
 +    case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
+     {
+         CV_Assert(scn == 3 || scn == 4);
 -        bidx = code == CV_BGR2XYZ ? 0 : 2;
++        bidx = code == COLOR_BGR2XYZ ? 0 : 2;
+         dst.create(sz, CV_MAKE_TYPE(depth, 3));
+ 
+         Mat c;
+         if (depth == CV_32F)
+         {
+             float coeffs[] =
+             {
+                 0.412453f, 0.357580f, 0.180423f,
+                 0.212671f, 0.715160f, 0.072169f,
+                 0.019334f, 0.119193f, 0.950227f
+             };
+             if (bidx == 0)
+             {
+                 std::swap(coeffs[0], coeffs[2]);
+                 std::swap(coeffs[3], coeffs[5]);
+                 std::swap(coeffs[6], coeffs[8]);
+             }
+             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
+         }
+         else
+         {
+             int coeffs[] =
+             {
+                 1689,    1465,    739,
+                 871,     2929,    296,
+                 79,      488,     3892
+             };
+             if (bidx == 0)
+             {
+                 std::swap(coeffs[0], coeffs[2]);
+                 std::swap(coeffs[3], coeffs[5]);
+                 std::swap(coeffs[6], coeffs[8]);
+             }
+             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
+         }
+         oclMat oclCoeffs(c);
+ 
+         fromRGB_caller(src, dst, bidx, "RGB2XYZ", "", oclCoeffs);
+         break;
+     }
 -    case CV_XYZ2BGR: case CV_XYZ2RGB:
 +    case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
+     {
+         if (dcn <= 0)
+             dcn = 3;
+         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
 -        bidx = code == CV_XYZ2BGR ? 0 : 2;
++        bidx = code == COLOR_XYZ2BGR ? 0 : 2;
+         dst.create(sz, CV_MAKE_TYPE(depth, dcn));
+ 
+         Mat c;
+         if (depth == CV_32F)
+         {
+             float coeffs[] =
+             {
+                 3.240479f, -1.53715f, -0.498535f,
+                 -0.969256f, 1.875991f, 0.041556f,
+                 0.055648f, -0.204043f, 1.057311f
+             };
+             if (bidx == 0)
+             {
+                 std::swap(coeffs[0], coeffs[6]);
+                 std::swap(coeffs[1], coeffs[7]);
+                 std::swap(coeffs[2], coeffs[8]);
+             }
+             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
+         }
+         else
+         {
+             int coeffs[] =
+             {
+                 13273,  -6296,  -2042,
+                 -3970,   7684,    170,
+                   228,   -836,   4331
+             };
+             if (bidx == 0)
+             {
+                 std::swap(coeffs[0], coeffs[6]);
+                 std::swap(coeffs[1], coeffs[7]);
+                 std::swap(coeffs[2], coeffs[8]);
+             }
+             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
+         }
+         oclMat oclCoeffs(c);
+ 
+         toRGB_caller(src, dst, bidx, "XYZ2RGB", "", oclCoeffs);
+         break;
+     }
 -    case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
 -    case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
 +    case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
 +    case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
+     {
+         CV_Assert((scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F));
 -        bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||
 -            code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;
 -        int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||
 -            code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 256;
 -        bool is_hsv = code == CV_BGR2HSV || code == CV_RGB2HSV || code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL;
++        bidx = code == COLOR_BGR2HSV || code == COLOR_BGR2HLS ||
++            code == COLOR_BGR2HSV_FULL || code == COLOR_BGR2HLS_FULL ? 0 : 2;
++        int hrange = depth == CV_32F ? 360 : code == COLOR_BGR2HSV || code == COLOR_RGB2HSV ||
++            code == COLOR_BGR2HLS || code == COLOR_RGB2HLS ? 180 : 256;
++        bool is_hsv = code == COLOR_BGR2HSV || code == COLOR_RGB2HSV || code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
+         dst.create(sz, CV_MAKETYPE(depth, 3));
+         std::string kernelName = std::string("RGB2") + (is_hsv ? "HSV" : "HLS");
+ 
+         if (is_hsv && depth == CV_8U)
+         {
+             static oclMat sdiv_data;
+             static oclMat hdiv_data180;
+             static oclMat hdiv_data256;
+             static int sdiv_table[256];
+             static int hdiv_table180[256];
+             static int hdiv_table256[256];
+             static volatile bool initialized180 = false, initialized256 = false;
+             volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;
+ 
+             if (!initialized)
+             {
+                 int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
+                 oclMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;
+ 
+                 sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
+ 
+                 int v = 255 << hsv_shift;
+                 if (!initialized180 && !initialized256)
+                 {
+                     for(int i = 1; i < 256; i++ )
+                         sdiv_table[i] = saturate_cast<int>(v/(1.*i));
+                     sdiv_data.upload(Mat(1, 256, CV_32SC1, sdiv_table));
+                 }
+ 
+                 v = hrange << hsv_shift;
+                 for (int i = 1; i < 256; i++ )
+                     hdiv_table[i] = saturate_cast<int>(v/(6.*i));
+ 
+                 hdiv_data.upload(Mat(1, 256, CV_32SC1, hdiv_table));
+                 initialized = true;
+             }
+ 
+             fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
+             return;
+         }
+ 
+         fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
+         break;
+     }
 -    case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
 -    case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
 +    case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
 +    case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
-     */
+     {
+         if (dcn <= 0)
+             dcn = 3;
+         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
 -        bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||
 -            code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;
 -        int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||
 -            code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;
 -        bool is_hsv = code == CV_HSV2BGR || code == CV_HSV2RGB ||
 -                code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL;
++        bidx = code == COLOR_HSV2BGR || code == COLOR_HLS2BGR ||
++            code == COLOR_HSV2BGR_FULL || code == COLOR_HLS2BGR_FULL ? 0 : 2;
++        int hrange = depth == CV_32F ? 360 : code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
++            code == COLOR_HLS2BGR || code == COLOR_HLS2RGB ? 180 : 255;
++        bool is_hsv = code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
++                code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL;
+ 
+         dst.create(sz, CV_MAKETYPE(depth, dcn));
+ 
+         std::string kernelName = std::string(is_hsv ? "HSV" : "HLS") + "2RGB";
+         toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
+         break;
+     }
 -    case CV_RGBA2mRGBA: case CV_mRGBA2RGBA:
++    case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
+         {
+             CV_Assert(scn == 4 && depth == CV_8U);
+             dst.create(sz, CV_MAKETYPE(depth, 4));
 -            std::string kernelName = code == CV_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA";
++            std::string kernelName = code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA";
+ 
+             fromRGB_caller(src, dst, 0, kernelName);
+             break;
+         }
      default:
 -        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
 +        CV_Error(Error::StsBadFlag, "Unknown/unsupported color conversion code" );
      }
  }
- }
  
  void cv::ocl::cvtColor(const oclMat &src, oclMat &dst, int code, int dcn)
  {
diff --cc modules/ocl/src/haar.cpp
index fd67daf,deff867..c10d089
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@@ -996,10 -1011,15 +1011,15 @@@ void OclCascadeClassifier::detectMultiS
          int n_factors = 0;
          oclMat gsum;
          oclMat gsqsum;
-         cv::ocl::integral(gimg, gsum, gsqsum);
+         oclMat gsqsum_t;
+         cv::ocl::integral(gimg, gsum, gsqsum_t);
+         if(gsqsum_t.depth() == CV_64F)
+             gsqsum_t.convertTo(gsqsum, CV_32FC1);
+         else
+             gsqsum = gsqsum_t;
          CvSize sz;
 -        vector<CvSize> sizev;
 -        vector<float> scalev;
 +        std::vector<CvSize> sizev;
 +        std::vector<float> scalev;
          gpuSetHaarClassifierCascade(cascade);
          gcascade   = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
          stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
diff --cc modules/ocl/src/imgproc.cpp
index 1861c98,141325b..549e9c7
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@@ -217,27 -214,27 +215,27 @@@ namespace c
              const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                                     "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
  
 -            string kernelName = "remap";
 +            String kernelName = "remap";
              if (map1.type() == CV_32FC2 && map2.empty())
 -                kernelName += "_32FC2";
 +                kernelName = kernelName + "_32FC2";
              else if (map1.type() == CV_16SC2)
              {
 -                kernelName += "_16SC2";
 +                kernelName = kernelName + "_16SC2";
                  if (!map2.empty())
 -                    kernelName += "_16UC1";
 +                    kernelName = kernelName + "_16UC1";
              }
              else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
 -                kernelName += "_2_32FC1";
 +                kernelName = kernelName + "_2_32FC1";
              else
 -                CV_Error(CV_StsBadArg, "Unsupported map types");
 +                CV_Error(Error::StsBadArg, "Unsupported map types");
  
              int ocn = dst.oclchannels();
-             size_t localThreads[3] = { 16, 16, 1};
-             size_t globalThreads[3] = { dst.cols, dst.rows, 1};
+             size_t localThreads[3] = { 256, 1, 1 };
+             size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
  
              Mat scalar(1, 1, CV_MAKE_TYPE(dst.depth(), ocn), borderValue);
 -            std::string buildOptions = format("-D %s -D %s -D T=%s%s", interMap[interpolation],
 -                                              borderMap[borderType], typeMap[src.depth()], channelMap[ocn]);
 +            String buildOptions = format("-D %s -D %s -D T=%s%s", interMap[interpolation],
 +                                         borderMap[borderType], typeMap[src.depth()], channelMap[ocn]);
  
              if (interpolation != INTER_NEAREST)
              {
@@@ -312,57 -304,30 +306,30 @@@
                  glbSizeX = cols % blkSizeX == 0 && cols != 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
              }
              else
-                 glbSizeX = dst.cols % blkSizeX == 0 && dst.cols != 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
+                 glbSizeX = dst.cols;
  
-             size_t glbSizeY = dst.rows % blkSizeY == 0 && dst.rows != 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
-             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
-             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
+             size_t globalThreads[3] = { glbSizeX, dst.rows, 1 };
+             size_t localThreads[3] = { blkSizeX, blkSizeY, 1 };
  
              std::vector< std::pair<size_t, const void *> > args;
-             if (interpolation == INTER_NEAREST)
-             {
-                 args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-                 args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.cols));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.rows));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-                 if (src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
-                 {
-                     args.push_back( std::make_pair(sizeof(cl_double), (void *)&ifx_d));
-                     args.push_back( std::make_pair(sizeof(cl_double), (void *)&ify_d));
-                 }
-                 else
-                 {
-                     args.push_back( std::make_pair(sizeof(cl_float), (void *)&ifx));
-                     args.push_back( std::make_pair(sizeof(cl_float), (void *)&ify));
-                 }
-             }
-             else
-             {
-                 args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-                 args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.cols));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.rows));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-                 args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-                 args.push_back( std::make_pair(sizeof(cl_float), (void *)&ifx));
-                 args.push_back( std::make_pair(sizeof(cl_float), (void *)&ify));
-             }
 -            args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 -            args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 -            args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 -            args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
 -            args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
++            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
++            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_offset));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_offset));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_step));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_step));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.cols));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.rows));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
++            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
++            args.push_back( std::make_pair(sizeof(cl_float), (void *)&ifx));
++            args.push_back( std::make_pair(sizeof(cl_float), (void *)&ify));
  
-             openCLExecuteKernel(clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
+             openCLExecuteKernel(src.clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args,
+                                 ocn, depth, buildOption.c_str());
          }
  
-         void resize(const oclMat &src, oclMat &dst, Size dsize,
-                     double fx, double fy, int interpolation)
+         void resize(const oclMat &src, oclMat &dst, Size dsize, double fx, double fy, int interpolation)
          {
              CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4
                        || src.type() == CV_32FC1 || src.type() == CV_32FC3 || src.type() == CV_32FC4);
@@@ -807,42 -781,41 +783,41 @@@
              t_sum.create(src.cols, src.rows, type);
              sum.create(h, w, type);
  
-             t_sqsum.create(src.cols, src.rows, CV_32FC1);
-             sqsum.create(h, w, CV_32FC1);
- 
-             int sum_offset = sum.offset / vlen;
-             int sqsum_offset = sqsum.offset / vlen;
+             int sum_offset = sum.offset / sum.elemSize();
+             int sqsum_offset = sqsum.offset / sqsum.elemSize();
  
 -            vector<pair<size_t , const void *> > args;
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sqsum.step));
 +            std::vector<std::pair<size_t , const void *> > args;
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step));
++            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sqsum.step));
              size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
-             openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, depth);
+             openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, sdepth, build_option);
  
              args.clear();
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&sqsum.data ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sqsum.step));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum.step));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sqsum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
++            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sqsum.step));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum.step));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sqsum.step));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
              size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
-             openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, depth);
+             openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, sdepth, build_option);
          }
  
-         void integral(const oclMat &src, oclMat &sum)
+         void integral(const oclMat &src, oclMat &sum, int sdepth)
          {
              CV_Assert(src.type() == CV_8UC1);
              int vlen = 4;
@@@ -859,28 -835,28 +837,28 @@@
              sum.create(h, w, type);
  
              int sum_offset = sum.offset / vlen;
 -            vector<pair<size_t , const void *> > args;
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
 +            std::vector<std::pair<size_t , const void *> > args;
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step));
              size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
-             openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, depth);
+             openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, sdepth);
  
              args.clear();
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
 -            args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
 -            args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum.step));
 +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset));
              size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
-             openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, depth);
+             openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, sdepth);
          }
  
          /////////////////////// corner //////////////////////////////
diff --cc modules/ocl/src/split_merge.cpp
index 073a7a7,400a055..0bd5eb7
--- a/modules/ocl/src/split_merge.cpp
+++ b/modules/ocl/src/split_merge.cpp
@@@ -249,8 -250,9 +249,9 @@@ namespace c
                          && devInfo.deviceType == CVCL_DEVICE_TYPE_CPU
                          && devInfo.platform->platformVendor.find("Intel") != std::string::npos
                          && (devInfo.deviceVersion.find("Build 56860") != std::string::npos
-                             || devInfo.deviceVersion.find("Build 76921") != std::string::npos))
+                             || devInfo.deviceVersion.find("Build 76921") != std::string::npos
+                             || devInfo.deviceVersion.find("Build 78712") != std::string::npos))
 -                    build_options += " -D BYPASS_VSTORE=true";
 +                    build_options = build_options + " -D BYPASS_VSTORE=true";
  
                  size_t globalThreads[3] = { divUp(src.cols, VEC_SIZE), src.rows, 1 };
                  openCLExecuteKernel(clCtx, &split_mat, kernelName, globalThreads, NULL, args, -1, -1, build_options.c_str());
diff --cc modules/ocl/test/test_color.cpp
index cc7843d,7c4fe39..c4641d4
--- a/modules/ocl/test/test_color.cpp
+++ b/modules/ocl/test/test_color.cpp
@@@ -44,14 -44,10 +44,13 @@@
  //M*/
  
  #include "test_precomp.hpp"
 +
 +using namespace cv;
 +
  #ifdef HAVE_OPENCL
  
- namespace
- {
  using namespace testing;
+ using namespace cv;
  
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
  // cvtColor
@@@ -203,26 -288,17 +291,17 @@@ struct CvtColor_YUV420 
      }
  };
  
- OCL_TEST_P(CvtColor_YUV420, YUV2RGBA_NV12)
- {
-     doTest(1, 4, COLOR_YUV2RGBA_NV12);
- };
- 
- OCL_TEST_P(CvtColor_YUV420, YUV2BGRA_NV12)
- {
-     doTest(1, 4, COLOR_YUV2BGRA_NV12);
- };
 -OCL_TEST_P(CvtColor_YUV420, YUV2RGBA_NV12) { doTest(1, 4, CV_YUV2RGBA_NV12); }
 -OCL_TEST_P(CvtColor_YUV420, YUV2BGRA_NV12) { doTest(1, 4, CV_YUV2BGRA_NV12); }
 -OCL_TEST_P(CvtColor_YUV420, YUV2RGB_NV12) { doTest(1, 3, CV_YUV2RGB_NV12); }
 -OCL_TEST_P(CvtColor_YUV420, YUV2BGR_NV12) { doTest(1, 3, CV_YUV2BGR_NV12); }
++OCL_TEST_P(CvtColor_YUV420, YUV2RGBA_NV12) { doTest(1, 4, COLOR_YUV2RGBA_NV12); }
++OCL_TEST_P(CvtColor_YUV420, YUV2BGRA_NV12) { doTest(1, 4, COLOR_YUV2BGRA_NV12); }
++OCL_TEST_P(CvtColor_YUV420, YUV2RGB_NV12) { doTest(1, 3, COLOR_YUV2RGB_NV12); }
++OCL_TEST_P(CvtColor_YUV420, YUV2BGR_NV12) { doTest(1, 3, COLOR_YUV2BGR_NV12); }
  
- OCL_TEST_P(CvtColor_YUV420, YUV2RGB_NV12)
- {
-     doTest(1, 3, COLOR_YUV2RGB_NV12);
- };
  
- OCL_TEST_P(CvtColor_YUV420, YUV2BGR_NV12)
- {
-     doTest(1, 3, COLOR_YUV2BGR_NV12);
- };
+ INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor8u,
+                             testing::Combine(testing::Values(MatDepth(CV_8U)), Bool()));
  
+ INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor8u32f,
+                             testing::Combine(testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)), Bool()));
  
  INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor,
                              testing::Combine(
diff --cc samples/cpp/Qt_sample/main.cpp
index 16345d5,8794539..92bc2b5
--- a/samples/cpp/Qt_sample/main.cpp
+++ b/samples/cpp/Qt_sample/main.cpp
@@@ -4,15 -4,7 +4,11 @@@
  
  #include <iostream>
  #include <vector>
- 
 -#include <opencv2/highgui/highgui.hpp>
 +#include <opencv2/core/core_c.h>
- #include <opencv2/imgproc/imgproc_c.h>
- #include <opencv2/legacy/compat.hpp>
 +#include <opencv2/calib3d/calib3d_c.h>
- 
 +#include <opencv2/imgproc.hpp>
 +#include <opencv2/highgui.hpp>
- #include <opencv2/calib3d.hpp>
++#include <opencv2/legacy/compat.hpp>
  
  #if defined WIN32 || defined _WIN32 || defined WINCE
      #include <windows.h>
@@@ -116,19 -110,19 +112,16 @@@ static void initPOSIT(std::vector<CvPoi
      modelPoints->push_back(cvPoint3D32f(0.0f, CUBE_SIZE, 0.0f));
  }
  
- static void foundCorners(vector<CvPoint2D32f> *srcImagePoints,IplImage* source, IplImage* grayImage)
+ static void foundCorners(vector<CvPoint2D32f> *srcImagePoints, const Mat& source, Mat& grayImage)
  {
-     cvCvtColor(source,grayImage,CV_RGB2GRAY);
-     cvSmooth( grayImage, grayImage,CV_GAUSSIAN,11);
-     cvNormalize(grayImage, grayImage, 0, 255, CV_MINMAX);
-     cvThreshold( grayImage, grayImage, 26, 255, CV_THRESH_BINARY_INV);//25
- 
-     Mat MgrayImage = cv::cvarrToMat(grayImage);
-     //For debug
-     //MgrayImage = MgrayImage.clone();//deep copy
+     cvtColor(source, grayImage, COLOR_RGB2GRAY);
+     GaussianBlur(grayImage, grayImage, Size(11,11), 0, 0);
+     normalize(grayImage, grayImage, 0, 255, NORM_MINMAX);
+     threshold(grayImage, grayImage, 26, 255, THRESH_BINARY_INV); //25
+ 
 -    Mat MgrayImage = grayImage;
 -    //For debug
 -    //MgrayImage = MgrayImage.clone();//deep copy
      vector<vector<Point> > contours;
      vector<Vec4i> hierarchy;
-     findContours(MgrayImage, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_NONE);
 -    findContours(MgrayImage, contours, hierarchy, CV_RETR_EXTERNAL, CV_CHAIN_APPROX_NONE);
++    findContours(grayImage, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_NONE);
  
      Point p;
      vector<CvPoint2D32f> srcImagePoints_temp(4,cvPoint2D32f(0,0));
@@@ -199,7 -193,7 +192,7 @@@
              ss.str("");
  
              //new coordinate system in the middle of the frame and reversed (camera coordinate system)
--            srcImagePoints->at(i) = cvPoint2D32f(srcImagePoints_temp.at(i).x-source->width/2,source->height/2-srcImagePoints_temp.at(i).y);
++            srcImagePoints->at(i) = cvPoint2D32f(srcImagePoints_temp.at(i).x-source.cols/2,source.rows/2-srcImagePoints_temp.at(i).y);
          }
      }
  
@@@ -232,15 -226,14 +225,14 @@@ int main(void
      VideoCapture video("cube4.avi");
      CV_Assert(video.isOpened());
  
-     Mat frame; video >> frame;
+     Mat source, grayImage;
  
-     IplImage* grayImage = cvCreateImage(frame.size(),8,1);
+     video >> source;
  
 -    namedWindow("original", WINDOW_AUTOSIZE | CV_WINDOW_FREERATIO);
 -    namedWindow("POSIT", WINDOW_AUTOSIZE | CV_WINDOW_FREERATIO);
 +    namedWindow("original", WINDOW_AUTOSIZE | WINDOW_FREERATIO);
 +    namedWindow("POSIT", WINDOW_AUTOSIZE | WINDOW_FREERATIO);
      displayOverlay("POSIT", "We lost the 4 corners' detection quite often (the red circles disappear). This demo is only to illustrate how to use OpenGL callback.\n -- Press ESC to exit.", 10000);
-     //For debug
-     //cvNamedWindow("tempGray",CV_WINDOW_AUTOSIZE);
+ 
      float OpenGLMatrix[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
      setOpenGlDrawCallback("POSIT",on_opengl,OpenGLMatrix);
  
@@@ -259,20 -252,17 +251,17 @@@
  
      while(waitKey(33) != 27)
      {
-         video >> frame;
-         imshow("original", frame);
+         video >> source;
+         imshow("original",source);
  
-         IplImage source = frame;
-         foundCorners(&srcImagePoints, &source, grayImage);
 -        foundCorners(&srcImagePoints,source,grayImage);
++        foundCorners(&srcImagePoints, source, grayImage);
          cvPOSIT( positObject, &srcImagePoints[0], FOCAL_LENGTH, criteria, rotation_matrix, translation_vector );
          createOpenGLMatrixFrom(OpenGLMatrix,rotation_matrix,translation_vector);
  
-         imshow("POSIT", frame);
-         //For debug
-         //cvShowImage("tempGray",grayImage);
+         imshow("POSIT",source);
  
 -        if (VideoCapture::get(video,CV_CAP_PROP_POS_AVI_RATIO)>0.99)
 -            VideoCapture::get(video,CV_CAP_PROP_POS_AVI_RATIO,0);
 +        if (video.get(CAP_PROP_POS_AVI_RATIO) > 0.99)
 +            video.set(CAP_PROP_POS_AVI_RATIO, 0);
      }
  
      destroyAllWindows();
diff --cc samples/cpp/segment_objects.cpp
index d44b035,6438b89..852fa15
--- a/samples/cpp/segment_objects.cpp
+++ b/samples/cpp/segment_objects.cpp
@@@ -95,9 -94,7 +95,7 @@@ int main(int argc, char** argv
          cap >> tmp_frame;
          if( !tmp_frame.data )
              break;
 -        bgsubtractor(tmp_frame, bgmask, update_bg_model ? -1 : 0);
 +        bgsubtractor->apply(tmp_frame, bgmask, update_bg_model ? -1 : 0);
-         //CvMat _bgmask = bgmask;
-         //cvSegmentFGMask(&_bgmask);
          refineSegments(tmp_frame, bgmask, out_frame);
          imshow("video", tmp_frame);
          imshow("segmented", out_frame);
diff --cc samples/cpp/tutorial_code/ShapeDescriptors/hull_demo.cpp
index 395b4b3,e38003e..499eb45
--- a/samples/cpp/tutorial_code/ShapeDescriptors/hull_demo.cpp
+++ b/samples/cpp/tutorial_code/ShapeDescriptors/hull_demo.cpp
@@@ -59,10 -59,10 +59,10 @@@ void thresh_callback(int, void* 
    threshold( src_gray, threshold_output, thresh, 255, THRESH_BINARY );
  
    /// Find contours
 -  findContours( threshold_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) );
 +  findContours( threshold_output, contours, hierarchy, RETR_TREE, CHAIN_APPROX_SIMPLE, Point(0, 0) );
  
    /// Find the convex hull object for each contour
-  vector<vector<Point> >hull( contours.size() );
+   vector<vector<Point> >hull( contours.size() );
    for( size_t i = 0; i < contours.size(); i++ )
       {   convexHull( Mat(contours[i]), hull[i], false ); }
  
diff --cc samples/cpp/tutorial_code/introduction/windows_visual_studio_Opencv/Test.cpp
index b5470ef,240a2e1..a2597a7
--- a/samples/cpp/tutorial_code/introduction/windows_visual_studio_Opencv/Test.cpp
+++ b/samples/cpp/tutorial_code/introduction/windows_visual_studio_Opencv/Test.cpp
@@@ -76,11 -76,11 +76,11 @@@ int main(int argc, char *argv[]
      // Windows
              namedWindow(WIN_RF, WINDOW_AUTOSIZE );
              namedWindow(WIN_UT, WINDOW_AUTOSIZE );
-             moveWindow(WIN_RF, 400       ,            0); //750,  2 (bernat =0)
-             moveWindow(WIN_UT, refS.width,            0); //1500, 2
+             moveWindow(WIN_RF, 400       ,            0);		 //750,  2 (bernat =0)
+             moveWindow(WIN_UT, refS.width,            0);		 //1500, 2
  
      cout << "Frame resolution: Width=" << refS.width << "  Height=" << refS.height
 -         << " of nr#: " << captRefrnc.get(CV_CAP_PROP_FRAME_COUNT) << endl;
 +         << " of nr#: " << captRefrnc.get(CAP_PROP_FRAME_COUNT) << endl;
  
      cout << "PSNR trigger value " <<
            setiosflags(ios::fixed) << setprecision(3) << psnrTriggerValue << endl;
diff --cc samples/cpp/video_dmtx.cpp
index 88dc28a,01eadd4..e410f7c
--- a/samples/cpp/video_dmtx.cpp
+++ b/samples/cpp/video_dmtx.cpp
@@@ -52,8 -52,8 +52,8 @@@ namespac
              if (frame.empty())
                  break;
              cv::Mat gray;
-             cv::cvtColor(frame,gray, COLOR_RGB2GRAY);
+             cv::cvtColor(frame,gray,COLOR_RGB2GRAY);
 -            vector<string> codes;
 +            vector<String> codes;
              Mat corners;
              findDataMatrix(gray, codes, corners);
              drawDataMatrixCodes(frame, codes, corners);