Merge remote-tracking branch 'origin/2.4'

author Roman Donchenko <roman.donchenko@itseez.com>

Tue, 4 Jun 2013 14:31:51 +0000 (18:31 +0400)

committer Roman Donchenko <roman.donchenko@itseez.com>

Wed, 5 Jun 2013 11:42:07 +0000 (15:42 +0400)
author Roman Donchenko <roman.donchenko@itseez.com>
Tue, 4 Jun 2013 14:31:51 +0000 (18:31 +0400)
committer Roman Donchenko <roman.donchenko@itseez.com>
Wed, 5 Jun 2013 11:42:07 +0000 (15:42 +0400)
diff --cc CMakeLists.txt
Simple merge
diff --cc apps/traincascade/boost.cpp
Simple merge
diff --cc modules/calib3d/src/solvepnp.cpp
Simple merge
diff --cc modules/core/include/opencv2/core/cuda/detail/color_detail.hpp
Simple merge
diff --cc modules/core/include/opencv2/core/cuda/functional.hpp
Simple merge
diff --cc modules/core/include/opencv2/core/cuda/utility.hpp
Simple merge
diff --cc modules/features2d/src/detectors.cpp

index 8752dd2,a1e389a..c20d573
--- 1/modules/features2d/src/detectors.cpp
--- 2/modules/features2d/src/detectors.cpp
+++ b/modules/features2d/src/detectors.cpp
@@@ -229,15 -229,12 +227,12 @@@ private
   
   public:
   
-     GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask, std::vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols
- #ifdef HAVE_TBB
-         , tbb::mutex* kptLock
- #endif
-         ) : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
-             keypoints_(keypoints), image_(image), mask_(mask), detector_(detector)
- #ifdef HAVE_TBB
-             , kptLock_(kptLock)
- #endif
+     GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask,
- -                                      vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols,
++                                      std::vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols,
+                                       cv::Mutex* kptLock)
+         : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
+           keypoints_(keypoints), image_(image), mask_(mask), detector_(detector),
+           kptLock_(kptLock)
       {
       }
   
diff --cc modules/gpu/CMakeLists.txt
Simple merge
diff --cc modules/gpu/src/calib3d.cpp

index abcc342,b84f09d..5a4e94c
--- 1/modules/gpu/src/calib3d.cpp
--- 2/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@@ -160,10 -161,10 +160,10 @@@ namespac
                     num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
                     transl_vectors(transl_vectors_) {}
   
-         void operator()(const BlockedRange& range) const
+         void operator()(const Range& range) const
           {
               // Input data for generation of the current hypothesis
- -            vector<int> subset_indices(subset_size);
+ +            std::vector<int> subset_indices(subset_size);
               Mat_<Point3f> object_subset(1, subset_size);
               Mat_<Point2f> image_subset(1, subset_size);
   
diff --cc modules/gpu/src/cuda/calib3d.cu
Simple merge
diff --cc modules/gpu/src/cuda/canny.cu
Simple merge
diff --cc modules/gpu/src/cuda/element_operations.cu
Simple merge
diff --cc modules/gpu/src/matrix_reductions.cpp
Simple merge
diff --cc modules/gpu/test/test_core.cpp
Simple merge
diff --cc modules/imgproc/doc/miscellaneous_transformations.rst
Simple merge
diff --cc modules/imgproc/src/color.cpp
Simple merge
diff --cc modules/imgproc/src/distransform.cpp

index b8ab50c,d3e6f90..8ebb206
--- 1/modules/imgproc/src/distransform.cpp
--- 2/modules/imgproc/src/distransform.cpp
+++ b/modules/imgproc/src/distransform.cpp
@@@ -438,13 -434,18 +438,13 @@@ static void getDistanceTransformMask( i
           metrics[2] = 2.1969f;
           break;
       default:
- -        return CV_BADRANGE_ERR;
+ +        CV_Error(CV_StsBadArg, "Uknown metric type");
       }
- -
- -    return CV_OK;
   }
   
- struct DTColumnInvoker
- -namespace cv
- -{
- -
+ struct DTColumnInvoker : ParallelLoopBody
   {
- -    DTColumnInvoker( const CvMat* _src, CvMat* _dst, const int* _sat_tab, const float* _sqr_tab)
+ +    DTColumnInvoker( const Mat* _src, Mat* _dst, const int* _sat_tab, const float* _sqr_tab)
       {
           src = _src;
           dst = _dst;
@@@ -489,9 -490,9 +489,9 @@@
   };
   
   
- struct DTRowInvoker
+ struct DTRowInvoker : ParallelLoopBody
   {
- -    DTRowInvoker( CvMat* _dst, const float* _sqr_tab, const float* _inv_tab )
+ +    DTRowInvoker( Mat* _dst, const float* _sqr_tab, const float* _inv_tab )
       {
           dst = _dst;
           sqr_tab = _sqr_tab;
@@@ -578,7 -586,7 +578,7 @@@ trueDistTrans( const Mat& src, Mat& ds
       for( ; i <= m*3; i++ )
           sat_tab[i] = i - shift;
   
-     cv::parallel_for(cv::BlockedRange(0, n), cv::DTColumnInvoker(&src, &dst, sat_tab, sqr_tab));
- -    cv::parallel_for_(cv::Range(0, n), cv::DTColumnInvoker(src, dst, sat_tab, sqr_tab));
++    cv::parallel_for_(cv::Range(0, n), cv::DTColumnInvoker(&src, &dst, sat_tab, sqr_tab));
   
       // stage 2: compute modified distance transform for each row
       float* inv_tab = sqr_tab + n;
@@@ -590,7 -598,7 +590,7 @@@
           sqr_tab[i] = (float)(i*i);
       }
   
-     cv::parallel_for(cv::BlockedRange(0, m), cv::DTRowInvoker(&dst, sqr_tab, inv_tab));
- -    cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(dst, sqr_tab, inv_tab));
++    cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(&dst, sqr_tab, inv_tab));
   }
   
   
diff --cc modules/imgproc/src/histogram.cpp
Simple merge
diff --cc modules/imgproc/src/morph.cpp

index b4e08a6,53d2347..e0e27cc
--- 1/modules/imgproc/src/morph.cpp
--- 2/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@@ -1102,10 -1102,10 +1102,10 @@@ public
           columnBorderType = _columnBorderType;
       }
   
-     void operator () ( const BlockedRange& range ) const
+     void operator () ( const Range& range ) const
       {
-         int row0 = std::min(cvRound(range.begin() * src.rows / nStripes), src.rows);
-         int row1 = std::min(cvRound(range.end() * src.rows / nStripes), src.rows);
- -        int row0 = min(cvRound(range.start * src.rows / nStripes), src.rows);
- -        int row1 = min(cvRound(range.end * src.rows / nStripes), src.rows);
++        int row0 = std::min(cvRound(range.start * src.rows / nStripes), src.rows);
++        int row1 = std::min(cvRound(range.end * src.rows / nStripes), src.rows);
   
           /*if(0)
               printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n",
diff --cc modules/ml/src/gbt.cpp
Simple merge
diff --cc modules/ml/src/svm.cpp
Simple merge
diff --cc modules/nonfree/src/surf.cpp

index 34d7b96,2fc459f..98d1449
--- 1/modules/nonfree/src/surf.cpp
--- 2/modules/nonfree/src/surf.cpp
+++ b/modules/nonfree/src/surf.cpp
@@@ -258,11 -258,11 +258,11 @@@ interpolateKeypoint( float N9[3][9], in
   }
   
   // Multi-threaded construction of the scale-space pyramid
- struct SURFBuildInvoker
+ struct SURFBuildInvoker : ParallelLoopBody
   {
- -    SURFBuildInvoker( const Mat& _sum, const vector<int>& _sizes,
- -                      const vector<int>& _sampleSteps,
- -                      vector<Mat>& _dets, vector<Mat>& _traces )
+ +    SURFBuildInvoker( const Mat& _sum, const std::vector<int>& _sizes,
+ +                      const std::vector<int>& _sampleSteps,
+ +                      std::vector<Mat>& _dets, std::vector<Mat>& _traces )
       {
           sum = &_sum;
           sizes = &_sizes;
@@@ -285,12 -285,12 +285,12 @@@
   };
   
   // Multi-threaded search of the scale-space pyramid for keypoints
- struct SURFFindInvoker
+ struct SURFFindInvoker : ParallelLoopBody
   {
       SURFFindInvoker( const Mat& _sum, const Mat& _mask_sum,
- -                     const vector<Mat>& _dets, const vector<Mat>& _traces,
- -                     const vector<int>& _sizes, const vector<int>& _sampleSteps,
- -                     const vector<int>& _middleIndices, vector<KeyPoint>& _keypoints,
+ +                     const std::vector<Mat>& _dets, const std::vector<Mat>& _traces,
+ +                     const std::vector<int>& _sizes, const std::vector<int>& _sampleSteps,
+ +                     const std::vector<int>& _middleIndices, std::vector<KeyPoint>& _keypoints,
                        int _nOctaveLayers, float _hessianThreshold )
       {
           sum = &_sum;
@@@ -306,13 -306,13 +306,13 @@@
       }
   
       static void findMaximaInLayer( const Mat& sum, const Mat& mask_sum,
- -                   const vector<Mat>& dets, const vector<Mat>& traces,
- -                   const vector<int>& sizes, vector<KeyPoint>& keypoints,
+ +                   const std::vector<Mat>& dets, const std::vector<Mat>& traces,
+ +                   const std::vector<int>& sizes, std::vector<KeyPoint>& keypoints,
                      int octave, int layer, float hessianThreshold, int sampleStep );
   
-     void operator()(const BlockedRange& range) const
+     void operator()(const Range& range) const
       {
-         for( int i=range.begin(); i<range.end(); i++ )
+         for( int i=range.start; i<range.end; i++ )
           {
               int layer = (*middleIndices)[i];
               int octave = i / nOctaveLayers;
diff --cc modules/objdetect/src/cascadedetect.cpp
Simple merge
diff --cc modules/objdetect/src/latentsvm.cpp
Simple merge
diff --cc modules/ocl/include/opencv2/ocl.hpp

index 0df96db,0000000..4586226

mode 100644,000000..100644
--- 1/modules/ocl/include/opencv2/ocl.hpp
--- /dev/null
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@@ -1,1690 -1,0 +1,1743 @@@
- 
+ +/*M///////////////////////////////////////////////////////////////////////////////////////
+ +//
+ +//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+ +//
+ +//  By downloading, copying, installing or using the software you agree to this license.
+ +//  If you do not agree to this license, do not download, install,
+ +//  copy or use the software.
+ +//
+ +//
+ +//                           License Agreement
+ +//                For Open Source Computer Vision Library
+ +//
+ +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+ +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+ +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+ +// Third party copyrights are property of their respective owners.
+ +//
+ +// Redistribution and use in source and binary forms, with or without modification,
+ +// are permitted provided that the following conditions are met:
+ +//
+ +//   * Redistribution's of source code must retain the above copyright notice,
+ +//     this list of conditions and the following disclaimer.
+ +//
+ +//   * Redistribution's in binary form must reproduce the above copyright notice,
+ +//     this list of conditions and the following disclaimer in the documentation
+ +//     and/or other materials provided with the distribution.
+ +//
+ +//   * The name of the copyright holders may not be used to endorse or promote products
+ +//     derived from this software without specific prior written permission.
+ +//
+ +// This software is provided by the copyright holders and contributors "as is" and
+ +// any express or implied warranties, including, but not limited to, the implied
+ +// warranties of merchantability and fitness for a particular purpose are disclaimed.
+ +// In no event shall the Intel Corporation or contributors be liable for any direct,
+ +// indirect, incidental, special, exemplary, or consequential damages
+ +// (including, but not limited to, procurement of substitute goods or services;
+ +// loss of use, data, or profits; or business interruption) however caused
+ +// and on any theory of liability, whether in contract, strict liability,
+ +// or tort (including negligence or otherwise) arising in any way out of
+ +// the use of this software, even if advised of the possibility of such damage.
+ +//
+ +//M*/
+ +
+ +#ifndef __OPENCV_OCL_HPP__
+ +#define __OPENCV_OCL_HPP__
+ +
+ +#include <memory>
+ +#include <vector>
+ +
+ +#include "opencv2/core.hpp"
+ +#include "opencv2/imgproc.hpp"
+ +#include "opencv2/objdetect.hpp"
+ +
+ +namespace cv
+ +{
+ +    namespace ocl
+ +    {
+ +        enum
+ +        {
+ +            CVCL_DEVICE_TYPE_DEFAULT     = (1 << 0),
+ +            CVCL_DEVICE_TYPE_CPU         = (1 << 1),
+ +            CVCL_DEVICE_TYPE_GPU         = (1 << 2),
+ +            CVCL_DEVICE_TYPE_ACCELERATOR = (1 << 3),
+ +            //CVCL_DEVICE_TYPE_CUSTOM      = (1 << 4)
+ +            CVCL_DEVICE_TYPE_ALL         = 0xFFFFFFFF
+ +        };
+ +
+ +        enum DevMemRW
+ +        {
+ +            DEVICE_MEM_R_W = 0,
+ +            DEVICE_MEM_R_ONLY,
+ +            DEVICE_MEM_W_ONLY
+ +        };
+ +
+ +        enum DevMemType
+ +        {
+ +            DEVICE_MEM_DEFAULT = 0,
+ +            DEVICE_MEM_AHP,         //alloc host pointer
+ +            DEVICE_MEM_UHP,         //use host pointer
+ +            DEVICE_MEM_CHP,         //copy host pointer
+ +            DEVICE_MEM_PM           //persistent memory
+ +        };
+ +
+ +        //Get the global device memory and read/write type
+ +        //return 1 if unified memory system supported, otherwise return 0
+ +        CV_EXPORTS int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type);
+ +
+ +        //Set the global device memory and read/write type,
+ +        //the newly generated oclMat will all use this type
+ +        //return -1 if the target type is unsupported, otherwise return 0
+ +        CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);
+ +
+ +        //this class contains ocl runtime information
+ +        class CV_EXPORTS Info
+ +        {
+ +        public:
+ +            struct Impl;
+ +            Impl *impl;
+ +
+ +            Info();
+ +            Info(const Info &m);
+ +            ~Info();
+ +            void release();
+ +            Info &operator = (const Info &m);
+ +            std::vector<String> DeviceName;
+ +            String PlatformName;
+ +        };
+ +        //////////////////////////////// Initialization & Info ////////////////////////
+ +        //this function may be obsoleted
+ +        //CV_EXPORTS cl_device_id getDevice();
+ +        //the function must be called before any other cv::ocl:: functions; it initializes the ocl runtime
+ +        //each Info relates to an OpenCL platform
+ +        //there is one or more devices in each platform, each one has a separate name
+ +        CV_EXPORTS int getDevice(std::vector<Info> &oclinfo, int devicetype = CVCL_DEVICE_TYPE_GPU);
+ +
+ +        //set the device you want to use; optional function, to be called after getDevice
+ +        //devnum is the index of the selected device in the DeviceName vector of Info
+ +        CV_EXPORTS void setDevice(Info &oclinfo, int devnum = 0);
+ +
+ +        //optional function: if you want to save the OpenCL binary kernel to a file, set its path
+ +        CV_EXPORTS  void setBinpath(const char *path);
+ +
+ +        //The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue
++        //returns cl_context *
+ +        CV_EXPORTS void* getoclContext();
-             ~OclCascadeClassifierBuf() {}
++        //returns cl_command_queue *
+ +        CV_EXPORTS void* getoclCommandQueue();
+ +
+ +        //explicit call clFinish. The global command queue will be used.
+ +        CV_EXPORTS void finish();
+ +
+ +        //this function enable ocl module to use customized cl_context and cl_command_queue
+ +        //getDevice also need to be called before this function
+ +        CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0);
+ +
+ +        //////////////////////////////// OpenCL context ////////////////////////
+ +        //This is a global singleton class used to represent an OpenCL context.
+ +        class CV_EXPORTS Context
+ +        {
+ +        protected:
+ +            Context();
+ +            friend class std::auto_ptr<Context>;
+ +
+ +        private:
+ +            static std::auto_ptr<Context> clCxt;
+ +            static int val;
+ +        public:
+ +            ~Context();
+ +            void release();
+ +            Info::Impl* impl;
+ +
+ +            static Context *getContext();
+ +            static void setContext(Info &oclinfo);
+ +
+ +            enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2};
+ +            bool supportsFeature(int ftype);
+ +            size_t computeUnits();
+ +            size_t maxWorkGroupSize();
+ +            void* oclContext();
+ +            void* oclCommandQueue();
+ +        };
+ +
+ +        //! Calls a kernel, by string. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
+ +        CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt ,
+ +                                                        const char **source, String kernelName,
+ +                                                        size_t globalThreads[3], size_t localThreads[3],
+ +                                                        std::vector< std::pair<size_t, const void *> > &args,
+ +                                                        int channels, int depth, const char *build_options,
+ +                                                        bool finish = true, bool measureKernelTime = false,
+ +                                                        bool cleanUp = true);
+ +
+ +        //! Calls a kernel, by file. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
+ +        CV_EXPORTS double openCLExecuteKernelInterop(Context *clCxt ,
+ +                                                        const char **fileName, const int numFiles, String kernelName,
+ +                                                        size_t globalThreads[3], size_t localThreads[3],
+ +                                                        std::vector< std::pair<size_t, const void *> > &args,
+ +                                                        int channels, int depth, const char *build_options,
+ +                                                        bool finish = true, bool measureKernelTime = false,
+ +                                                        bool cleanUp = true);
+ +
+ +        class CV_EXPORTS oclMatExpr;
+ +        //////////////////////////////// oclMat ////////////////////////////////
+ +        class CV_EXPORTS oclMat
+ +        {
+ +        public:
+ +            //! default constructor
+ +            oclMat();
+ +            //! constructs oclMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
+ +            oclMat(int rows, int cols, int type);
+ +            oclMat(Size size, int type);
+ +            //! constructs oclMatrix and fills it with the specified value _s.
+ +            oclMat(int rows, int cols, int type, const Scalar &s);
+ +            oclMat(Size size, int type, const Scalar &s);
+ +            //! copy constructor
+ +            oclMat(const oclMat &m);
+ +
+ +            //! constructor for oclMatrix headers pointing to user-allocated data
+ +            oclMat(int rows, int cols, int type, void *data, size_t step = Mat::AUTO_STEP);
+ +            oclMat(Size size, int type, void *data, size_t step = Mat::AUTO_STEP);
+ +
+ +            //! creates a matrix header for a part of the bigger matrix
+ +            oclMat(const oclMat &m, const Range &rowRange, const Range &colRange);
+ +            oclMat(const oclMat &m, const Rect &roi);
+ +
+ +            //! builds oclMat from Mat. Performs a blocking upload to the device.
+ +            explicit oclMat (const Mat &m);
+ +
+ +            //! destructor - calls release()
+ +            ~oclMat();
+ +
+ +            //! assignment operators
+ +            oclMat &operator = (const oclMat &m);
+ +            //! assignment operator. Performs a blocking upload to the device.
+ +            oclMat &operator = (const Mat &m);
+ +            oclMat &operator = (const oclMatExpr& expr);
+ +
+ +            //! performs a blocking upload of data to oclMat.
+ +            void upload(const cv::Mat &m);
+ +
+ +
+ +            //! downloads data from device to host memory. Blocking calls.
+ +            operator Mat() const;
+ +            void download(cv::Mat &m) const;
+ +
+ +
+ +            //! returns a new oclMatrix header for the specified row
+ +            oclMat row(int y) const;
+ +            //! returns a new oclMatrix header for the specified column
+ +            oclMat col(int x) const;
+ +            //! ... for the specified row span
+ +            oclMat rowRange(int startrow, int endrow) const;
+ +            oclMat rowRange(const Range &r) const;
+ +            //! ... for the specified column span
+ +            oclMat colRange(int startcol, int endcol) const;
+ +            oclMat colRange(const Range &r) const;
+ +
+ +            //! returns deep copy of the oclMatrix, i.e. the data is copied
+ +            oclMat clone() const;
+ +            //! copies the oclMatrix content to "m".
+ +            // It calls m.create(this->size(), this->type()).
+ +            // It supports any data type
+ +            void copyTo( oclMat &m ) const;
+ +            //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
+ +            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
+ +            void copyTo( oclMat &m, const oclMat &mask ) const;
+ +            //! converts oclMatrix to another datatype with optional scaling. See cvConvertScale.
+ +            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
+ +            void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
+ +
+ +            void assignTo( oclMat &m, int type = -1 ) const;
+ +
+ +            //! sets every oclMatrix element to s
+ +            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
+ +            oclMat& operator = (const Scalar &s);
+ +            //! sets some of the oclMatrix elements to s, according to the mask
+ +            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
+ +            oclMat& setTo(const Scalar &s, const oclMat &mask = oclMat());
+ +            //! creates alternative oclMatrix header for the same data, with different
+ +            // number of channels and/or different number of rows. see cvReshape.
+ +            oclMat reshape(int cn, int rows = 0) const;
+ +
+ +            //! allocates new oclMatrix data unless the oclMatrix already has specified size and type.
+ +            // previous data is unreferenced if needed.
+ +            void create(int rows, int cols, int type);
+ +            void create(Size size, int type);
+ +
+ +            //! allocates new oclMatrix with specified device memory type.
+ +            void createEx(int rows, int cols, int type,
+ +                          DevMemRW rw_type, DevMemType mem_type, void* hptr = 0);
+ +            void createEx(Size size, int type, DevMemRW rw_type,
+ +                          DevMemType mem_type, void* hptr = 0);
+ +
+ +            //! decreases reference counter;
+ +            // deallocate the data when reference counter reaches 0.
+ +            void release();
+ +
+ +            //! swaps with other smart pointer
+ +            void swap(oclMat &mat);
+ +
+ +            //! locates oclMatrix header within a parent oclMatrix. See below
+ +            void locateROI( Size &wholeSize, Point &ofs ) const;
+ +            //! moves/resizes the current oclMatrix ROI inside the parent oclMatrix.
+ +            oclMat& adjustROI( int dtop, int dbottom, int dleft, int dright );
+ +            //! extracts a rectangular sub-oclMatrix
+ +            // (this is a generalized form of row, rowRange etc.)
+ +            oclMat operator()( Range rowRange, Range colRange ) const;
+ +            oclMat operator()( const Rect &roi ) const;
+ +
+ +            oclMat& operator+=( const oclMat& m );
+ +            oclMat& operator-=( const oclMat& m );
+ +            oclMat& operator*=( const oclMat& m );
+ +            oclMat& operator/=( const oclMat& m );
+ +
+ +            //! returns true if the oclMatrix data is continuous
+ +            // (i.e. when there are no gaps between successive rows).
+ +            // similar to CV_IS_oclMat_CONT(cvoclMat->type)
+ +            bool isContinuous() const;
+ +            //! returns element size in bytes,
+ +            // similar to CV_ELEM_SIZE(cvMat->type)
+ +            size_t elemSize() const;
+ +            //! returns the size of element channel in bytes.
+ +            size_t elemSize1() const;
+ +            //! returns element type, similar to CV_MAT_TYPE(cvMat->type)
+ +            int type() const;
+ +            //! returns element type, i.e. 8UC3 returns 8UC4 because in ocl
+ +            //! 3 channels element actually use 4 channel space
+ +            int ocltype() const;
+ +            //! returns element depth, similar to CV_MAT_DEPTH(cvMat->type)
+ +            int depth() const;
+ +            //! returns the number of channels, similar to CV_MAT_CN(cvMat->type)
+ +            int channels() const;
+ +            //! returns the number of channels, but returns 4 for 3-channel elements,
+ +            //! because 3-channel elements actually use 4-channel space
+ +            int oclchannels() const;
+ +            //! returns step/elemSize1()
+ +            size_t step1() const;
+ +            //! returns oclMatrix size:
+ +            // width == number of columns, height == number of rows
+ +            Size size() const;
+ +            //! returns true if oclMatrix data is NULL
+ +            bool empty() const;
+ +
+ +            //! returns pointer to y-th row
+ +            uchar* ptr(int y = 0);
+ +            const uchar *ptr(int y = 0) const;
+ +
+ +            //! template version of the above method
+ +            template<typename _Tp> _Tp *ptr(int y = 0);
+ +            template<typename _Tp> const _Tp *ptr(int y = 0) const;
+ +
+ +            //! matrix transposition
+ +            oclMat t() const;
+ +
+ +            /*! includes several bit-fields:
+ +              - the magic signature
+ +              - continuity flag
+ +              - depth
+ +              - number of channels
+ +              */
+ +            int flags;
+ +            //! the number of rows and columns
+ +            int rows, cols;
+ +            //! a distance between successive rows in bytes; includes the gap if any
+ +            size_t step;
+ +            //! pointer to the data(OCL memory object)
+ +            uchar *data;
+ +
+ +            //! pointer to the reference counter;
+ +            // when oclMatrix points to user-allocated data, the pointer is NULL
+ +            int *refcount;
+ +
+ +            //! helper fields used in locateROI and adjustROI
+ +            //datastart and dataend are not used in current version
+ +            uchar *datastart;
+ +            uchar *dataend;
+ +
+ +            //! OpenCL context associated with the oclMat object.
+ +            Context *clCxt;
+ +            //add offset for handle ROI, calculated in byte
+ +            int offset;
+ +            //add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used
+ +            int wholerows;
+ +            int wholecols;
+ +        };
+ +
+ +
+ +        ///////////////////// mat split and merge /////////////////////////////////
+ +        //! Compose a multi-channel array from several single-channel arrays
+ +        // Support all types
+ +        CV_EXPORTS void merge(const oclMat *src, size_t n, oclMat &dst);
+ +        CV_EXPORTS void merge(const std::vector<oclMat> &src, oclMat &dst);
+ +
+ +        //! Divides multi-channel array into several single-channel arrays
+ +        // Support all types
+ +        CV_EXPORTS void split(const oclMat &src, oclMat *dst);
+ +        CV_EXPORTS void split(const oclMat &src, std::vector<oclMat> &dst);
+ +
+ +        ////////////////////////////// Arithmetics ///////////////////////////////////
+ +        //#if defined DOUBLE_SUPPORT
+ +        //typedef double F;
+ +        //#else
+ +        //typedef float F;
+ +        //#endif
+ +        //    CV_EXPORTS void addWeighted(const oclMat& a,F  alpha, const oclMat& b,F beta,F gama, oclMat& c);
+ +        CV_EXPORTS void addWeighted(const oclMat &a, double  alpha, const oclMat &b, double beta, double gama, oclMat &c);
+ +        //! adds one matrix to another (c = a + b)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c);
+ +        //! adds one matrix to another (c = a + b)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
+ +        //! adds scalar to a matrix (c = a + s)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
+ +        //! subtracts one matrix from another (c = a - b)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c);
+ +        //! subtracts one matrix from another (c = a - b)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
+ +        //! subtracts scalar from a matrix (c = a - s)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
+ +        //! subtracts scalar from a matrix (c = a - s)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void subtract(const Scalar &sc, const oclMat &a, oclMat &c, const oclMat &mask = oclMat());
+ +        //! computes element-wise product of the two arrays (c = a * b)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
+ +        //! multiplies matrix to a number (dst = scalar * src)
+ +        // supports CV_32FC1 only
+ +        CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
+ +        //! computes element-wise quotient of the two arrays (c = a / b)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
+ +        //! computes element-wise quotient of the two arrays (c = a / b)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void divide(double scale, const oclMat &b, oclMat &c);
+ +
+ +        //! compares elements of two arrays (c = a <cmpop> b)
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3,CV_8SC4
+ +        CV_EXPORTS void compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop);
+ +
+ +        //! transposes the matrix
+ +        // supports  CV_8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1.(the same as cuda)
+ +        CV_EXPORTS void transpose(const oclMat &src, oclMat &dst);
+ +
+ +        //! computes element-wise absolute difference of two arrays (c = abs(a - b))
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void absdiff(const oclMat &a, const oclMat &b, oclMat &c);
+ +        //! computes element-wise absolute difference of array and scalar (c = abs(a - s))
+ +        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
+ +        CV_EXPORTS void absdiff(const oclMat &a, const Scalar &s, oclMat &c);
+ +
+ +        //! computes mean value and standard deviation of all or selected array elements
+ +        // supports all types except CV_32F and CV_64F
+ +        CV_EXPORTS void meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev);
+ +
+ +        //! computes norm of array
+ +        // supports NORM_INF, NORM_L1, NORM_L2
+ +        // supports only CV_8UC1 type
+ +        CV_EXPORTS double norm(const oclMat &src1, int normType = NORM_L2);
+ +
+ +        //! computes norm of the difference between two arrays
+ +        // supports NORM_INF, NORM_L1, NORM_L2
+ +        // supports only CV_8UC1 type
+ +        CV_EXPORTS double norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2);
+ +
+ +        //! reverses the order of the rows, columns or both in a matrix
+ +        // supports all types
+ +        CV_EXPORTS void flip(const oclMat &a, oclMat &b, int flipCode);
+ +
+ +        //! computes sum of array elements
+ +        // disabled until fix crash
+ +        // support all types
+ +        CV_EXPORTS Scalar sum(const oclMat &m);
+ +        CV_EXPORTS Scalar absSum(const oclMat &m);
+ +        CV_EXPORTS Scalar sqrSum(const oclMat &m);
+ +
+ +        //! finds global minimum and maximum array elements and returns their values
+ +        // support all C1 types
+ +
+ +        CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
++        CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);
+ +
+ +        //! finds global minimum and maximum array elements and returns their values with locations
+ +        // support all C1 types
+ +
+ +        CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,
+ +                                  const oclMat &mask = oclMat());
+ +
+ +        //! counts non-zero array elements
+ +        // support all types
+ +        CV_EXPORTS int countNonZero(const oclMat &src);
+ +
+ +        //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
+ +        // destination array will have the depth type as lut and the same channels number as source
+ +        // supports 8UC1 and 8UC4 only
+ +        CV_EXPORTS void LUT(const oclMat &src, const oclMat &lut, oclMat &dst);
+ +
+ +        //! only 8UC1 and 256 bins are supported now
+ +        CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist);
+ +        //! only 8UC1 and 256 bins are supported now
+ +        CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst);
+ +        //! bilateralFilter
+ +        // supports 8UC1 8UC4
+ +        CV_EXPORTS void bilateralFilter(const oclMat& src, oclMat& dst, int d, double sigmaColor, double sigmaSpave, int borderType=BORDER_DEFAULT);
+ +        //! computes exponent of each matrix element (b = e**a)
+ +        // supports only CV_32FC1 type
+ +        CV_EXPORTS void exp(const oclMat &a, oclMat &b);
+ +
+ +        //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
+ +        // supports only CV_32FC1 type
+ +        CV_EXPORTS void log(const oclMat &a, oclMat &b);
+ +
+ +        //! computes magnitude of each (x(i), y(i)) vector
+ +        // supports only CV_32F CV_64F type
+ +        CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
+ +        CV_EXPORTS void magnitudeSqr(const oclMat &x, const oclMat &y, oclMat &magnitude);
+ +
+ +        CV_EXPORTS void magnitudeSqr(const oclMat &x, oclMat &magnitude);
+ +
+ +        //! computes angle (angle(i)) of each (x(i), y(i)) vector
+ +        // supports only CV_32F CV_64F type
+ +        CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);
+ +
+ +        //! the function raises every element of the input array to the power p
+ +        //! support only CV_32F CV_64F type
+ +        CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);
+ +
+ +        //! converts Cartesian coordinates to polar
+ +        // supports only CV_32F CV_64F type
+ +        CV_EXPORTS void cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false);
+ +
+ +        //! converts polar coordinates to Cartesian
+ +        // supports only CV_32F CV_64F type
+ +        CV_EXPORTS void polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false);
+ +
+ +        //! performs per-element bit-wise inversion
+ +        // supports all types
+ +        CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
+ +        //! calculates per-element bit-wise disjunction of two arrays
+ +        // supports all types
+ +        CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+ +        CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+ +        //! calculates per-element bit-wise conjunction of two arrays
+ +        // supports all types
+ +        CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+ +        CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+ +        //! calculates per-element bit-wise "exclusive or" operation
+ +        // supports all types
+ +        CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+ +        CV_EXPORTS void bitwise_xor(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+ +
+ +        //! Logical operators
+ +        CV_EXPORTS oclMat operator ~ (const oclMat &);
+ +        CV_EXPORTS oclMat operator | (const oclMat &, const oclMat &);
+ +        CV_EXPORTS oclMat operator & (const oclMat &, const oclMat &);
+ +        CV_EXPORTS oclMat operator ^ (const oclMat &, const oclMat &);
+ +
+ +
+ +        //! Mathematics operators
+ +        CV_EXPORTS oclMatExpr operator + (const oclMat &src1, const oclMat &src2);
+ +        CV_EXPORTS oclMatExpr operator - (const oclMat &src1, const oclMat &src2);
+ +        CV_EXPORTS oclMatExpr operator * (const oclMat &src1, const oclMat &src2);
+ +        CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);
+ +
+ +        struct CV_EXPORTS ConvolveBuf // scratch state taken by reference by the five-argument convolve() overload
+ +        {
+ +            Size result_size; // presumably the size of the convolution result - confirm against create()
+ +            Size block_size; // presumably the working block size - confirm against create()
+ +            Size user_block_size; // presumably a caller-supplied block size hint - confirm against create()
+ +            Size dft_size; // presumably the DFT size used per block - confirm against create()
+ +
+ +            oclMat image_spect, templ_spect, result_spect; // device matrices; names suggest image/template/result spectra - confirm
+ +            oclMat image_block, templ_block, result_data; // device matrices; names suggest per-block image/template data and result storage - confirm
+ +
+ +            void create(Size image_size, Size templ_size); // declared here, defined elsewhere; takes the image and template sizes
+ +            static Size estimateBlockSize(Size result_size, Size templ_size); // static helper, defined elsewhere; returns a Size
+ +        };
+ +
+ +        //! computes convolution of two images, may use discrete Fourier transform
+ +        //! support only CV_32FC1 type
+ +        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false);
+ +        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf);
+ +
+ +        //! Performs a per-element multiplication of two Fourier spectrums.
+ +        //! Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now.
+ +        //! support only CV_32FC2 type
+ +        CV_EXPORTS void mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int flags, float scale, bool conjB = false);
+ +
+ +        CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);
+ +
+ +        //////////////////////////////// Filter Engine ////////////////////////////////
+ +
+ +        /*!
+ +          The Base Class for 1D or Row-wise Filters.
+ +
+ +          This is the abstract base class for linear or non-linear filters that process 1D data.
+ +          In particular, such filters are used for the "horizontal" filtering parts in separable filters.
+ +          */
+ +        class CV_EXPORTS BaseRowFilter_GPU
+ +        {
+ +        public:
+ +            BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {} // copies the three arguments into the public members of the same name
+ +            virtual ~BaseRowFilter_GPU() {} // virtual so a derived filter can be destroyed through a base pointer
+ +            virtual void operator()(const oclMat &src, oclMat &dst) = 0; // pure virtual: each concrete row filter supplies the src -> dst processing
+ +            int ksize, anchor, bordertype; // initialized by the constructor; public and mutable
+ +        };
+ +
+ +        /*!
+ +          The Base Class for Column-wise Filters.
+ +
+ +          This is the abstract base class for linear or non-linear filters that process columns of 2D arrays.
+ +          Such filters are used for the "vertical" filtering parts in separable filters.
+ +          */
+ +        class CV_EXPORTS BaseColumnFilter_GPU
+ +        {
+ +        public:
+ +            BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {} // copies the three arguments into the public members of the same name
+ +            virtual ~BaseColumnFilter_GPU() {} // virtual so a derived filter can be destroyed through a base pointer
+ +            virtual void operator()(const oclMat &src, oclMat &dst) = 0; // pure virtual: each concrete column filter supplies the src -> dst processing
+ +            int ksize, anchor, bordertype; // initialized by the constructor; public and mutable
+ +        };
+ +
+ +        /*!
+ +          The Base Class for Non-Separable 2D Filters.
+ +
+ +          This is the abstract base class for linear or non-linear 2D filters.
+ +          */
+ +        class CV_EXPORTS BaseFilter_GPU
+ +        {
+ +        public:
+ +            BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_) // NOTE(review): borderType_ is an int taken by const reference, unlike the by-value ints of the row/column filter bases
+ +                : ksize(ksize_), anchor(anchor_), borderType(borderType_) {} // copies the arguments into the public members below
+ +            virtual ~BaseFilter_GPU() {} // virtual so a derived filter can be destroyed through a base pointer
+ +            virtual void operator()(const oclMat &src, oclMat &dst) = 0; // pure virtual: each concrete 2D filter supplies the src -> dst processing
+ +            Size ksize; // 2D kernel size as passed to the constructor
+ +            Point anchor; // anchor as passed to the constructor
+ +            int borderType; // border mode as passed to the constructor
+ +        };
+ +
+ +        /*!
+ +          The Base Class for Filter Engine.
+ +
+ +          The class can be used to apply an arbitrary filtering operation to an image.
+ +          It contains all the necessary intermediate buffers.
+ +          */
+ +        class CV_EXPORTS FilterEngine_GPU
+ +        {
+ +        public:
+ +            virtual ~FilterEngine_GPU() {} // virtual so an engine can be destroyed through a base pointer
+ +
+ +            virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0; // pure virtual; the default roi Rect(0, 0, -1, -1) presumably means "whole image" - confirm in the implementations
+ +        };
+ +
+ +        //! returns the non-separable filter engine with the specified filter
+ +        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D);
+ +
+ +        //! returns the primitive row filter with the specified kernel
+ +        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel,
+ +                int anchor = -1, int bordertype = BORDER_DEFAULT);
+ +
+ +        //! returns the primitive column filter with the specified kernel
+ +        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel,
+ +                int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0);
+ +
+ +        //! returns the separable linear filter engine
+ +        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
+ +                const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
+ +
+ +        //! returns the separable filter engine with the specified filters
+ +        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
+ +                const Ptr<BaseColumnFilter_GPU> &columnFilter);
+ +
+ +        //! returns the Gaussian filter engine
+ +        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
+ +
+ +        //! returns filter engine for the generalized Sobel operator
+ +        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
+ +
+ +        //! applies Laplacian operator to the image
+ +        // supports only ksize = 1 and ksize = 3 8UC1 8UC4 32FC1 32FC4 data type
+ +        CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1);
+ +
+ +        //! returns 2D box filter
+ +        // supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
+ +        CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType,
+ +                const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
+ +
+ +        //! returns box filter engine
+ +        CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size &ksize,
+ +                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
+ +
+ +        //! returns 2D filter with the specified kernel
+ +        // supports CV_8UC1 and CV_8UC4 types
+ +        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
+ +                Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
+ +
+ +        //! returns the non-separable linear filter engine
+ +        CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel,
+ +                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
+ +
+ +        //! smooths the image using the normalized box filter
+ +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
+ +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP
+ +        CV_EXPORTS void boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize,
+ +                                  Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
+ +
+ +        //! returns 2D morphological filter
+ +        //! only MORPH_ERODE and MORPH_DILATE are supported
+ +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
+ +        // kernel must have CV_8UC1 type, one row and cols == ksize.width * ksize.height
+ +        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize,
+ +                Point anchor = Point(-1, -1));
+ +
+ +        //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
+ +        CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat &kernel,
+ +                const Point &anchor = Point(-1, -1), int iterations = 1);
+ +
+ +        //! a synonym for the normalized box filter: forwards to boxFilter() with ddepth = -1
+ +        // supports data types: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
+ +        // supports border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101
+ +        static inline void blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1),
+ +                                int borderType = BORDER_CONSTANT) // NOTE(review): boxFilter() itself defaults to BORDER_DEFAULT, so the two defaults differ
+ +        {
+ +            boxFilter(src, dst, -1, ksize, anchor, borderType); // ddepth is fixed to -1 (presumably "same depth as src" - confirm)
+ +        }
+ +
+ +        //! applies non-separable 2D linear filter to the image
+ +        CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel,
+ +                                 Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
+ +
+ +        //! applies separable 2D linear filter to the image
+ +        CV_EXPORTS void sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY,
+ +                                    Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
+ +
+ +        //! applies generalized Sobel operator to the image
+ +        // dst.type must equal src.type
+ +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
+ +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
+ +        CV_EXPORTS void Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
+ +
+ +        //! applies the vertical or horizontal Scharr operator to the image
+ +        // dst.type must equal src.type
+ +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
+ +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
+ +        CV_EXPORTS void Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
+ +
+ +        //! smooths the image using Gaussian filter.
+ +        // dst.type must equal src.type
+ +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
+ +        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
+ +        CV_EXPORTS void GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
+ +
+ +        //! erodes the image (applies the local minimum operator)
+ +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
+ +        CV_EXPORTS void erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
+ +
+ +                               int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
+ +
+ +
+ +        //! dilates the image (applies the local maximum operator)
+ +        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
+ +        CV_EXPORTS void dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
+ +
+ +                                int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
+ +
+ +
+ +        //! applies an advanced morphological operation to the image
+ +        CV_EXPORTS void morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
+ +
+ +                                      int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
+ +
+ +
+ +        ////////////////////////////// Image processing //////////////////////////////
+ +        //! Does mean shift filtering on GPU.
+ +        CV_EXPORTS void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr,
+ +                                           TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+ +
+ +        //! Does mean shift procedure on GPU.
+ +        CV_EXPORTS void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr,
+ +                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+ +
+ +        //! Does mean shift segmentation with elimination of small regions.
+ +        CV_EXPORTS void meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize,
+ +                                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+ +
+ +        //! applies fixed threshold to the image.
+ +        // supports CV_8UC1 and CV_32FC1 data type
+ +        // supports threshold type: THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV
+ +        CV_EXPORTS double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type = THRESH_TRUNC);
+ +
+ +        //! resizes the image
+ +        // Supports INTER_NEAREST, INTER_LINEAR
+ +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
+ +        CV_EXPORTS void resize(const oclMat &src, oclMat &dst, Size dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
+ +
+ +        //! Applies a generic geometrical transformation to an image.
+ +
+ +        // Supports INTER_NEAREST, INTER_LINEAR.
+ +
+ +        // Map1 supports CV_16SC2, CV_32FC2  types.
+ +
+ +        // Src supports CV_8UC1, CV_8UC2, CV_8UC4.
+ +
+ +        CV_EXPORTS void remap(const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int bordertype, const Scalar &value = Scalar());
+ +
+ +        //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
+ +        // supports CV_8UC1, CV_8UC4, CV_32SC1 types
+ +        CV_EXPORTS void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar());
+ +
+ +        //! Smooths the image using a median filter
+ +        // The source is a 1- or 4-channel image. When m is 3 or 5, the image depth should be CV_8U or CV_32F.
+ +        CV_EXPORTS void medianFilter(const oclMat &src, oclMat &dst, int m);
+ +
+ +        //! warps the image using affine transformation
+ +        // Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
+ +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
+ +        CV_EXPORTS void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
+ +
+ +        //! warps the image using perspective transformation
+ +        // Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
+ +        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
+ +        CV_EXPORTS void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
+ +
+ +        //! computes the integral image and integral for the squared image
+ +        // sum will have CV_32S type, sqsum - CV_32F type
+ +        // supports only CV_8UC1 source type
+ +        CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum);
+ +        CV_EXPORTS void integral(const oclMat &src, oclMat &sum);
+ +        CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
++        CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
++            int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
+ +        CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
++        CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
++            int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
+ +
+ +        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ +        ///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
+ +        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ +
+ +#if 0 // compiled out: everything up to the matching #endif is excluded from the build
+ +        class CV_EXPORTS OclCascadeClassifier : public  cv::CascadeClassifier
+ +        {
+ +        public:
+ +            OclCascadeClassifier() {}; // empty body; the trailing ';' is redundant
+ +            ~OclCascadeClassifier() {}; // empty body; the trailing ';' is redundant
+ +
+ +            CvSeq* oclHaarDetectObjects(oclMat &gimg, CvMemStorage *storage, double scaleFactor, // declared only; takes the image as an oclMat and returns a CvSeq*
+ +                                        int minNeighbors, int flags, CvSize minSize = cvSize(0, 0), CvSize maxSize = cvSize(0, 0));
+ +        };
+ +#endif
+ +
++#if 0 // compiled out: everything up to the matching #endif is excluded from the build
+ +        class CV_EXPORTS OclCascadeClassifierBuf : public  cv::CascadeClassifier
+ +        {
+ +        public:
+ +            OclCascadeClassifierBuf() :
+ +                m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {} // NOTE(review): m_rows, m_cols, m_loopcount, m_nodenum and findBiggestObject are not initialized here
+ +
++            ~OclCascadeClassifierBuf() { release(); } // destructor delegates all cleanup to release()
+ +
+ +            void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces, // results are written to 'faces' (marked CV_OUT)
+ +                                  double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
+ +                                  Size minSize = Size(), Size maxSize = Size());
+ +            void release(); // declared here, defined elsewhere; also invoked by the destructor
+ +
+ +        private:
+ +            void Init(const int rows, const int cols, double scaleFactor, int flags, // private helper, defined elsewhere
+ +                      const int outputsz, const size_t localThreads[],
+ +                      Size minSize, Size maxSize);
+ +            void CreateBaseBufs(const int datasize, const int totalclassifier, const int flags, const int outputsz); // private helper, defined elsewhere
+ +            void CreateFactorRelatedBufs(const int rows, const int cols, const int flags, // private helper, defined elsewhere
+ +                                         const double scaleFactor, const size_t localThreads[],
+ +                                         Size minSize, Size maxSize);
+ +            void GenResult(CV_OUT std::vector<cv::Rect>& faces, const std::vector<cv::Rect> &rectList, const std::vector<int> &rweights); // private helper, defined elsewhere; fills 'faces'
+ +
+ +            int m_rows; // not set by the constructor
+ +            int m_cols; // not set by the constructor
+ +            int m_flags; // constructor sets this to 0
+ +            int m_loopcount; // not set by the constructor
+ +            int m_nodenum; // not set by the constructor
+ +            bool findBiggestObject; // not set by the constructor
+ +            bool initialized; // constructor sets this to false
+ +            double m_scaleFactor; // constructor sets this to 0
+ +            Size m_minSize;
+ +            Size m_maxSize;
+ +            std::vector<Size> sizev;
+ +            std::vector<float> scalev;
+ +            oclMat gimg1, gsum, gsqsum; // device matrices; names suggest a grey image plus sum / squared-sum images - confirm
+ +            void * buffers; // opaque pointer, NULL after construction; NOTE(review): no copy control is declared although the destructor calls release() - confirm copying is safe
+ +        };
++#endif
+ +
+ +        /////////////////////////////// Pyramid /////////////////////////////////////
+ +        CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
+ +
+ +        //! upsamples the source image and then smooths it
+ +        CV_EXPORTS void pyrUp(const oclMat &src, oclMat &dst);
+ +
+ +        //! performs linear blending of two images
+ +        //! to avoid accuracy errors sum of weights shouldn't be very close to zero
+ +        // supports only CV_8UC1 source type
+ +        CV_EXPORTS void blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2, oclMat &result);
+ +
+ +        //! computes vertical sum, supports only CV_32FC1 images
+ +        CV_EXPORTS void columnSum(const oclMat &src, oclMat &sum);
+ +
+ +        ///////////////////////////////////////// match_template /////////////////////////////////////////////////////////////
+ +        struct CV_EXPORTS MatchTemplateBuf // scratch state taken by reference by the five-argument matchTemplate() overload
+ +        {
+ +            Size user_block_size; // presumably a caller-supplied block size hint - confirm against the implementation
+ +            oclMat imagef, templf; // device matrices; names suggest float copies of image and template - confirm
+ +            std::vector<oclMat> images; // vectors of device matrices; presumably per-channel images - confirm
+ +            std::vector<oclMat> image_sums; // presumably sum images matching 'images' - confirm
+ +            std::vector<oclMat> image_sqsums; // presumably squared-sum images matching 'images' - confirm
+ +        };
+ +
+ +        //! computes the proximity map for the raster template and the image where the template is searched for
+ +        // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
+ +        // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
+ +        CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method);
+ +
+ +        //! computes the proximity map for the raster template and the image where the template is searched for
+ +        // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
+ +        // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
+ +        CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf);
+ +
+ +
+ +
+ +        ///////////////////////////////////////////// Canny /////////////////////////////////////////////
+ +        struct CV_EXPORTS CannyBuf;
+ +
+ +        //! compute edges of the input image using Canny operator
+ +        // Support CV_8UC1 only
+ +        CV_EXPORTS void Canny(const oclMat &image, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+ +        CV_EXPORTS void Canny(const oclMat &image, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+ +        CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
+ +        CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
+ +
+ +        struct CV_EXPORTS CannyBuf // buffer object taken by reference by the Canny() overloads that accept a CannyBuf
+ +        {
+ +            CannyBuf() : counter(NULL) {} // default state: no counter; nothing is created here
+ +            ~CannyBuf() // destructor delegates all cleanup to release()
+ +            {
+ +                release();
+ +            }
+ +            explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(NULL) // starts with a NULL counter, then calls create() immediately
+ +            {
+ +                create(image_size, apperture_size);
+ +            }
+ +            CannyBuf(const oclMat &dx_, const oclMat &dy_); // declared only; defined elsewhere, so how it initializes 'counter' is not visible here
+ +            void create(const Size &image_size, int apperture_size = 3); // declared here, defined elsewhere
+ +            void release(); // declared here, defined elsewhere; also invoked by the destructor
+ +
+ +            oclMat dx, dy; // device matrices; presumably the x/y derivative images - confirm
+ +            oclMat dx_buf, dy_buf;
+ +            oclMat magBuf, mapBuf;
+ +            oclMat trackBuf1, trackBuf2;
+ +            void *counter; // opaque pointer; NOTE(review): no copy control is declared although the destructor calls release() - confirm copying is safe
+ +            Ptr<FilterEngine_GPU> filterDX, filterDY; // filter engines; presumably used to compute dx and dy - confirm
+ +        };
+ +
+ +        ///////////////////////////////////////// Hough Transform /////////////////////////////////////////
+ +        //! scratch buffers for the HoughCircles() overload that takes a HoughCirclesBuf
+ +        struct HoughCirclesBuf // NOTE(review): unlike the other *Buf structs in this header this one is not marked CV_EXPORTS - confirm that is intentional
+ +        {
+ +            oclMat edges; // device matrix; presumably the edge map - confirm
+ +            oclMat accum; // device matrix; presumably the vote accumulator - confirm
+ +            oclMat srcPoints; // device matrix; presumably the list of edge points - confirm
+ +            oclMat centers; // device matrix; presumably the candidate circle centers - confirm
+ +            CannyBuf cannyBuf; // nested Canny buffer object
+ +        };
+ +
+ +        CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
+ +        CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
+ +        CV_EXPORTS void HoughCirclesDownload(const oclMat& d_circles, OutputArray h_circles);
+ +
+ +
+ +        ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
+ +        //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
+ +        //! Param dft_size is the size of DFT transform.
+ +        //!
+ +        //! For complex-to-real transform it is assumed that the source matrix is packed in CLFFT's format.
+ +        // support src type of CV_32FC1, CV_32FC2
+ +        // support flags: DFT_INVERSE, DFT_REAL_OUTPUT, DFT_COMPLEX_OUTPUT, DFT_ROWS
+ +        // dft_size is the size of original input, which is used for transformation from complex to real.
+ +        // dft_size must be powers of 2, 3 and 5
+ +        // real to complex dft requires at least v1.8 clAmdFft
+ +        // real to complex dft output is not the same with cpu version
+ +        // real to complex and complex to real does not support DFT_ROWS
+ +        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0);
+ +
+ +        //! implements generalized matrix product algorithm GEMM from BLAS
+ +        // The functionality requires clAmdBlas library
+ +        // only support type CV_32FC1
+ +        // flag GEMM_3_T is not supported
+ +        CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha,
+ +                             const oclMat &src3, double beta, oclMat &dst, int flags = 0);
+ +
+ +        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
+ +
+ +        struct CV_EXPORTS HOGDescriptor // HOG descriptor and object detector operating on oclMat images
+ +
+ +        {
+ +
+ +            enum { DEFAULT_WIN_SIGMA = -1 }; // default value of the win_sigma constructor argument
+ +
+ +            enum { DEFAULT_NLEVELS = 64 }; // default value of the nlevels constructor argument
+ +
+ +            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL }; // values accepted by getDescriptors(descr_format)
+ +
+ +
+ +
+ +            HOGDescriptor(Size win_size = Size(64, 128), Size block_size = Size(16, 16),
+ +
+ +                          Size block_stride = Size(8, 8), Size cell_size = Size(8, 8),
+ +
+ +                          int nbins = 9, double win_sigma = DEFAULT_WIN_SIGMA,
+ +
+ +                          double threshold_L2hys = 0.2, bool gamma_correction = true,
+ +
+ +                          int nlevels = DEFAULT_NLEVELS);
+ +
+ +
+ +
+ +            size_t getDescriptorSize() const;
+ +
+ +            size_t getBlockHistogramSize() const;
+ +
+ +
+ +
+ +            void setSVMDetector(const std::vector<float> &detector); // detector: SVM coefficients, e.g. from the static getters below
+ +
+ +
+ +
+ +            static std::vector<float> getDefaultPeopleDetector();
+ +
+ +            static std::vector<float> getPeopleDetector48x96();
+ +
+ +            static std::vector<float> getPeopleDetector64x128();
+ +
+ +
+ +
+ +            void detect(const oclMat &img, std::vector<Point> &found_locations, // single-scale detection; results are points
+ +
+ +                        double hit_threshold = 0, Size win_stride = Size(),
+ +
+ +                        Size padding = Size());
+ +
+ +
+ +
+ +            void detectMultiScale(const oclMat &img, std::vector<Rect> &found_locations, // multi-scale detection; results are rectangles
+ +
+ +                                  double hit_threshold = 0, Size win_stride = Size(),
+ +
+ +                                  Size padding = Size(), double scale0 = 1.05,
+ +
+ +                                  int group_threshold = 2);
+ +
+ +
+ +
+ +            void getDescriptors(const oclMat &img, Size win_stride,
+ +
+ +                                oclMat &descriptors,
+ +
+ +                                int descr_format = DESCR_FORMAT_COL_BY_COL); // one of the DESCR_FORMAT_* values
+ +
+ +
+ +
+ +            Size win_size; // public parameters, named like the constructor arguments
+ +
+ +            Size block_size;
+ +
+ +            Size block_stride;
+ +
+ +            Size cell_size;
+ +
+ +            int nbins;
+ +
+ +            double win_sigma;
+ +
+ +            double threshold_L2hys;
+ +
+ +            bool gamma_correction;
+ +
+ +            int nlevels;
+ +
+ +
+ +
+ +        protected:
+ +
+ +            // initialize buffers; this only needs to be done once in case of multiscale detection
+ +
+ +            void init_buffer(const oclMat &img, Size win_stride);
+ +
+ +
+ +
+ +            void computeBlockHistograms(const oclMat &img);
+ +
+ +            void computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle);
+ +
+ +
+ +
+ +            double getWinSigma() const;
+ +
+ +            bool checkDetectorSize() const;
+ +
+ +
+ +
+ +            static int numPartsWithin(int size, int part_size, int stride);
+ +
+ +            static Size numPartsWithin(Size size, Size part_size, Size stride);
+ +
+ +
+ +
+ +            // Coefficients of the separating plane
+ +
+ +            float free_coef;
+ +
+ +            oclMat detector;
+ +
+ +
+ +
+ +            // Results of the last classification step
+ +
+ +            oclMat labels;
+ +
+ +            Mat labels_host; // NOTE(review): presumably the host-side copy of labels - confirm in the implementation
+ +
+ +
+ +
+ +            // Results of the last histogram evaluation step
+ +
+ +            oclMat block_hists;
+ +
+ +
+ +
+ +            // Gradient computation results
+ +
+ +            oclMat grad, qangle;
+ +
+ +
+ +
+ +            // scaled image
+ +
+ +            oclMat image_scale;
+ +
+ +
+ +
+ +            // effective size of the input image (might be different from the original size after scaling)
+ +
+ +            Size effect_size;
+ +
+ +        };
+ +
+ +
+ +        ////////////////////////feature2d_ocl/////////////////
+ +        /****************************************************************************************\
+ +        *                                      Distance                                          *
+ +        \****************************************************************************************/
+ +        template<typename T>
+ +        struct CV_EXPORTS Accumulator // maps a value type T to the result type used by the distance functors below
+ +        {
+ +            typedef T Type; // primary template: the result type is T itself
+ +        };
+ +        template<> struct Accumulator<unsigned char> // the four specializations below select float as the result type
+ +        {
+ +            typedef float Type;
+ +        };
+ +        template<> struct Accumulator<unsigned short>
+ +        {
+ +            typedef float Type;
+ +        };
+ +        template<> struct Accumulator<char>
+ +        {
+ +            typedef float Type;
+ +        };
+ +        template<> struct Accumulator<short>
+ +        {
+ +            typedef float Type;
+ +        };
+ +
+ +        /*
+ +         * Manhattan distance (city block distance) functor; tagged with NORM_L1 through normType
+ +         */
+ +        template<class T>
+ +        struct CV_EXPORTS L1
+ +        {
+ +            enum { normType = NORM_L1 };
+ +            typedef T ValueType;
+ +            typedef typename Accumulator<T>::Type ResultType; // result type chosen by Accumulator<T>
+ +
+ +            ResultType operator()( const T *a, const T *b, int size ) const // a, b: arrays of 'size' elements
+ +            {
+ +                return normL1<ValueType, ResultType>(a, b, size);
+ +            }
+ +        };
+ +
+ +        /*
+ +         * Euclidean distance functor: the square root of normL2Sqr over the two arrays
+ +         */
+ +        template<class T>
+ +        struct CV_EXPORTS L2
+ +        {
+ +            enum { normType = NORM_L2 };
+ +            typedef T ValueType;
+ +            typedef typename Accumulator<T>::Type ResultType;
+ +
+ +            ResultType operator()( const T *a, const T *b, int size ) const
+ +            {
+ +                // widen the squared norm to double for the root, then narrow back to ResultType
+ +                const double squaredNorm = static_cast<double>(normL2Sqr<ValueType, ResultType>(a, b, size));
+ +                return static_cast<ResultType>(std::sqrt(squaredNorm));
+ +            }
+ +        };
+ +
+ +        /*
+ +         * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
+ +         * (bit count of A XOR'ed with B)
+ +         */
+ +        struct CV_EXPORTS Hamming
+ +        {
+ +            enum { normType = NORM_HAMMING };
+ +            typedef unsigned char ValueType;
+ +            typedef int ResultType;
+ +
+ +            /** Counts the bits set in a ^ b; delegates to normHamming.
+ +             */
+ +            ResultType operator()( const unsigned char *a, const unsigned char *b, int size ) const
+ +            {
+ +                return normHamming(a, b, size);
+ +            }
+ +        };
+ +
+ +        ////////////////////////////////// BruteForceMatcher //////////////////////////////////
+ +
+ +        class CV_EXPORTS BruteForceMatcher_OCL_base // brute-force descriptor matcher on oclMat data; metric chosen via DistType
+ +        {
+ +        public:
+ +            enum DistType {L1Dist = 0, L2Dist, HammingDist};
+ +            explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist);
+ +
+ +            // Add descriptors to the train descriptor collection
+ +            void add(const std::vector<oclMat> &descCollection);
+ +
+ +            // Get the train descriptor collection
+ +            const std::vector<oclMat> &getTrainDescriptors() const;
+ +
+ +            // Clear the train descriptor collection
+ +            void clear();
+ +
+ +            // Return true if there are no train descriptors in the collection
+ +            bool empty() const;
+ +
+ +            // Return true if the matcher supports a mask in the match methods
+ +            bool isMaskSupported() const;
+ +
+ +            // Find one best match for each query descriptor
+ +            void matchSingle(const oclMat &query, const oclMat &train,
+ +                             oclMat &trainIdx, oclMat &distance,
+ +                             const oclMat &mask = oclMat());
+ +
+ +            // Download trainIdx and distance and convert them to a CPU vector of DMatch
+ +            static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches);
+ +            // Convert trainIdx and distance to a vector of DMatch
+ +            static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector<DMatch> &matches);
+ +
+ +            // Find one best match for each query descriptor
+ +            void match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask = oclMat());
+ +
+ +            // Make a gpu collection of trains and masks in a format suitable for the matchCollection function
+ +            void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector<oclMat> &masks = std::vector<oclMat>());
+ +
+ +            // Find one best match from the train collection for each query descriptor
+ +            void matchCollection(const oclMat &query, const oclMat &trainCollection,
+ +                                 oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
+ +                                 const oclMat &masks = oclMat());
+ +
+ +            // Download trainIdx, imgIdx and distance and convert them to a vector of DMatch
+ +            static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches);
+ +            // Convert trainIdx, imgIdx and distance to a vector of DMatch
+ +            static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector<DMatch> &matches);
+ +
+ +            // Find one best match from the train collection for each query descriptor.
+ +            void match(const oclMat &query, std::vector<DMatch> &matches, const std::vector<oclMat> &masks = std::vector<oclMat>());
+ +
+ +            // Find the k best matches for each query descriptor (in increasing order of distances)
+ +            void knnMatchSingle(const oclMat &query, const oclMat &train,
+ +                                oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k,
+ +                                const oclMat &mask = oclMat());
+ +
+ +            // Download trainIdx and distance and convert them to a vector of DMatch.
+ +            // compactResult is used when mask is not empty. If compactResult is false, the matches
+ +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
+ +            // matches vector will not contain matches for fully masked out query descriptors.
+ +            static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance,
+ +                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
+ +            // Convert trainIdx and distance to a vector of DMatch
+ +            static void knnMatchConvert(const Mat &trainIdx, const Mat &distance,
+ +                                        std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
+ +
+ +            // Find the k best matches for each query descriptor (in increasing order of distances).
+ +            // compactResult is used when mask is not empty. If compactResult is false, the matches
+ +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
+ +            // matches vector will not contain matches for fully masked out query descriptors.
+ +            void knnMatch(const oclMat &query, const oclMat &train,
+ +                          std::vector< std::vector<DMatch> > &matches, int k, const oclMat &mask = oclMat(),
+ +                          bool compactResult = false);
+ +
+ +            // Find the k best matches from the train collection for each query descriptor (in increasing order of distances)
+ +            void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
+ +                                     oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
+ +                                     const oclMat &maskCollection = oclMat());
+ +
+ +            // Download trainIdx, imgIdx and distance and convert them to a vector of DMatch.
+ +            // compactResult is used when mask is not empty. If compactResult is false, the matches
+ +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
+ +            // matches vector will not contain matches for fully masked out query descriptors.
+ +            static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance,
+ +                                          std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
+ +            // Convert trainIdx, imgIdx and distance to a vector of DMatch
+ +            static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance,
+ +                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
+ +
+ +            // Find the k best matches for each query descriptor (in increasing order of distances).
+ +            // compactResult is used when mask is not empty. If compactResult is false, the matches
+ +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
+ +            // matches vector will not contain matches for fully masked out query descriptors.
+ +            void knnMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, int k,
+ +                          const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
+ +
+ +            // Find the best matches for each query descriptor which have a distance less than maxDistance.
+ +            // nMatches.at<int>(0, queryIdx) will contain the match count for queryIdx.
+ +            // Be careful: nMatches can be greater than trainIdx.cols - it means that the matcher didn't find all matches,
+ +            // because it didn't have enough memory.
+ +            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
+ +            // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
+ +            // The matches are not sorted.
+ +            void radiusMatchSingle(const oclMat &query, const oclMat &train,
+ +                                   oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
+ +                                   const oclMat &mask = oclMat());
+ +
+ +            // Download trainIdx, nMatches and distance and convert them to a vector of DMatch.
+ +            // The matches will be sorted in increasing order of distances.
+ +            // compactResult is used when mask is not empty. If compactResult is false, the matches
+ +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
+ +            // matches vector will not contain matches for fully masked out query descriptors.
+ +            static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
+ +                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
+ +            // Convert trainIdx, nMatches and distance to a vector of DMatch.
+ +            static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches,
+ +                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
+ +
+ +            // Find the best matches for each query descriptor which have a distance less than maxDistance
+ +            // (in increasing order of distances).
+ +            void radiusMatch(const oclMat &query, const oclMat &train,
+ +                             std::vector< std::vector<DMatch> > &matches, float maxDistance,
+ +                             const oclMat &mask = oclMat(), bool compactResult = false);
+ +
+ +            // Find the best matches for each query descriptor which have a distance less than maxDistance.
+ +            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
+ +            // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
+ +            // The matches are not sorted.
+ +            void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
+ +                                       const std::vector<oclMat> &masks = std::vector<oclMat>());
+ +
+ +            // Download trainIdx, imgIdx, nMatches and distance and convert them to a vector of DMatch.
+ +            // The matches will be sorted in increasing order of distances.
+ +            // compactResult is used when mask is not empty. If compactResult is false, the matches
+ +            // vector will have the same size as queryDescriptors rows. If compactResult is true, the
+ +            // matches vector will not contain matches for fully masked out query descriptors.
+ +            static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches,
+ +                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
+ +            // Convert trainIdx, imgIdx, nMatches and distance to a vector of DMatch.
+ +            static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches,
+ +                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
+ +
+ +            // Find the best matches from the train collection for each query descriptor which have a distance less than
+ +            // maxDistance (in increasing order of distances).
+ +            void radiusMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, float maxDistance,
+ +                             const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
+ +
+ +            DistType distType; // distance metric selected at construction
+ +
+ +        private:
+ +            std::vector<oclMat> trainDescCollection; // train descriptors; see add() / getTrainDescriptors() / clear()
+ +        };
+ +
+ +        template <class Distance>
+ +        class CV_EXPORTS BruteForceMatcher_OCL; // only declared here; usable through the L1<T>, L2<T> and Hamming specializations below
+ +
+ +        template <typename T>
+ +        class CV_EXPORTS BruteForceMatcher_OCL< L1<T> > : public BruteForceMatcher_OCL_base // selects L1Dist
+ +        {
+ +        public:
+ +            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {}
+ +            explicit BruteForceMatcher_OCL(L1<T> /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {} // the functor argument is ignored
+ +        };
+ +        template <typename T>
+ +        class CV_EXPORTS BruteForceMatcher_OCL< L2<T> > : public BruteForceMatcher_OCL_base // selects L2Dist
+ +        {
+ +        public:
+ +            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {}
+ +            explicit BruteForceMatcher_OCL(L2<T> /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {} // the functor argument is ignored
+ +        };
+ +        template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base // selects HammingDist
+ +        {
+ +        public:
+ +            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {}
+ +            explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {} // the functor argument is ignored
+ +        };
+ +
+ +        class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base // matcher whose metric is chosen from a NORM_* value at run time
+ +        {
+ +        public:
+ +            // NORM_L1 -> L1Dist, NORM_L2 -> L2Dist, any other value -> HammingDist
+ +            explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {}
+ +        };
+ +
++        class CV_EXPORTS GoodFeaturesToTrackDetector_OCL // corner detector functor on oclMat images with reusable internal buffers
++        {
++        public:
++            explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
++                int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
++
++            //! writes the detected corners as a 1-row matrix of CV_32FC2 type
++            void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
++            //! downloads points of type Point2f to a vector; the vector's previous content will be erased
++            void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
++
++            int maxCorners; // public parameters, named like the constructor arguments
++            double qualityLevel;
++            double minDistance;
++
++            int blockSize;
++            bool useHarrisDetector;
++            double harrisK;
++            void releaseMemory() // releases every private buffer declared below
++            {
++                Dx_.release();
++                Dy_.release();
++                eig_.release();
++                minMaxbuf_.release();
++                tmpCorners_.release();
++            }
++        private:
++            oclMat Dx_; // NOTE(review): Dx_/Dy_ presumably hold image derivatives - confirm in the implementation
++            oclMat Dy_;
++            oclMat eig_;
++            oclMat minMaxbuf_;
++            oclMat tmpCorners_;
++        };
++
++        // Copies every argument into the public parameter of the same name; the initializers
++        // are listed in member declaration order and the buffers are default-constructed.
++        inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
++            int blockSize_, bool useHarrisDetector_, double harrisK_)
++            : maxCorners(maxCorners_),
++              qualityLevel(qualityLevel_),
++              minDistance(minDistance_),
++              blockSize(blockSize_),
++              useHarrisDetector(useHarrisDetector_),
++              harrisK(harrisK_)
++        {
++        }
++
+ +        /////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
+ +
+ +        // Pyramidal Lucas-Kanade optical flow on oclMat images, with a sparse and a dense entry point.
+ +        // The object keeps its working buffers between calls; releaseMemory() frees all of them.
+ +        class CV_EXPORTS PyrLKOpticalFlow
+ +        {
+ +        public:
+ +            // Sets the default parameters.
+ +            PyrLKOpticalFlow()
+ +            {
+ +                winSize = Size(21, 21);
+ +                maxLevel = 3;
+ +                iters = 30;
+ +                derivLambda = 0.5;
+ +                useInitialFlow = false;
+ +                minEigThreshold = 1e-4f;
+ +                getMinEigenVals = false;
+ +                isDeviceArch11_ = false;
+ +            }
+ +
+ +            // Sparse flow: tracks prevPts from prevImg into nextImg; err is optional (may be 0).
+ +            void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts,
+ +                        oclMat &status, oclMat *err = 0);
+ +
+ +            // Dense flow: writes the flow field into u and v; err is optional (may be 0).
+ +            void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0);
+ +
+ +            Size winSize;
+ +            int maxLevel;
+ +            int iters;
+ +            double derivLambda;
+ +            bool useInitialFlow;
+ +            float minEigThreshold;
+ +            bool getMinEigenVals;
+ +
+ +            // Releases every internal buffer, including the uPyr_/vPyr_ pairs,
+ +            // so that no allocation survives the call.
+ +            void releaseMemory()
+ +            {
+ +                dx_calcBuf_.release();
+ +                dy_calcBuf_.release();
+ +
+ +                prevPyr_.clear();
+ +                nextPyr_.clear();
+ +
+ +                dx_buf_.release();
+ +                dy_buf_.release();
+ +
+ +                uPyr_[0].release();
+ +                vPyr_[0].release();
+ +
+ +                uPyr_[1].release();
+ +                vPyr_[1].release();
+ +            }
+ +
+ +        private:
+ +            void calcSharrDeriv(const oclMat &src, oclMat &dx, oclMat &dy);
+ +
+ +            void buildImagePyramid(const oclMat &img0, std::vector<oclMat> &pyr, bool withBorder);
+ +
+ +            oclMat dx_calcBuf_;
+ +            oclMat dy_calcBuf_;
+ +
+ +            std::vector<oclMat> prevPyr_;
+ +            std::vector<oclMat> nextPyr_;
+ +
+ +            oclMat dx_buf_;
+ +            oclMat dy_buf_;
+ +
+ +            oclMat uPyr_[2];
+ +            oclMat vPyr_[2];
+ +
+ +            bool isDeviceArch11_;
+ +        };
+ +        //////////////// build warping maps ////////////////////
+ +        //! builds plane warping maps (map_x / map_y) for the given destination ROI
+ +        CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, const Mat &T, float scale, oclMat &map_x, oclMat &map_y);
+ +        //! builds cylindrical warping maps (map_x / map_y) for the given destination ROI
+ +        CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y);
+ +        //! builds spherical warping maps (map_x / map_y) for the given destination ROI
+ +        CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y);
+ +        //! builds affine warping maps (xmap / ymap) of size dsize from the matrix M
+ +        CV_EXPORTS void buildWarpAffineMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap);
+ +
+ +        //! builds perspective warping maps (xmap / ymap) of size dsize from the matrix M
+ +        CV_EXPORTS void buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap);
+ +
+ +        ///////////////////////////////////// interpolate frames //////////////////////////////////////////////
+ +        //! Interpolates frames (images) using the provided optical flow (displacement field).
+ +        //! frame0   - frame 0 (32-bit floating point image, single channel)
+ +        //! frame1   - frame 1 (the same type and size)
+ +        //! fu       - forward horizontal displacement
+ +        //! fv       - forward vertical displacement
+ +        //! bu       - backward horizontal displacement
+ +        //! bv       - backward vertical displacement
+ +        //! pos      - new frame position
+ +        //! newFrame - new frame (output)
+ +        //! buf      - temporary buffer; it will have width x 6*height size and CV_32FC1 type, and contain 6 oclMat:
+ +        //!            occlusion masks            0, occlusion masks            1,
+ +        //!            interpolated forward flow  0, interpolated forward flow  1,
+ +        //!            interpolated backward flow 0, interpolated backward flow 1
+ +        //!
+ +        CV_EXPORTS void interpolateFrames(const oclMat &frame0, const oclMat &frame1,
+ +                                          const oclMat &fu, const oclMat &fv,
+ +                                          const oclMat &bu, const oclMat &bv,
+ +                                          float pos, oclMat &newFrame, oclMat &buf);
+ +
+ +        //! computes the moments of a rasterized shape or of a vector of points
+ +        CV_EXPORTS Moments ocl_moments(InputArray _array, bool binaryImage);
+ +
+ +        class CV_EXPORTS StereoBM_OCL // stereo correspondence functor; the constructor comment below mentions a SAD window
+ +        {
+ +        public:
+ +            enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
+ +
+ +            enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
+ +
+ +            //! the default constructor
+ +            StereoBM_OCL();
+ +            //! the full constructor taking the camera-specific preset, the number of disparities and the SAD window size. ndisparities must be a multiple of 8.
+ +            StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
+ +
+ +            //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair.
+ +            //! The output disparity has CV_8U type.
+ +            void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity);
+ +
+ +            //! A heuristic that tries to estimate
+ +            // whether the current GPU will be faster than the CPU for this algorithm.
+ +            // It queries the currently active device.
+ +            static bool checkIfGpuCallReasonable();
+ +
+ +            int preset;
+ +            int ndisp;
+ +            int winSize;
+ +
+ +            // If avergeTexThreshold == 0 => post-processing is disabled.
+ +            // If avergeTexThreshold != 0 then the disparity is set to 0 in each point (x,y) where, for the left image,
+ +            // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold,
+ +            // i.e. the input left image is low textured.
+ +            float avergeTexThreshold;
+ +        private:
+ +            oclMat minSSD, leBuf, riBuf; // internal working buffers
+ +        };
+ +
+ +        class CV_EXPORTS StereoBeliefPropagation // belief-propagation stereo functor computing a disparity map on oclMat data
+ +        {
+ +        public:
+ +            enum { DEFAULT_NDISP  = 64 };
+ +            enum { DEFAULT_ITERS  = 5  };
+ +            enum { DEFAULT_LEVELS = 5  };
+ +            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels); // ndisp, iters, levels are outputs
+ +            explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
+ +                                             int iters  = DEFAULT_ITERS,
+ +                                             int levels = DEFAULT_LEVELS,
+ +                                             int msg_type = CV_16S); // note: this overload defaults msg_type to CV_16S
+ +            StereoBeliefPropagation(int ndisp, int iters, int levels,
+ +                                    float max_data_term, float data_weight,
+ +                                    float max_disc_term, float disc_single_jump,
+ +                                    int msg_type = CV_32F); // note: this overload defaults msg_type to CV_32F
+ +            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); // disparity from a stereo pair
+ +            void operator()(const oclMat &data, oclMat &disparity); // NOTE(review): 'data' is presumably a precomputed data cost - confirm
+ +            int ndisp; // public parameters, named like the constructor arguments
+ +            int iters;
+ +            int levels;
+ +            float max_data_term;
+ +            float data_weight;
+ +            float max_disc_term;
+ +            float disc_single_jump;
+ +            int msg_type;
+ +        private:
+ +            oclMat u, d, l, r, u2, d2, l2, r2; // internal working buffers
+ +            std::vector<oclMat> datas;
+ +            oclMat out;
+ +        };
+ +
+ +        class CV_EXPORTS StereoConstantSpaceBP // constant-space belief-propagation stereo functor on oclMat data
+ +        {
+ +        public:
+ +            enum { DEFAULT_NDISP    = 128 };
+ +            enum { DEFAULT_ITERS    = 8   };
+ +            enum { DEFAULT_LEVELS   = 4   };
+ +            enum { DEFAULT_NR_PLANE = 4   };
+ +            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane); // the reference arguments are outputs
+ +            explicit StereoConstantSpaceBP(
+ +                int ndisp    = DEFAULT_NDISP,
+ +                int iters    = DEFAULT_ITERS,
+ +                int levels   = DEFAULT_LEVELS,
+ +                int nr_plane = DEFAULT_NR_PLANE,
+ +                int msg_type = CV_32F);
+ +            StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
+ +                float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
+ +                int min_disp_th = 0,
+ +                int msg_type = CV_32F);
+ +            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); // disparity from a stereo pair
+ +            int ndisp; // public parameters, named like the constructor arguments
+ +            int iters;
+ +            int levels;
+ +            int nr_plane;
+ +            float max_data_term;
+ +            float data_weight;
+ +            float max_disc_term;
+ +            float disc_single_jump;
+ +            int min_disp_th;
+ +            int msg_type;
+ +            bool use_local_init_data_cost; // not a constructor argument; NOTE(review): its initial value is set in the implementation - confirm
+ +        private:
+ +            oclMat u[2], d[2], l[2], r[2]; // internal working buffers
+ +            oclMat disp_selected_pyr[2];
+ +            oclMat data_cost;
+ +            oclMat data_cost_selected;
+ +            oclMat temp;
+ +            oclMat out;
+ +        };
+ +
+ +        // Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
+ +        //
+ +        // see references:
+ +        //   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+ +        //   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+ +        class CV_EXPORTS OpticalFlowDual_TVL1_OCL
+ +        {
+ +        public:
+ +            OpticalFlowDual_TVL1_OCL();
+ +
+ +            void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy); // flowx / flowy receive the flow between I0 and I1
+ +
+ +            void collectGarbage(); // NOTE(review): presumably releases the private buffers below - confirm in the implementation
+ +
+ +            /**
+ +            * Time step of the numerical scheme.
+ +            */
+ +            double tau;
+ +
+ +            /**
+ +            * Weight parameter for the data term, attachment parameter.
+ +            * This is the most relevant parameter, which determines the smoothness of the output.
+ +            * The smaller this parameter is, the smoother the solutions we obtain.
+ +            * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
+ +            */
+ +            double lambda;
+ +
+ +            /**
+ +            * Weight parameter for (u - v)^2, tightness parameter.
+ +            * It serves as a link between the attachment and the regularization terms.
+ +            * In theory, it should have a small value in order to maintain both parts in correspondence.
+ +            * The method is stable for a large range of values of this parameter.
+ +            */
+ +            double theta;
+ +
+ +            /**
+ +            * Number of scales used to create the pyramid of images.
+ +            */
+ +            int nscales;
+ +
+ +            /**
+ +            * Number of warpings per scale.
+ +            * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
+ +            * This is a parameter that assures the stability of the method.
+ +            * It also affects the running time, so it is a compromise between speed and accuracy.
+ +            */
+ +            int warps;
+ +
+ +            /**
+ +            * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
+ +            * A small value will yield more accurate solutions at the expense of a slower convergence.
+ +            */
+ +            double epsilon;
+ +
+ +            /**
+ +            * Stopping criterion iterations number used in the numerical scheme.
+ +            */
+ +            int iterations;
+ +
+ +            bool useInitialFlow; // NOTE(review): presumably makes operator() start from the incoming flowx/flowy - confirm
+ +
+ +        private:
+ +            void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2); // processes a single pyramid scale
+ +
+ +            std::vector<oclMat> I0s; // per-scale storage (one entry per pyramid level, presumably nscales entries)
+ +            std::vector<oclMat> I1s;
+ +            std::vector<oclMat> u1s;
+ +            std::vector<oclMat> u2s;
+ +
+ +            oclMat I1x_buf; // internal working buffers
+ +            oclMat I1y_buf;
+ +
+ +            oclMat I1w_buf;
+ +            oclMat I1wx_buf;
+ +            oclMat I1wy_buf;
+ +
+ +            oclMat grad_buf;
+ +            oclMat rho_c_buf;
+ +
+ +            oclMat p11_buf;
+ +            oclMat p12_buf;
+ +            oclMat p21_buf;
+ +            oclMat p22_buf;
+ +
+ +            oclMat diff_buf;
+ +            oclMat norm_buf;
+ +        };
+ +    }
+ +}
+ +#if defined _MSC_VER && _MSC_VER >= 1200
+ +#  pragma warning( push)
+ +#  pragma warning( disable: 4267)
+ +#endif
+ +#include "opencv2/ocl/matrix_operations.hpp"
+ +#if defined _MSC_VER && _MSC_VER >= 1200
+ +#  pragma warning( pop)
+ +#endif
+ +
+ +#endif /* __OPENCV_OCL_HPP__ */
diff --cc modules/ocl/include/opencv2/ocl/private/util.hpp

index 355e3b5,634f2f2..9daec2e
--- 1/modules/ocl/include/opencv2/ocl/private/util.hpp
--- 2/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@@ -46,8 -46,10 +46,8 @@@
   #ifndef __OPENCV_OCL_PRIVATE_UTIL__
   #define __OPENCV_OCL_PRIVATE_UTIL__
   
- -#include "opencv2/ocl/ocl.hpp"
- -
   #if defined __APPLE__
- #include <OpenCL/OpenCL.h>
+ #include <OpenCL/opencl.h>
   #else
   #include <CL/opencl.h>
   #endif
diff --cc modules/ocl/perf/perf_canny.cpp
Simple merge
diff --cc modules/ocl/perf/perf_color.cpp

index 1faef40,daf1cfd..44dc8f8
--- 1/modules/ocl/perf/perf_color.cpp
--- 2/modules/ocl/perf/perf_color.cpp
+++ b/modules/ocl/perf/perf_color.cpp
@@@ -70,22 -70,20 +70,20 @@@ PERFTEST(cvtColor
               d_src.upload(src);
   
               WARMUP_ON;
- -            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+ +            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
               WARMUP_OFF;
   
-             cv::Mat ocl_mat;
-             d_dst.download(ocl_mat);
-             TestSystem::instance().setAccurate(ExceptedMatSimilar(dst, ocl_mat, 1e-5));
- 
               GPU_ON;
- -            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+ +            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
               GPU_OFF;
   
               GPU_FULL_ON;
               d_src.upload(src);
- -            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+ +            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
-             d_dst.download(dst);
+             d_dst.download(ocl_dst);
               GPU_FULL_OFF;
+ 
+             TestSystem::instance().ExceptedMatSimilar(dst, ocl_dst, 1e-5);
           }
   
   
diff --cc modules/ocl/perf/perf_haar.cpp
Simple merge
diff --cc modules/ocl/perf/perf_imgproc.cpp

index 9bba2b0,18c7429..ccfd18c
--- 1/modules/ocl/perf/perf_imgproc.cpp
--- 2/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@@@ -1044,11 -1029,13 +1029,13 @@@ PERFTEST(meanShiftProc
   
           GPU_FULL_ON;
           d_src.upload(src);
-         ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
-         d_dst.download(dst);
-         d_dstCoor_roi.download(dstCoor_roi);
+         ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
+         d_dst.download(ocl_dst[0]);
+         d_dstCoor.download(ocl_dst[1]);
           GPU_FULL_OFF;
   
- -        TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);      
+         vector<double> eps(2, 0.);
++        TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
       }
   }
   
diff --cc modules/ocl/perf/perf_match_template.cpp

index 396cb0b,5da15aa..1330e26
--- 1/modules/ocl/perf/perf_match_template.cpp
--- 2/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@@@ -76,32 -74,30 +74,30 @@@ PERFTEST(matchTemplate
   
                   gen(templ, templ_size, templ_size, all_type[j], 0, 1);
   
- -                matchTemplate(src, templ, dst, CV_TM_CCORR);
+ +                matchTemplate(src, templ, dst, TM_CCORR);
   
                   CPU_ON;
- -                matchTemplate(src, templ, dst, CV_TM_CCORR);
+ +                matchTemplate(src, templ, dst, TM_CCORR);
                   CPU_OFF;
   
-                 ocl::oclMat d_src(src), d_templ, d_dst;
- 
-                 d_templ.upload(templ);
+                 ocl::oclMat d_src(src), d_templ(templ), d_dst;
   
                   WARMUP_ON;
- -                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+ +                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
                   WARMUP_OFF;
   
-                 TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), templ.rows * templ.cols * 1e-1));
- 
                   GPU_ON;
- -                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+ +                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
                   GPU_OFF;
   
                   GPU_FULL_ON;
                   d_src.upload(src);
                   d_templ.upload(templ);
- -                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+ +                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
-                 d_dst.download(dst);
+                 d_dst.download(ocl_dst);
                   GPU_FULL_OFF;
+ 
+                 TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
               }
           }
   
@@@ -128,21 -124,21 +124,21 @@@
                   ocl::oclMat d_templ(templ), d_dst;
   
                   WARMUP_ON;
- -                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+ +                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
                   WARMUP_OFF;
   
-                 TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), templ.rows * templ.cols * 1e-1));
- 
                   GPU_ON;
- -                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+ +                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
                   GPU_OFF;
   
                   GPU_FULL_ON;
                   d_src.upload(src);
                   d_templ.upload(templ);
- -                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+ +                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
-                 d_dst.download(dst);
+                 d_dst.download(ocl_dst);
                   GPU_FULL_OFF;
+ 
+                 TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
               }
           }
       }
diff --cc modules/ocl/perf/precomp.cpp

index 65f8f8a,71a13a1..65e2d51
--- 1/modules/ocl/perf/precomp.cpp
--- 2/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@@@ -414,9 -413,12 +413,12 @@@ void TestSystem::writeMetrics(double cp
           exit(-1);
       }
   
-     fprintf(record_, "%s,%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
- -    fprintf(record_, "%s,%s,%s,%.2f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", 
++    fprintf(record_, "%s,%s,%s,%.2f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n",
+         itname_changed_ ? itname_.c_str() : "",
           cur_subtest_description_.str().c_str(),
-         _is_accurate_.c_str(), cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
- -        _is_accurate_.c_str(), 
++        _is_accurate_.c_str(),
+         accurate_diff_,
+         cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
           gpu_min, gpu_max, std_dev);
   
       if (itname_changed_)
@@@ -616,34 -499,10 +490,6 @@@ double checkNorm(const Mat &m1, const M
   double checkSimilarity(const Mat &m1, const Mat &m2)
   {
       Mat diff;
- -    matchTemplate(m1, m2, diff, CV_TM_CCORR_NORMED);
+ +    matchTemplate(m1, m2, diff, TM_CCORR_NORMED);
       return std::abs(diff.at<float>(0, 0) - 1.f);
   }
- 
--
- int ExpectedMatNear(cv::Mat dst, cv::Mat cpu_dst, double eps)
- {
-     assert(dst.type() == cpu_dst.type());
-     assert(dst.size() == cpu_dst.size());
-     if(checkNorm(cv::Mat(dst), cv::Mat(cpu_dst)) < eps ||checkNorm(cv::Mat(dst), cv::Mat(cpu_dst)) == eps)
-         return 1;
-     return 0;
- }
--
- int ExceptDoubleNear(double val1, double val2, double abs_error)
- {
-     const double diff = fabs(val1 - val2);
-     if (diff <= abs_error)
-         return 1;
--
-     return 0;
- }
--
- int ExceptedMatSimilar(cv::Mat dst, cv::Mat cpu_dst, double eps)
- {
-     assert(dst.type() == cpu_dst.type());
-     assert(dst.size() == cpu_dst.size());
-     if(checkSimilarity(cv::Mat(cpu_dst), cv::Mat(dst)) <= eps)
-         return 1;
-     return 0;
- }
diff --cc modules/ocl/perf/precomp.hpp

index 2212939,97e3d7e..385320b
--- 1/modules/ocl/perf/precomp.hpp
--- 2/modules/ocl/perf/precomp.hpp
+++ b/modules/ocl/perf/precomp.hpp
@@@ -322,9 -313,46 +322,46 @@@ public
           itname_changed_ = true;
       }
   
-     void setAccurate(int is_accurate = -1)
+     void setAccurate(int accurate, double diff)
       {
-         is_accurate_ = is_accurate;
+         is_accurate_ = accurate;
+         accurate_diff_ = diff;
+     }
+ 
+     void ExpectMatsNear(vector<Mat>& dst, vector<Mat>& cpu_dst, vector<double>& eps)
+     {
+         assert(dst.size() == cpu_dst.size());
+         assert(cpu_dst.size() == eps.size());
+         is_accurate_ = 1;
+         for(size_t i=0; i<dst.size(); i++)
+         {
+             double cur_diff = checkNorm(dst[i], cpu_dst[i]);
+             accurate_diff_ = max(accurate_diff_, cur_diff);
+             if(cur_diff > eps[i])
+                 is_accurate_ = 0;
+         }
+     }
+ 
+     void ExpectedMatNear(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
+     {
+         assert(dst.type() == cpu_dst.type());
+         assert(dst.size() == cpu_dst.size());
+         accurate_diff_ = checkNorm(dst, cpu_dst);
+         if(accurate_diff_ <= eps)
+             is_accurate_ = 1;
+         else
+             is_accurate_ = 0;
+     }
+ 
+     void ExceptedMatSimilar(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
+     {
+         assert(dst.type() == cpu_dst.type());
+         assert(dst.size() == cpu_dst.size());
+         accurate_diff_ = checkSimilarity(cpu_dst, dst);
+         if(accurate_diff_ <= eps)
+             is_accurate_ = 1;
+         else
- -            is_accurate_ = 0;    
++            is_accurate_ = 0;
       }
   
       std::stringstream &getCurSubtestDescription()
@@@ -341,8 -369,8 +378,8 @@@ private
           speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0), is_list_mode_(false),
           num_iters_(10), cpu_num_iters_(2),
           gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
- -        record_(0), recordname_("performance"), itname_changed_(true), 
+ +        record_(0), recordname_("performance"), itname_changed_(true),
-         is_accurate_(-1)
+         is_accurate_(-1), accurate_diff_(0.)
       {
           cpu_times_.reserve(num_iters_);
           gpu_times_.reserve(num_iters_);
diff --cc modules/ocl/src/arithm.cpp

index d425344,49a56ce..1778c98
--- 1/modules/ocl/src/arithm.cpp
--- 2/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@@ -401,23 -402,23 +401,23 @@@ static void arithmetic_scalar_run(cons
                                 };
   
       int dst_step1 = dst.cols * dst.elemSize();
- -    vector<pair<size_t , const void *> > args;
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
+ +    std::vector<std::pair<size_t , const void *> > args;
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
   
+     float f_scalar = (float)scalar;
       if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
- -        args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
+ +        args.push_back( std::make_pair( sizeof(cl_double), (void *)&scalar ));
       else
       {
-         float f_scalar = (float)scalar;
- -        args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar));
+ +        args.push_back( std::make_pair( sizeof(cl_float), (void *)&f_scalar));
       }
   
       openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
@@@ -1674,25 -1683,26 +1684,26 @@@ void bitwise_run(const oclMat &src1, co
                                 };
   
       int dst_step1 = dst.cols * dst.elemSize();
- -    vector<pair<size_t , const void *> > args;
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
+ +    std::vector<std::pair<size_t , const void *> > args;
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
   
+     T scalar;
       if(_scalar != NULL)
       {
           double scalar1 = *((double *)_scalar);
-         T scalar = (T)scalar1;
+         scalar = (T)scalar1;
- -        args.push_back( make_pair( sizeof(T), (void *)&scalar ));
+ +        args.push_back( std::make_pair( sizeof(T), (void *)&scalar ));
       }
   
       openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, _opt);
@@@ -2298,23 -2308,23 +2309,23 @@@ static void arithmetic_pow_run(const oc
                                 };
   
       int dst_step1 = dst.cols * dst.elemSize();
- -    vector<pair<size_t , const void *> > args;
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
+ +    std::vector<std::pair<size_t , const void *> > args;
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
+     float pf = p;
       if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
       {
-         float pf = p;
- -        args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
+ +        args.push_back( std::make_pair( sizeof(cl_float), (void *)&pf ));
       }
       else
- -        args.push_back( make_pair( sizeof(cl_double), (void *)&p ));
+ +        args.push_back( std::make_pair( sizeof(cl_double), (void *)&p ));
   
       openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
   }
diff --cc modules/ocl/src/brute_force_matcher.cpp
Simple merge
diff --cc modules/ocl/src/canny.cpp

index e06d299,82bb01b..abe98cf
--- 1/modules/ocl/src/canny.cpp
--- 2/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@@@ -330,10 -325,10 +322,10 @@@ void canny::calcMap_gpu(oclMat &dx, ocl
   
   
       size_t globalThreads[3] = {cols, rows, 1};
- -    string kernelName = "calcMap";
+ +    String kernelName = "calcMap";
       size_t localThreads[3]  = {16, 16, 1};
   
-     openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+     openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
   }
   
   void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols)
@@@ -373,17 -368,17 +365,17 @@@ void canny::edgesHysteresisGlobal_gpu(o
   
           args.clear();
           size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
- -        args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
- -        args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
- -        args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data));
- -        args.push_back( make_pair( sizeof(cl_mem), (void *)&counter));
- -        args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
- -        args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
- -        args.push_back( make_pair( sizeof(cl_int), (void *)&count));
- -        args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
- -        args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+ +        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
+ +        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
+ +        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st2.data));
+ +        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter));
+ +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
+ +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
+ +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&count));
+ +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
+ +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
   
-         openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
+         openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
           openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
           std::swap(st1, st2);
       }
diff --cc modules/ocl/src/filtering.cpp

index a98443d,56a70ae..d7b6267
--- 1/modules/ocl/src/filtering.cpp
--- 2/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@@ -354,23 -354,22 +354,22 @@@ static void GPUDilate(const oclMat &src
       }
   
       char compile_option[128];
- -    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s", 
- -        anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], 
+ +    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s",
+ +        anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
-         rectKernel?"-D RECTKERNEL":"",
-         s);
+         s, rectKernel?"-D RECTKERNEL":"");
- -    vector< pair<size_t, const void *> > args;
- -    args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
- -    args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_x));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&srcOffset_y));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
- -    args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
- -    args.push_back(make_pair(sizeof(cl_int), (void *)&dstOffset));
+ +    std::vector< std::pair<size_t, const void *> > args;
+ +    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
+ +    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcOffset_x));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcOffset_y));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcStep));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dstStep));
+ +    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
+ +    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dstOffset));
       openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
   }
   
diff --cc modules/ocl/src/gfft.cpp

index 0000000,af7580b..c9376f9

mode 000000,100644..100644
--- /dev/null
--- 2/modules/ocl/src/gfft.cpp
+++ b/modules/ocl/src/gfft.cpp
@@@ -1,0 -1,351 +1,349 @@@
- -        std::string kernelname = "sortCorners_bitonicSort";
+ /*M///////////////////////////////////////////////////////////////////////////////////////
+ //
+ //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+ //
+ //  By downloading, copying, installing or using the software you agree to this license.
+ //  If you do not agree to this license, do not download, install,
+ //  copy or use the software.
+ //
+ //
+ //                           License Agreement
+ //                For Open Source Computer Vision Library
+ //
+ // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+ // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+ // Third party copyrights are property of their respective owners.
+ //
+ // @Authors
+ //    Peng Xiao, pengxiao@outlook.com
+ //
+ // Redistribution and use in source and binary forms, with or without modification,
+ // are permitted provided that the following conditions are met:
+ //
+ //   * Redistribution's of source code must retain the above copyright notice,
+ //     this list of conditions and the following disclaimer.
+ //
+ //   * Redistribution's in binary form must reproduce the above copyright notice,
+ //     this list of conditions and the following disclaimer in the documentation
+ //     and/or other materials provided with the distribution.
+ //
+ //   * The name of the copyright holders may not be used to endorse or promote products
+ //     derived from this software without specific prior written permission.
+ //
+ // This software is provided by the copyright holders and contributors as is and
+ // any express or implied warranties, including, but not limited to, the implied
+ // warranties of merchantability and fitness for a particular purpose are disclaimed.
+ // In no event shall the Intel Corporation or contributors be liable for any direct,
+ // indirect, incidental, special, exemplary, or consequential damages
+ // (including, but not limited to, procurement of substitute goods or services;
+ // loss of use, data, or profits; or business interruption) however caused
+ // and on any theory of liability, whether in contract, strict liability,
+ // or tort (including negligence or otherwise) arising in any way out of
+ // the use of this software, even if advised of the possibility of such damage.
+ //
+ //M*/
+ #include <iomanip>
+ #include "precomp.hpp"
+ 
+ using namespace cv;
+ using namespace cv::ocl;
+ 
+ static bool use_cpu_sorter = true;
+ 
+ namespace cv
+ {
+     namespace ocl
+     {
+         ///////////////////////////OpenCL kernel strings///////////////////////////
+         extern const char *imgproc_gfft;
+     }
+ }
+ 
+ namespace
+ {
+ enum SortMethod
+ {
+     CPU_STL,
+     BITONIC,
+     SELECTION
+ };
+ 
+ const int GROUP_SIZE = 256;
+ 
+ template<SortMethod method>
+ struct Sorter
+ {
+     //typedef EigType;
+ };
+ 
+ //TODO(pengx): optimize the GPU sorter's performance so that the CPU sorter can be removed.
+ template<>
+ struct Sorter<CPU_STL>
+ {
+     typedef oclMat EigType;
+     static cv::Mutex cs;
+     static Mat mat_eig;
+ 
+     //prototype
+     static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
+     {
+         float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
+         float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
+         return v1 > v2;
+     }
+     static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+     {
+         cv::AutoLock lock(cs);
+         //temporarily use STL's sort function
+         Mat mat_corners = corners;
+         mat_eig = eig_tex;
+         std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
+         corners = mat_corners;
+     }
+ };
+ cv::Mutex Sorter<CPU_STL>::cs;
+ cv::Mat   Sorter<CPU_STL>::mat_eig;
+ 
+ template<>
+ struct Sorter<BITONIC>
+ {
+     typedef TextureCL EigType;
+ 
+     static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+     {
+         Context * cxt = Context::getContext();
+         size_t globalThreads[3] = {count / 2, 1, 1};
+         size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+ 
+         // 2^numStages should be equal to count or the output is invalid
+         int numStages = 0;
+         for(int i = count; i > 1; i >>= 1)
+         {
+             ++numStages;
+         }
+         const int argc = 5;
+         std::vector< std::pair<size_t, const void *> > args(argc);
- -        
++        String kernelname = "sortCorners_bitonicSort";
+         args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
+         args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
+         args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
+         for(int stage = 0; stage < numStages; ++stage)
+         {
+             args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
+             for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
+             {
+                 args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
+                 openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+             }
+         }
+     }
+ };
+ 
+ template<>
+ struct Sorter<SELECTION>
+ {
+     typedef TextureCL EigType;
+ 
+     static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+     {
+         Context * cxt = Context::getContext();
- -        std::string kernelname = "sortCorners_selectionSortLocal";
++
+         size_t globalThreads[3] = {count, 1, 1};
+         size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+ 
+         std::vector< std::pair<size_t, const void *> > args;
+         //local
- -    const TextureCL& eig, 
++        String kernelname = "sortCorners_selectionSortLocal";
+         int lds_size = GROUP_SIZE * sizeof(cl_float2);
+         args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
+         args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
+         args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
+         args.push_back( std::make_pair( lds_size,       (void*)NULL) );
+ 
+         openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+ 
+         //final
+         kernelname = "sortCorners_selectionSortFinal";
+         args.pop_back();
+         openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+     }
+ };
+ 
+ int findCorners_caller(
- -    std::string kernelname = "findCorners";
++    const TextureCL& eig,
+     const float threshold,
+     const oclMat& mask,
+     oclMat& corners,
+     const int max_count)
+ {
+     std::vector<int> k;
+     Context * cxt = Context::getContext();
+ 
+     std::vector< std::pair<size_t, const void*> > args;
- -    args.push_back(make_pair( sizeof(cl_mem),   (void*)&eig  ));
- -    args.push_back(make_pair( sizeof(cl_mem),   (void*)&mask.data ));
- -    args.push_back(make_pair( sizeof(cl_mem),   (void*)&corners.data ));
- -    args.push_back(make_pair( sizeof(cl_int),   (void*)&mask_strip));
- -    args.push_back(make_pair( sizeof(cl_float), (void*)&threshold ));
- -    args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows ));
- -    args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols ));
- -    args.push_back(make_pair( sizeof(cl_int), (void*)&max_count ));
- -    args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
++    String kernelname = "findCorners";
+ 
+     const int mask_strip = mask.step / mask.elemSize1();
+ 
+     oclMat g_counter(1, 1, CV_32SC1);
+     g_counter.setTo(0);
+ 
- -    
++    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&eig  ));
++    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&mask.data ));
++    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&corners.data ));
++    args.push_back(std::make_pair( sizeof(cl_int),   (void*)&mask_strip));
++    args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold ));
++    args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows ));
++    args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols ));
++    args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count ));
++    args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
+ 
+     size_t globalThreads[3] = {eig.cols, eig.rows, 1};
+     size_t localThreads[3]  = {16, 16, 1};
+ 
+     const char * opt = mask.empty() ? "" : "-D WITH_MASK";
+     openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1, opt);
+     return std::min(Mat(g_counter).at<int>(0), max_count);
+ }
+ }//unnamed namespace
+ 
+ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
+ {
+     CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
+     CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
+ 
+     CV_DbgAssert(support_image2d());
+ 
+     ensureSizeIsEnough(image.size(), CV_32F, eig_);
+ 
+     if (useHarrisDetector)
+         cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
+     else
+         cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
+ 
+     double maxVal = 0;
+     minMax_buf(eig_, 0, &maxVal, oclMat(), minMaxbuf_);
+ 
+     ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
+ 
+     Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
+     int total = findCorners_caller(
+         *eig_tex,
+         static_cast<float>(maxVal * qualityLevel),
+         mask,
+         tmpCorners_,
+         tmpCorners_.cols);
+ 
+     if (total == 0)
+     {
+         corners.release();
+         return;
+     }
+     if(use_cpu_sorter)
+     {
+         Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
+     }
+     else
+     {
+         //if total is power of 2
+         if(((total - 1) & (total)) == 0)
+         {
+             Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
+         }
+         else
+         {
+             Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
+         }
+     }
- -        vector<Point2f> tmp(total);
++
+     if (minDistance < 1)
+     {
+         corners = tmpCorners_(Rect(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1));
+     }
+     else
+     {
- -        vector<Point2f> tmp2;
++        std::vector<Point2f> tmp(total);
+         downloadPoints(tmpCorners_, tmp);
+ 
- -                    vector<Point2f>& m = grid[yy * grid_width + xx];
++        std::vector<Point2f> tmp2;
+         tmp2.reserve(total);
+ 
+         const int cell_size = cvRound(minDistance);
+         const int grid_width = (image.cols + cell_size - 1) / cell_size;
+         const int grid_height = (image.rows + cell_size - 1) / cell_size;
+ 
+         std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
+ 
+         for (int i = 0; i < total; ++i)
+         {
+             Point2f p = tmp[i];
+ 
+             bool good = true;
+ 
+             int x_cell = static_cast<int>(p.x / cell_size);
+             int y_cell = static_cast<int>(p.y / cell_size);
+ 
+             int x1 = x_cell - 1;
+             int y1 = y_cell - 1;
+             int x2 = x_cell + 1;
+             int y2 = y_cell + 1;
+ 
+             // boundary check
+             x1 = std::max(0, x1);
+             y1 = std::max(0, y1);
+             x2 = std::min(grid_width - 1, x2);
+             y2 = std::min(grid_height - 1, y2);
+ 
+             for (int yy = y1; yy <= y2; yy++)
+             {
+                 for (int xx = x1; xx <= x2; xx++)
+                 {
- -void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector<Point2f> &points_v)
++                    std::vector<Point2f>& m = grid[yy * grid_width + xx];
+ 
+                     if (!m.empty())
+                     {
+                         for(size_t j = 0; j < m.size(); j++)
+                         {
+                             float dx = p.x - m[j].x;
+                             float dy = p.y - m[j].y;
+ 
+                             if (dx * dx + dy * dy < minDistance * minDistance)
+                             {
+                                 good = false;
+                                 goto break_out;
+                             }
+                         }
+                     }
+                 }
+             }
+ 
+             break_out:
+ 
+             if(good)
+             {
+                 grid[y_cell * grid_width + x_cell].push_back(p);
+ 
+                 tmp2.push_back(p);
+ 
+                 if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
+                     break;
+             }
+         }
+ 
+         corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
+     }
+ }
- -        *reinterpret_cast<cl_command_queue*>(getoclCommandQueue()), 
- -        reinterpret_cast<cl_mem>(points.data), 
- -        CL_TRUE,                                    
- -        0, 
- -        points.cols * sizeof(Point2f), 
- -        &points_v[0], 
- -        0, 
- -        NULL, 
++void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector<Point2f> &points_v)
+ {
+     CV_DbgAssert(points.type() == CV_32FC2);
+     points_v.resize(points.cols);
+     openCLSafeCall(clEnqueueReadBuffer(
- -
- -
++        *reinterpret_cast<cl_command_queue*>(getoclCommandQueue()),
++        reinterpret_cast<cl_mem>(points.data),
++        CL_TRUE,
++        0,
++        points.cols * sizeof(Point2f),
++        &points_v[0],
++        0,
++        NULL,
+         NULL));
+ }
diff --cc modules/ocl/src/haar.cpp

index 09e1816,565270c..8fb6956
--- 1/modules/ocl/src/haar.cpp
--- 2/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@@ -1025,25 -861,27 +860,27 @@@ CvSeq *cv::ocl::OclCascadeClassifier::o
           pq.s[3] = gcascade->pq3;
           float correction = gcascade->inv_window_area;
   
- -        vector<pair<size_t, const void *> > args;
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&loopcount ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode ));
- -        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p ));
- -        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
- -        args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
+ +        std::vector<std::pair<size_t, const void *> > args;
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&pixelstep ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitnode ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&p ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq ));
+ +        args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));
   
-         openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
+         const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+ 
+         openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
   
           openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
   
@@@ -1149,15 -989,14 +986,14 @@@
               int startnodenum = nodenum * i;
               float factor2 = (float)factor;
   
- -            vector<pair<size_t, const void *> > args1;
- -            args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
- -            args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&newnodebuffer ));
- -            args1.push_back ( make_pair(sizeof(cl_float) , (void *)&factor2 ));
- -            args1.push_back ( make_pair(sizeof(cl_float) , (void *)&correction[i] ));
- -            args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum ));
+ +            std::vector<std::pair<size_t, const void *> > args1;
+ +            args1.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
+ +            args1.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&newnodebuffer ));
+ +            args1.push_back ( std::make_pair(sizeof(cl_float) , (void *)&factor2 ));
+ +            args1.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction[i] ));
+ +            args1.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnodenum ));
   
               size_t globalThreads2[3] = {nodenum, 1, 1};
- 
               openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
           }
   
@@@ -1174,26 -1013,26 +1010,26 @@@
           correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
           openCLSafeCall(clEnqueueWriteBuffer(qu, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
   
- -        vector<pair<size_t, const void *> > args;
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&newnodebuffer ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&loopcount ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&pbuffer ));
- -        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
- -        args.push_back ( make_pair(sizeof(cl_int) , (void *)&nodenum ));
- -        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+ +        std::vector<std::pair<size_t, const void *> > args;
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&newnodebuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&step ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&pbuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
+ +        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&nodenum ));
+ +
-         openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
+         openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
   
           candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
   
@@@ -1814,145 -1683,3 +1680,5 @@@ void cv::ocl::OclCascadeClassifierBuf::
   #ifndef _MAX_PATH
   #define _MAX_PATH 1024
   #endif
- 
- /****************************************************************************************\
- *                                  Persistence functions                                 *
- \****************************************************************************************/
- 
- /* field names */
- 
- #define ICV_HAAR_SIZE_NAME            "size"
- #define ICV_HAAR_STAGES_NAME          "stages"
- #define ICV_HAAR_TREES_NAME             "trees"
- #define ICV_HAAR_FEATURE_NAME             "feature"
- #define ICV_HAAR_RECTS_NAME                 "rects"
- #define ICV_HAAR_TILTED_NAME                "tilted"
- #define ICV_HAAR_THRESHOLD_NAME           "threshold"
- #define ICV_HAAR_LEFT_NODE_NAME           "left_node"
- #define ICV_HAAR_LEFT_VAL_NAME            "left_val"
- #define ICV_HAAR_RIGHT_NODE_NAME          "right_node"
- #define ICV_HAAR_RIGHT_VAL_NAME           "right_val"
- #define ICV_HAAR_STAGE_THRESHOLD_NAME   "stage_threshold"
- #define ICV_HAAR_PARENT_NAME            "parent"
- #define ICV_HAAR_NEXT_NAME              "next"
- 
- static int gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade, CvPoint pt, int start_stage */)
- {
-     return 1;
- }
- 
- namespace cv
- {
- namespace ocl
- {
- 
- struct gpuHaarDetectObjects_ScaleImage_Invoker
- {
-     gpuHaarDetectObjects_ScaleImage_Invoker( const CvHaarClassifierCascade *_cascade,
-             int _stripSize, double _factor,
-             const Mat &_sum1, const Mat &_sqsum1, Mat *_norm1,
-             Mat *_mask1, Rect _equRect, ConcurrentRectVector &_vec )
-     {
-         cascade = _cascade;
-         stripSize = _stripSize;
-         factor = _factor;
-         sum1 = _sum1;
-         sqsum1 = _sqsum1;
-         norm1 = _norm1;
-         mask1 = _mask1;
-         equRect = _equRect;
-         vec = &_vec;
-     }
- 
-     void operator()( const BlockedRange &range ) const
-     {
-         Size winSize0 = cascade->orig_window_size;
-         Size winSize(cvRound(winSize0.width * factor), cvRound(winSize0.height * factor));
-         int y1 = range.begin() * stripSize, y2 = std::min(range.end() * stripSize, sum1.rows - 1 - winSize0.height);
-         Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1);
-         int x, y, ystep = factor > 2 ? 1 : 2;
- 
-         for( y = y1; y < y2; y += ystep )
-             for( x = 0; x < ssz.width; x += ystep )
-             {
-                 if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 )
-                     vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor),
-                                         winSize.width, winSize.height));
-             }
-     }
- 
-     const CvHaarClassifierCascade *cascade;
-     int stripSize;
-     double factor;
-     Mat sum1, sqsum1, *norm1, *mask1;
-     Rect equRect;
-     ConcurrentRectVector *vec;
- };
- 
- 
- struct gpuHaarDetectObjects_ScaleCascade_Invoker
- {
-     gpuHaarDetectObjects_ScaleCascade_Invoker( const CvHaarClassifierCascade *_cascade,
-             Size _winsize, const Range &_xrange, double _ystep,
-             size_t _sumstep, const int **_p, const int **_pq,
-             ConcurrentRectVector &_vec )
-     {
-         cascade = _cascade;
-         winsize = _winsize;
-         xrange = _xrange;
-         ystep = _ystep;
-         sumstep = _sumstep;
-         p = _p;
-         pq = _pq;
-         vec = &_vec;
-     }
- 
-     void operator()( const BlockedRange &range ) const
-     {
-         int iy, startY = range.begin(), endY = range.end();
-         const int *p0 = p[0], *p1 = p[1], *p2 = p[2], *p3 = p[3];
-         const int *pq0 = pq[0], *pq1 = pq[1], *pq2 = pq[2], *pq3 = pq[3];
-         bool doCannyPruning = p0 != 0;
-         int sstep = (int)(sumstep / sizeof(p0[0]));
- 
-         for( iy = startY; iy < endY; iy++ )
-         {
-             int ix, y = cvRound(iy * ystep), ixstep = 1;
-             for( ix = xrange.start; ix < xrange.end; ix += ixstep )
-             {
-                 int x = cvRound(ix * ystep); // it should really be ystep, not ixstep
- 
-                 if( doCannyPruning )
-                 {
-                     int offset = y * sstep + x;
-                     int s = p0[offset] - p1[offset] - p2[offset] + p3[offset];
-                     int sq = pq0[offset] - pq1[offset] - pq2[offset] + pq3[offset];
-                     if( s < 100 || sq < 20 )
-                     {
-                         ixstep = 2;
-                         continue;
-                     }
-                 }
- 
-                 int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */);
-                 if( result > 0 )
-                     vec->push_back(Rect(x, y, winsize.width, winsize.height));
-                 ixstep = result != 0 ? 1 : 2;
-             }
-         }
-     }
- 
-     const CvHaarClassifierCascade *cascade;
-     double ystep;
-     size_t sumstep;
-     Size winsize;
-     Range xrange;
-     const int **p;
-     const int **pq;
-     ConcurrentRectVector *vec;
- };
- 
- }
- }
+ +
+ +#endif
diff --cc modules/ocl/src/imgproc.cpp

index d703a61,ef48b8e..4b8fe58
--- 1/modules/ocl/src/imgproc.cpp
--- 2/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@@ -270,31 -269,30 +270,31 @@@ namespace c
               size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
               size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
   
- 
+             float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
- -            vector< pair<size_t, const void *> > args;
+ +            std::vector< std::pair<size_t, const void *> > args;
               if(map1.channels() == 2)
               {
- -                args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
- -                args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
- -                args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
- -                args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
- -                
+ +                args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
+ +                args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
+ +                args.push_back( std::make_pair(sizeof(cl_mem), (void *)&map1.data));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.offset));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1.offset));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.step));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1.step));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.cols));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.rows));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1.cols));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1.rows));
+ +                args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
+ +                float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
+ +
-                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
                   {
- -                    args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
+ +                    args.push_back( std::make_pair(sizeof(cl_double4), (void *)&borderValue));
                   }
                   else
                   {
@@@ -326,8 -324,7 +326,7 @@@
                   }
                   else
                   {
-                     float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
- -                    args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
+ +                    args.push_back( std::make_pair(sizeof(cl_float4), (void *)&borderFloat));
                   }
               }
               openCLExecuteKernel(clCxt, &imgproc_remap, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
@@@ -1210,30 -1205,41 +1209,41 @@@
           void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
                             double k, int borderType)
           {
+             oclMat dx, dy;
+             cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
+         }
+ 
+         void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
+                           double k, int borderType)
+         {
               if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
               {
- -                CV_Error(CV_GpuNotSupported, "select device don't support double");
+ +                CV_Error(Error::GpuNotSupported, "select device don't support double");
               }
               CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
-             oclMat Dx, Dy;
               CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-             extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
+             extractCovData(src, dx, dy, blockSize, ksize, borderType);
               dst.create(src.size(), CV_32F);
-             corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), Dx, Dy, dst, borderType);
+             corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
           }
   
           void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
           {
- -        
+             oclMat dx, dy;
+             cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
+         }
++
+         void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
+         {
               if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
               {
- -                CV_Error(CV_GpuNotSupported, "select device don't support double");
+ +                CV_Error(Error::GpuNotSupported, "select device don't support double");
               }
               CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
-             oclMat Dx, Dy;
               CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-             extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
+             extractCovData(src, dx, dy, blockSize, ksize, borderType);
               dst.create(src.size(), CV_32F);
-             corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, Dx, Dy, dst, borderType);
+             corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
           }
           /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
           static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
diff --cc modules/ocl/src/mcwutil.cpp
Simple merge
diff --cc modules/ocl/src/opencl/haarobjectdetect.cl

index 833fb3c,4873298..22a7fe7
--- 1/modules/ocl/src/opencl/haarobjectdetect.cl
--- 2/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@@@ -45,20 -46,9 +46,9 @@@
   typedef int   sumtype;
   typedef float sqsumtype;
   
- typedef struct  __attribute__((aligned (128)))  GpuHidHaarFeature
- {
-     struct __attribute__((aligned (32)))
- {
-     int p0 __attribute__((aligned (4)));
-     int p1 __attribute__((aligned (4)));
-     int p2 __attribute__((aligned (4)));
-     int p3 __attribute__((aligned (4)));
-     float weight __attribute__((aligned (4)));
- }
- rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
- }
- GpuHidHaarFeature;
- 
- -#ifndef STUMP_BASED 
++#ifndef STUMP_BASED
+ #define STUMP_BASED 1
+ #endif
   
   typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
   {
@@@ -301,12 -317,14 +317,14 @@@ __kernel void __attribute__((reqd_work_
   
                       if(lcl_compute_win_id < queuecount)
                       {
- 
                           int tempnodecounter = lcl_compute_id;
                           float part_sum = 0.f;
-                         for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
+                         const int stump_factor = STUMP_BASED ? 1 : 2;
+                         int root_offset = 0;
+                         for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
                           {
-                             __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
- -                            __global GpuHidHaarTreeNode* currentnodeptr = 
++                            __global GpuHidHaarTreeNode* currentnodeptr =
+                                 nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;
   
                               int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                               int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
@@@ -377,155 -421,5 +421,3 @@@
           }//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
       }//end for(int scalei = 0; scalei <loopcount; scalei++)
   }
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- 
- /*
- if(stagecascade->two_rects)
- {
-     #pragma unroll
-     for( n = 0; n < stagecascade->count; n++ )
-     {
-         t1 = *(node + counter);
-         t = t1.threshold * variance_norm_factor;
-         classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
- 
-         classsum  += calc_sum1(t1, p_offset,1) * t1.weight[1];
-         stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
- 
-         counter++;
-     }
- }
- else
- {
-     #pragma unroll
-     for( n = 0; n < stagecascade->count; n++ )
-     {
-         t = node[counter].threshold*variance_norm_factor;
-         classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
-         classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
- 
-         if( node[counter].p0[2] )
-             classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
- 
-         stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
- 
-         counter++;
-     }
- }
- */
- /*
- __kernel void gpuRunHaarClassifierCascade_ScaleWindow(
-                           constant GpuHidHaarClassifierCascade * _cascade,
-                           global GpuHidHaarStageClassifier * stagecascadeptr,
-                           //global GpuHidHaarClassifier * classifierptr,
-                           global GpuHidHaarTreeNode * nodeptr,
-                           global int * sum,
-                           global float * sqsum,
-                           global int * _candidate,
-                           int pixel_step,
-                           int cols,
-                           int rows,
-                           int start_stage,
-                           int end_stage,
-                           //int counts,
-                           int nodenum,
-                           int ystep,
-                           int detect_width,
-                           //int detect_height,
-                           int loopcount,
-                           int outputstep)
-                           //float scalefactor)
- {
- unsigned int x1 = get_global_id(0);
- unsigned int y1 = get_global_id(1);
- int p_offset;
- int m, n;
- int result;
- int counter;
- float mean, variance_norm_factor;
- for(int i=0;i<loopcount;i++)
- {
- constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
- global int * candidate = _candidate + i*outputstep;
- int window_width = cascade->p1 - cascade->p0;
- int window_height = window_width;
- result = 1;
- counter = 0;
- unsigned int x = mul24(x1,ystep);
- unsigned int y = mul24(y1,ystep);
- if((x < cols - window_width - 1) && (y < rows - window_height -1))
- {
- global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
- //global GpuHidHaarClassifier      *classifier   = classifierptr;
- global GpuHidHaarTreeNode        *node         = nodeptr + nodenum*i;
- 
- p_offset = mad24(y, pixel_step, x);// modify
- 
- mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
-     *(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
-     *cascade->inv_window_area;
- 
- variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
-                     *(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
- variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
- variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
- 
- // if( cascade->is_stump_based )
- //{
- for( m = start_stage; m < end_stage; m++ )
- {
- float stage_sum = 0.f;
- float t,  classsum;
- GpuHidHaarTreeNode t1;
- 
- //#pragma unroll
- for( n = 0; n < stagecascade->count; n++ )
- {
-      t1 = *(node + counter);
-      t  = t1.threshold * variance_norm_factor;
-      classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1];
- 
-      if((t1.p0[2]) && (!stagecascade->two_rects))
-          classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2];
- 
-      stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
-      counter++;
- }
- 
- if (stage_sum < stagecascade->threshold)
- {
-     result = 0;
-     break;
- }
--
- stagecascade++;
--
- }
- if(result)
- {
-     candidate[4 * (y1 * detect_width + x1)]     = x;
-     candidate[4 * (y1 * detect_width + x1) + 1] = y;
-     candidate[4 * (y1 * detect_width + x1)+2]     = window_width;
-     candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
- }
- //}
- }
- }
- }
- */
diff --cc modules/ocl/src/precomp.hpp

index 76d1551,4f93eac..3b75f30
--- 1/modules/ocl/src/precomp.hpp
--- 2/modules/ocl/src/precomp.hpp
+++ b/modules/ocl/src/precomp.hpp
@@@ -61,14 -65,14 +61,16 @@@
   #include <exception>
   #include <stdio.h>
   
- -#include "opencv2/imgproc/imgproc.hpp"
- -#include "opencv2/imgproc/imgproc_c.h"
- -#include "opencv2/core/core_c.h"
- -#include "opencv2/objdetect/objdetect.hpp"
- -#include "opencv2/ocl/ocl.hpp"
++#undef OPENCV_NOSTL
++
+ +#include "opencv2/imgproc.hpp"
+ +#include "opencv2/objdetect.hpp"
+ +#include "opencv2/ocl.hpp"
   
- -#include "opencv2/core/internal.hpp"
- -//#include "opencv2/highgui/highgui.hpp"
+ +#include "opencv2/core/utility.hpp"
+ +#include "opencv2/core/private.hpp"
+ +
+ +//#include "opencv2/highgui.hpp"
   
   #define __ATI__
   
diff --cc modules/ocl/src/pyrlk.cpp

index feb84c7,a3e65dd..e95729c
--- 1/modules/ocl/src/pyrlk.cpp
--- 2/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@@@ -15,8 -15,8 +15,8 @@@
   // Third party copyrights are property of their respective owners.
   //
   // @Authors
- //            Dachuan Zhao, dachuan@multicorewareinc.com
- //            Yao Wang, yao@multicorewareinc.com
+ //      Dachuan Zhao, dachuan@multicorewareinc.com
- -//      Yao Wang, bitwangyaoyao@gmail.com
++//      Yao Wang, yao@multicorewareinc.com
   //      Nathan, liujun@multicorewareinc.com
   //
   // Redistribution and use in source and binary forms, with or without modification,
@@@ -54,17 -56,10 +54,14 @@@ namespace c
   {
   namespace ocl
   {
- ///////////////////////////OpenCL kernel strings///////////////////////////
   extern const char *pyrlk;
   extern const char *pyrlk_no_image;
- extern const char *arithm_mul;
+ +extern const char *operator_setTo;
+ +extern const char *operator_convertTo;
+ +extern const char *operator_copyToM;
+ +extern const char *pyr_down;
   }
   }
- 
   struct dim3
   {
       unsigned int x, y, z;
@@@ -102,472 -85,10 +87,383 @@@ static void calcPatchSize(cv::Size winS
   
       block.z = patch.z = 1;
   }
- }
- 
- inline int divUp(int total, int grain)
- {
-     return (total + grain - 1) / grain;
- }
   
- // static void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask)
- // {
- //     if (mask.empty())
- //     {
- //         copyTo(src, mat);
- //     }
- //     else
- //     {
- //         mat.create(src.size(), src.type());
- //         copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask");
- //     }
- // }
- 
- static void arithmetic_run(const oclMat &src1, oclMat &dst, String kernelName, const char **kernelString, void *_scalar)
- {
-     if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-     {
-         CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
-         return;
-     }
- 
-     //dst.create(src1.size(), src1.type());
-     //CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
-     //          src1.rows == src2.rows && src2.rows == dst.rows);
-     CV_Assert(src1.cols == dst.cols &&
-               src1.rows == dst.rows);
- 
-     CV_Assert(src1.type() == dst.type());
-     CV_Assert(src1.depth() != CV_8S);
- 
-     Context  *clCxt = src1.clCxt;
-     //int channels = dst.channels();
-     //int depth = dst.depth();
- 
-     //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
-     //    {4, 0, 4, 4, 1, 1, 1},
-     //    {4, 0, 4, 4, 1, 1, 1},
-     //    {4, 0, 4, 4, 1, 1, 1}
-     //};
- 
-     //size_t vector_length = vector_lengths[channels-1][depth];
-     //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-     //int cols = divUp(dst.cols * channels + offset_cols, vector_length);
- 
-     size_t localThreads[3]  = { 16, 16, 1 };
-     //size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
-     //                               divUp(dst.rows, localThreads[1]) * localThreads[1],
-     //                               1
-     //                             };
-     size_t globalThreads[3] = { src1.cols,
-                                 src1.rows,
-                                 1
-                               };
- 
-     int dst_step1 = dst.cols * dst.elemSize();
-     std::vector<std::pair<size_t , const void *> > args;
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
-     //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-     //args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-     //args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
-     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
-     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
- 
-     //if(_scalar != NULL)
-     //{
-     float scalar1 = *((float *)_scalar);
-     args.push_back( std::make_pair( sizeof(float), (float *)&scalar1 ));
-     //}
- 
-     openCLExecuteKernel2(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, src1.depth(), CLFLUSH);
- }
- 
- static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar)
- {
-     arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar));
- }
- 
+ +///////////////////////////////////////////////////////////////////////////
+ +//////////////////////////////// ConvertTo ////////////////////////////////
+ +///////////////////////////////////////////////////////////////////////////
+ +static void convert_run_cus(const oclMat &src, oclMat &dst, double alpha, double beta) // Launches the "convert_to_S<src depth>" kernel from operator_convertTo, reading src and writing dst.
+ +{
+ +    String kernelName = "convert_to_S";
+ +    std::stringstream idxStr;
+ +    idxStr << src.depth();
+ +    kernelName = kernelName + idxStr.str().c_str(); // kernel name is suffixed with the numeric source depth
+ +    float alpha_f = (float)alpha, beta_f = (float)beta; // passed to the kernel as cl_float, so the doubles are narrowed here
+ +    CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
+ +    std::vector<std::pair<size_t , const void *> > args; // (size, pointer-to-value) pairs; the pointees are locals/members that live until the launch below
+ +    size_t localThreads[3] = {16, 16, 1};
+ +    size_t globalThreads[3];
+ +    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; // round up to a whole number of work-groups in x
+ +    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; // same rounding in y
+ +    globalThreads[2] = 1;
+ +    int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); // step/offset divided by the element size
+ +    int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
+ +    if(dst.type() == CV_8UC1)
+ +    {
+ +        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; // x extent derived from cols / 4 -- presumably the CV_8UC1 kernel handles 4 pixels per work-item; TODO confirm
+ +    }
+ +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
+ +    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&alpha_f ));
+ +    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&beta_f ));
+ +    openCLExecuteKernel2(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
+ +                         localThreads, args, dst.oclchannels(), dst.depth(), CLFLUSH); // launched with the destination's channel count and depth
+ +}
+ +void convertTo( const oclMat &src, oclMat &m, int rtype, double alpha = 1, double beta = 0 ); // prototype carrying the default alpha/beta for the definition below
+ +void convertTo( const oclMat &src, oclMat &dst, int rtype, double alpha, double beta ) // Converts src into dst with depth taken from rtype (negative rtype keeps src's type), forwarding alpha/beta to the kernel.
+ +{
+ +    //cout << "cv::ocl::oclMat::convertTo()" << endl;
+ +
+ +    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
+ +                   && fabs(beta) < std::numeric_limits<double>::epsilon(); // true when alpha is 1 and beta is 0 within machine epsilon
+ +
+ +    if( rtype < 0 )
+ +        rtype = src.type();
+ +    else
+ +        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.oclchannels()); // only the depth of rtype is honoured; the channel count always comes from src
+ +
+ +    int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype);
+ +    if( sdepth == ddepth && noScale )
+ +    {
+ +        src.copyTo(dst); // same depth and no scaling: a plain copy, no kernel launch
+ +        return;
+ +    }
+ +
+ +    oclMat temp;
+ +    const oclMat *psrc = &src;
+ +    if( sdepth != ddepth && psrc == &dst )
+ +        psrc = &(temp = src); // src and dst are the same object: read through temp because dst is re-created below
+ +
+ +    dst.create( src.size(), rtype );
+ +    convert_run_cus(*psrc, dst, alpha, beta);
+ +}
+ +
+ +///////////////////////////////////////////////////////////////////////////
+ +//////////////////////////////// setTo ////////////////////////////////////
+ +///////////////////////////////////////////////////////////////////////////
+ +//oclMat &operator = (const Scalar &s)
+ +//{
+ +//    //cout << "cv::ocl::oclMat::=" << endl;
+ +//    setTo(s);
+ +//    return *this;
+ +//}
+ +static void set_to_withoutmask_run_cus(const oclMat &dst, const Scalar &scalar, String kernelName) // Fills dst with scalar (saturated to dst's depth) via kernelName from operator_setTo, or clEnqueueFillBuffer when available and applicable.
+ +{
+ +    std::vector<std::pair<size_t , const void *> > args; // holds POINTERS to argument values: every pointee must stay alive until the launch/fill at the end
+ +
+ +    size_t localThreads[3] = {16, 16, 1};
+ +    size_t globalThreads[3];
+ +    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; // round up to whole work-groups in x
+ +    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; // and in y
+ +    globalThreads[2] = 1;
+ +    int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
+ +    if(dst.type() == CV_8UC1)
+ +    {
+ +        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; // x extent derived from cols / 4 for the CV_8UC1 kernel
+ +    }
+ +    char compile_option[32]; // longest option written below is "-D GENTYPE=double4" (18 chars + NUL)
+ +    union sc
+ +    {
+ +        cl_uchar4 uval;
+ +        cl_char4  cval;
+ +        cl_ushort4 usval;
+ +        cl_short4 shval;
+ +        cl_int4 ival;
+ +        cl_float4 fval;
+ +        cl_double4 dval;
+ +    } val; // the scalar converted to dst's depth; only the member matching dst.depth() is written
+ +    cl_int2 i2val; // function scope on purpose: its address is stored in args and read after the switch (it used to be declared inside the inner switch and dangled)
+ +    switch(dst.depth())
+ +    {
+ +    case 0: // depth code 0: uchar lanes
+ +        val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
+ +        val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
+ +        val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
+ +        val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
+ +        switch(dst.oclchannels())
+ +        {
+ +        case 1:
+ +            sprintf(compile_option, "-D GENTYPE=uchar");
+ +            args.push_back( std::make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
+ +            break;
+ +        case 4:
+ +            sprintf(compile_option, "-D GENTYPE=uchar4");
+ +            args.push_back( std::make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
+ +            break;
+ +        default:
+ +            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
+ +        }
+ +        break;
+ +    case 1: // depth code 1: char lanes
+ +        val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
+ +        val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
+ +        val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
+ +        val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
+ +        switch(dst.oclchannels())
+ +        {
+ +        case 1:
+ +            sprintf(compile_option, "-D GENTYPE=char");
+ +            args.push_back( std::make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
+ +            break;
+ +        case 4:
+ +            sprintf(compile_option, "-D GENTYPE=char4");
+ +            args.push_back( std::make_pair( sizeof(cl_char4) , (void *)&val.cval ));
+ +            break;
+ +        default:
+ +            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
+ +        }
+ +        break;
+ +    case 2: // depth code 2: ushort lanes
+ +        val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
+ +        val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
+ +        val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
+ +        val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
+ +        switch(dst.oclchannels())
+ +        {
+ +        case 1:
+ +            sprintf(compile_option, "-D GENTYPE=ushort");
+ +            args.push_back( std::make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
+ +            break;
+ +        case 4:
+ +            sprintf(compile_option, "-D GENTYPE=ushort4");
+ +            args.push_back( std::make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
+ +            break;
+ +        default:
+ +            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
+ +        }
+ +        break;
+ +    case 3: // depth code 3: short lanes
+ +        val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
+ +        val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
+ +        val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
+ +        val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
+ +        switch(dst.oclchannels())
+ +        {
+ +        case 1:
+ +            sprintf(compile_option, "-D GENTYPE=short");
+ +            args.push_back( std::make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
+ +            break;
+ +        case 4:
+ +            sprintf(compile_option, "-D GENTYPE=short4");
+ +            args.push_back( std::make_pair( sizeof(cl_short4) , (void *)&val.shval ));
+ +            break;
+ +        default:
+ +            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
+ +        }
+ +        break;
+ +    case 4: // depth code 4: int lanes (the only depth that also accepts 2 channels)
+ +        val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
+ +        val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
+ +        val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
+ +        val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
+ +        switch(dst.oclchannels())
+ +        {
+ +        case 1:
+ +            sprintf(compile_option, "-D GENTYPE=int");
+ +            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
+ +            break;
+ +        case 2:
+ +            sprintf(compile_option, "-D GENTYPE=int2");
+ +            i2val.s[0] = val.ival.s[0];
+ +            i2val.s[1] = val.ival.s[1];
+ +            args.push_back( std::make_pair( sizeof(cl_int2) , (void *)&i2val )); // i2val is declared at function scope, so this pointer stays valid
+ +            break;
+ +        case 4:
+ +            sprintf(compile_option, "-D GENTYPE=int4");
+ +            args.push_back( std::make_pair( sizeof(cl_int4) , (void *)&val.ival ));
+ +            break;
+ +        default:
+ +            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
+ +        }
+ +        break;
+ +    case 5: // depth code 5: float lanes (plain cast, no saturation)
+ +        val.fval.s[0] = (float)scalar.val[0];
+ +        val.fval.s[1] = (float)scalar.val[1];
+ +        val.fval.s[2] = (float)scalar.val[2];
+ +        val.fval.s[3] = (float)scalar.val[3];
+ +        switch(dst.oclchannels())
+ +        {
+ +        case 1:
+ +            sprintf(compile_option, "-D GENTYPE=float");
+ +            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
+ +            break;
+ +        case 4:
+ +            sprintf(compile_option, "-D GENTYPE=float4");
+ +            args.push_back( std::make_pair( sizeof(cl_float4) , (void *)&val.fval ));
+ +            break;
+ +        default:
+ +            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
+ +        }
+ +        break;
+ +    case 6: // depth code 6: double lanes
+ +        val.dval.s[0] = scalar.val[0];
+ +        val.dval.s[1] = scalar.val[1];
+ +        val.dval.s[2] = scalar.val[2];
+ +        val.dval.s[3] = scalar.val[3];
+ +        switch(dst.oclchannels())
+ +        {
+ +        case 1:
+ +            sprintf(compile_option, "-D GENTYPE=double");
+ +            args.push_back( std::make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
+ +            break;
+ +        case 4:
+ +            sprintf(compile_option, "-D GENTYPE=double4");
+ +            args.push_back( std::make_pair( sizeof(cl_double4) , (void *)&val.dval ));
+ +            break;
+ +        default:
+ +            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
+ +        }
+ +        break;
+ +    default:
+ +        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
+ +    }
+ +#ifdef CL_VERSION_1_2
+ +    if(dst.offset == 0 && dst.cols == dst.wholecols) // zero offset and full width: fill the buffer directly instead of launching a kernel
+ +    {
+ +        clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL); // args[0] is the scalar pushed in the switch above, used as the fill pattern
+ +    }
+ +    else
+ +    {
+ +        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
+ +        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
+ +        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
+ +        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
+ +        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
+ +        openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads,
+ +                             localThreads, args, -1, -1, compile_option, CLFLUSH);
+ +    }
+ +#else
+ +    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
+ +    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
+ +    openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads,
+ +                         localThreads, args, -1, -1, compile_option, CLFLUSH); // GENTYPE is supplied to the kernel build through compile_option
+ +#endif
+ +}
+ +
+ +static oclMat &setTo(oclMat &src, const Scalar &scalar) // Sets every element of src to scalar (no mask) and returns src.
+ +{
+ +    CV_Assert( src.depth() >= 0 && src.depth() <= 6 ); // the seven depth codes handled by set_to_withoutmask_run_cus
+ +    CV_DbgAssert( !src.empty());
+ +
+ +    if(src.type() == CV_8UC1)
+ +    {
+ +        set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask_C1_D0"); // CV_8UC1 uses its own dedicated kernel
+ +    }
+ +    else
+ +    {
+ +        set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask"); // every other type shares the GENTYPE-parameterised kernel
+ +    }
+ +
+ +    return src;
+ +}
+ +
+ +///////////////////////////////////////////////////////////////////////////
+ +////////////////////////////////// CopyTo /////////////////////////////////
+ +///////////////////////////////////////////////////////////////////////////
+ +// static void copy_to_with_mask_cus(const oclMat &src, oclMat &dst, const oclMat &mask, String kernelName)
+ +// {
+ +//     CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
+ +//                   src.rows == dst.rows && src.cols == dst.cols
+ +//                   && mask.type() == CV_8UC1);
+ +
+ +//     std::vector<std::pair<size_t , const void *> > args;
+ +
+ +//     String string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
+ +//         {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
+ +//         {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
+ +//         {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
+ +//     };
+ +//     char compile_option[32];
+ +//     sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
+ +//     size_t localThreads[3] = {16, 16, 1};
+ +//     size_t globalThreads[3];
+ +
+ +//     globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
+ +//     globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
+ +//     globalThreads[2] = 1;
+ +
+ +//     int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
+ +//     int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
+ +
+ +//     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
+ +//     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
+ +//     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
+ +//     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+ +//     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
+ +//     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
+ +//     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
+ +//     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
+ +//     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
+ +//     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
+ +//     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
+ +
+ +//     openCLExecuteKernel2(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
+ +//                          localThreads, args, -1, -1, compile_option, CLFLUSH);
+ +// }
+ +
+ +static void copyTo(const oclMat &src, oclMat &m ) // Makes m a same-size, same-type copy of src using a 2D buffer copy.
+ +{
+ +    CV_DbgAssert(!src.empty());
+ +    m.create(src.size(), src.type()); // (re)allocate the destination to match src before copying
+ +    openCLCopyBuffer2D(src.clCxt, m.data, m.step, m.offset,
+ +                       src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset); // row width is passed as cols * elemSize(), each side with its own step and offset
+ +}
+ +
+ +static void pyrdown_run_cus(const oclMat &src, const oclMat &dst) // Launches the "pyrDown" kernel from pyr_down on src, writing into the already-allocated dst.
+ +{
+ +
+ +    CV_Assert(src.type() == dst.type());
+ +    CV_Assert(src.depth() != CV_8S);
+ +
+ +    Context  *clCxt = src.clCxt;
+ +
+ +    String kernelName = "pyrDown";
+ +
+ +    size_t localThreads[3]  = { 256, 1, 1 };
+ +    size_t globalThreads[3] = { src.cols, dst.rows, 1}; // x spans the SOURCE width, y spans the DESTINATION height
+ +
+ +    std::vector<std::pair<size_t , const void *> > args; // (size, pointer-to-value) pairs consumed by the launch below
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols));
+ +
+ +    openCLExecuteKernel2(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth(), CLFLUSH); // launched with the source's channel count and depth
+ +}
+ +
+ +static void pyrDown_cus(const oclMat &src, oclMat &dst) // Allocates dst at half of src's size (rounded up) and runs the pyrDown kernel into it.
+ +{
+ +    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+ +
+ +    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type()); // (n + 1) / 2 rounds odd dimensions up
+ +
+ +    pyrdown_run_cus(src, dst);
+ +}
+ +
   static void lkSparse_run(oclMat &I, oclMat &J,
- -                         const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount,
- -                         int level, dim3 patch, Size winSize, int iters)
+ +                  const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount,
+ +                  int level, /*dim3 block, */dim3 patch, Size winSize, int iters)
   {
       Context  *clCxt = I.clCxt;
       int elemCntPerRow = I.step / I.elemSize();
@@@ -576,45 -97,38 +472,46 @@@
       size_t localThreads[3]  = { 8, isImageSupported ? 8 : 32, 1 };
       size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
       int cn = I.oclchannels();
-     char calcErr;
-     if (level == 0)
-     {
-         calcErr = 1;
-     }
-     else
-     {
-         calcErr = 0;
-     }
+     char calcErr = level==0?1:0;
   
- -    vector<pair<size_t , const void *> > args;
+ +    std::vector<std::pair<size_t , const void *> > args;
   
       cl_mem ITex = isImageSupported ? bindTexture(I) : (cl_mem)I.data;
       cl_mem JTex = isImageSupported ? bindTexture(J) : (cl_mem)J.data;
   
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&JTex ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&prevPts.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&prevPts.step ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&nextPts.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&nextPts.step ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&status.data ));
- -    args.push_back( make_pair( sizeof(cl_mem), (void *)&err.data ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&level ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ITex ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&JTex ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&prevPts.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&prevPts.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nextPts.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nextPts.step ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&status.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&err.data ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&level ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.rows ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.cols ));
       if (!isImageSupported)
- -        args.push_back( make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&patch.y ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&cn ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.width ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.height ));
- -    args.push_back( make_pair( sizeof(cl_int), (void *)&iters ));
- -    args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr ));
- -
- -    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
- -    if (is_cpu)
+ +        args.push_back( std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.x ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.y ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cn ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.width ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.height ));
+ +    args.push_back( std::make_pair( sizeof(cl_int), (void *)&iters ));
+ +    args.push_back( std::make_pair( sizeof(cl_char), (void *)&calcErr ));
+ +
+ +    if(isImageSupported)
       {
-         openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
- -        openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU");
++        std::stringstream idxStr;
++        idxStr << kernelName.c_str() << "_C" << I.oclchannels() << "_D" << I.depth();
++        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str().c_str());
++        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
++        openCLSafeCall(clReleaseKernel(kernel));
++
++        static char opt[32] = {0}; // " -D WAVE_SIZE=" is 14 chars, so the old 16-byte buffer overflowed for any wave size >= 10 (digits + NUL)
++        sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
++
++        openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), opt, CLFLUSH);
           releaseTexture(ITex);
           releaseTexture(JTex);
       }
@@@ -656,13 -189,10 +553,11 @@@ void cv::ocl::PyrLKOpticalFlow::sparse(
   
       oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
       oclMat temp2 = nextPts.reshape(1);
-     //oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f));
-     multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f);
-     //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
+     multiply(1.0f/(1<<maxLevel)/2.0f, temp1, temp2);
   
       ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
- -    status.setTo(Scalar::all(1));
+ +    //status.setTo(Scalar::all(1));
+ +    setTo(status, Scalar::all(1));
   
       bool errMat = false;
       if (!err)
@@@ -672,10 -202,8 +567,9 @@@
       }
       else
           ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
+ +    //ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err);
   
       // build the image pyramids.
- 
       prevPyr_.resize(maxLevel + 1);
       nextPyr_.resize(maxLevel + 1);
   
diff --cc modules/ocl/src/safe_call.hpp
Simple merge
diff --cc modules/ocl/test/test_haar.cpp
Simple merge
diff --cc modules/photo/src/denoising.cpp
Simple merge
diff --cc modules/photo/src/fast_nlmeans_denoising_invoker.hpp
Simple merge
diff --cc modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
Simple merge
diff --cc modules/photo/src/precomp.hpp

index 60cc99b,3185a18..38ac3ff
--- 1/modules/photo/src/precomp.hpp
--- 2/modules/photo/src/precomp.hpp
+++ b/modules/photo/src/precomp.hpp
@@@ -43,8 -43,11 +43,9 @@@
   #ifndef __OPENCV_PRECOMP_H__
   #define __OPENCV_PRECOMP_H__
   
- #include "opencv2/photo.hpp"
- -#ifdef HAVE_CVCONFIG_H
- -#include "cvconfig.h"
- -#endif
- -
- -#include "opencv2/photo/photo.hpp"
+ +#include "opencv2/core/private.hpp"
++#include "opencv2/core/utility.hpp"
++#include "opencv2/photo.hpp"
   
   #ifdef HAVE_TEGRA_OPTIMIZATION
   #include "opencv2/photo/photo_tegra.hpp"
diff --cc modules/stitching/src/matchers.cpp
Simple merge
diff --cc modules/stitching/src/precomp.hpp

index 9e056bc,4849ace..b0fa2b1
--- 1/modules/stitching/src/precomp.hpp
--- 2/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@@ -52,8 -55,9 +52,9 @@@
   #include <functional>
   #include <sstream>
   #include <cmath>
- -#include "opencv2/core/core.hpp"
- -#include "opencv2/core/internal.hpp"
- -#include "opencv2/stitching/stitcher.hpp"
+ +#include "opencv2/core.hpp"
++#include "opencv2/core/utility.hpp"
+ +#include "opencv2/stitching.hpp"
   #include "opencv2/stitching/detail/autocalib.hpp"
   #include "opencv2/stitching/detail/blenders.hpp"
   #include "opencv2/stitching/detail/camera.hpp"
diff --cc modules/video/src/bgfg_gaussmix2.cpp

index 35f5d06,6bbb960..2c99c01
--- 1/modules/video/src/bgfg_gaussmix2.cpp
--- 2/modules/video/src/bgfg_gaussmix2.cpp
+++ b/modules/video/src/bgfg_gaussmix2.cpp
@@@ -702,17 -564,16 +702,17 @@@ void BackgroundSubtractorMOG2Impl::appl
   
       parallel_for_(Range(0, image.rows),
                     MOG2Invoker(image, fgmask,
-                              (GMM*)bgmodel.data,
-                              (float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
-                              bgmodelUsedModes.data, nmixtures, (float)learningRate,
-                              (float)varThreshold,
-                              backgroundRatio, varThresholdGen,
-                              fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
-                              bShadowDetection, nShadowDetection),
-                   image.total()/(double)(1 << 16));
+                               (GMM*)bgmodel.data,
+                               (float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
+                               bgmodelUsedModes.data, nmixtures, (float)learningRate,
+                               (float)varThreshold,
+                               backgroundRatio, varThresholdGen,
+                               fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
- -                              bShadowDetection, nShadowDetection));
++                              bShadowDetection, nShadowDetection),
++                              image.total()/(double)(1 << 16));
   }
   
- -void BackgroundSubtractorMOG2::getBackgroundImage(OutputArray backgroundImage) const
+ +void BackgroundSubtractorMOG2Impl::getBackgroundImage(OutputArray backgroundImage) const
   {
       int nchannels = CV_MAT_CN(frameType);
       CV_Assert( nchannels == 3 );
diff --cc modules/video/src/lkpyramid.cpp

index e465172,291cb86..155737b
--- 1/modules/video/src/lkpyramid.cpp
--- 2/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@@ -159,15 -156,7 +159,15 @@@ cv::detail::LKTrackerInvoker::LKTracker
       minEigThreshold = _minEigThreshold;
   }
   
- void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
+ +#if defined __arm__ && !CV_NEON
+ +typedef int64 acctype;
+ +typedef int itemtype;
+ +#else
+ +typedef float acctype;
+ +typedef float itemtype;
+ +#endif
+ +
+ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
   {
       Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
       const Mat& I = *prevImg;
diff --cc platforms/scripts/camera_build.conf
Simple merge
diff --cc platforms/scripts/cmake_android_all_cameras.py
Simple merge
diff --cc samples/ocl/facedetect.cpp

index dc2f8b2,684c2d9..5ffed2e
--- 1/samples/ocl/facedetect.cpp
--- 2/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@@@ -10,34 -5,48 +8,54 @@@
   #include <iostream>
   #include <stdio.h>
   
+ +int main( int, const char** ) { return 0; }
+ +
+ +#if 0
+ +
   using namespace std;
   using namespace cv;
- -#define LOOP_NUM 10 
++#define LOOP_NUM 10
+ 
+ const static Scalar colors[] =  { CV_RGB(0,0,255),
+         CV_RGB(0,128,255),
+         CV_RGB(0,255,255),
+         CV_RGB(0,255,0),
+         CV_RGB(255,128,0),
+         CV_RGB(255,255,0),
+         CV_RGB(255,0,0),
+         CV_RGB(255,0,255)} ;
   
- static void help()
+ int64 work_begin = 0;
+ int64 work_end = 0;
+ 
- -static void workBegin() 
- -{ 
++static void workBegin()
++{
+     work_begin = getTickCount();
+ }
+ static void workEnd()
   {
-     cout << "\nThis program demonstrates the cascade recognizer.\n"
-         "This classifier can recognize many ~rigid objects, it's most known use is for faces.\n"
-         "Usage:\n"
-         "./facedetect [--cascade=<cascade_path> this is the primary trained classifier such as frontal face]\n"
-         "   [--scale=<image scale greater or equal to 1, try 1.3 for example>\n"
-         "   [filename|camera_index]\n\n"
-         "see facedetect.cmd for one call:\n"
-         "./facedetect --cascade=\"../../data/haarcascades/haarcascade_frontalface_alt.xml\" --scale=1.3 \n"
-         "Hit any key to quit.\n"
-         "Using OpenCV version " << CV_VERSION << "\n" << endl;
+     work_end += (getTickCount() - work_begin);
   }
- struct getRect { Rect operator ()(const CvAvgComp& e) const { return e.rect; } };
- void detectAndDraw( Mat& img,
-     cv::ocl::OclCascadeClassifier& cascade, CascadeClassifier& nestedCascade,
-     double scale);
+ +
+ +
- -void detect( Mat& img, vector<Rect>& faces, 
- -    cv::ocl::OclCascadeClassifierBuf& cascade, 
+ static double getTime(){
+     return work_end /((double)cvGetTickFrequency() * 1000.);
+ }
+ 
++void detect( Mat& img, vector<Rect>& faces,
++    cv::ocl::OclCascadeClassifierBuf& cascade,
+     double scale, bool calTime);
   
- string cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml";
- -void detectCPU( Mat& img, vector<Rect>& faces, 
- -    CascadeClassifier& cascade, 
++void detectCPU( Mat& img, vector<Rect>& faces,
++    CascadeClassifier& cascade,
+     double scale, bool calTime);
+ 
+ void Draw(Mat& img, vector<Rect>& faces, double scale);
+ 
+ // This function test if gpu_rst matches cpu_rst.
+ // If the two vectors are not equal, it will return the difference in vector size
+ // Else if will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
+ double checkRectSimilarity(Size sz, std::vector<Rect>& cpu_rst, std::vector<Rect>& gpu_rst);
   
   int main( int argc, const char** argv )
   {
@@@ -82,14 -80,11 +89,11 @@@
           return -1;
       }
   
-     if( inputName.empty() || (isdigit(inputName.c_str()[0]) && inputName.c_str()[1] == '\0') )
+     if( inputName.empty() )
       {
-         capture = cvCaptureFromCAM( inputName.empty() ? 0 : inputName.c_str()[0] - '0' );
-         int c = inputName.empty() ? 0 : inputName.c_str()[0] - '0' ;
-         if(!capture) cout << "Capture from CAM " <<  c << " didn't work" << endl;
+         capture = cvCaptureFromCAM(0);
- -        if(!capture) 
++        if(!capture)
+             cout << "Capture from CAM 0 didn't work" << endl;
       }
       else if( inputName.size() )
       {
@@@ -97,13 -92,17 +101,17 @@@
           if( image.empty() )
           {
               capture = cvCaptureFromAVI( inputName.c_str() );
-             if(!capture) cout << "Capture from AVI didn't work" << endl;
- -            if(!capture) 
++            if(!capture)
+                 cout << "Capture from AVI didn't work" << endl;
+             return -1;
           }
       }
       else
       {
           image = imread( "lena.jpg", 1 );
-         if(image.empty()) cout << "Couldn't read lena.jpg" << endl;
- -        if(image.empty()) 
++        if(image.empty())
+             cout << "Couldn't read lena.jpg" << endl;
+         return -1;
       }
   
       cvNamedWindow( "result", 1 );
@@@ -123,16 -122,21 +131,21 @@@
           for(;;)
           {
               IplImage* iplImg = cvQueryFrame( capture );
- -            frame = iplImg;
+ +            frame = cv::cvarrToMat(iplImg);
+             vector<Rect> faces;
               if( frame.empty() )
                   break;
               if( iplImg->origin == IPL_ORIGIN_TL )
                   frame.copyTo( frameCopy );
               else
                   flip( frame, frameCopy, 0 );
- 
-             detectAndDraw( frameCopy, cascade, nestedCascade, scale );
- 
+             if(useCPU){
+                 detectCPU(frameCopy, faces, cpu_cascade, scale, false);
+             }
+             else{
- -                detect(frameCopy, faces, cascade, scale, false);     
++                detect(frameCopy, faces, cascade, scale, false);
+             }
+             Draw(frameCopy, faces, scale);
               if( waitKey( 10 ) >= 0 )
                   goto _cleanup_;
           }
@@@ -145,42 -149,34 +158,34 @@@ _cleanup_
       else
       {
           cout << "In image read" << endl;
-         if( !image.empty() )
-         {
-             detectAndDraw( image, cascade, nestedCascade, scale );
-             waitKey(0);
-         }
-         else if( !inputName.empty() )
+         vector<Rect> faces;
+         vector<Rect> ref_rst;
+         double accuracy = 0.;
- -        for(int i = 0; i <= LOOP_NUM;i ++) 
++        for(int i = 0; i <= LOOP_NUM;i ++)
           {
-             /* assume it is a text file containing the
-             list of the image filenames to be processed - one per line */
-             FILE* f = fopen( inputName.c_str(), "rt" );
-             if( f )
-             {
-                 char buf[1000+1];
-                 while( fgets( buf, 1000, f ) )
-                 {
-                     int len = (int)strlen(buf), c;
-                     while( len > 0 && isspace(buf[len-1]) )
-                         len--;
-                     buf[len] = '\0';
-                     cout << "file " << buf << endl;
-                     image = imread( buf, 1 );
-                     if( !image.empty() )
-                     {
-                         detectAndDraw( image, cascade, nestedCascade, scale );
-                         c = waitKey(0);
-                         if( c == 27 || c == 'q' || c == 'Q' )
-                             break;
-                     }
-                     else
-                     {
-                         cerr << "Aw snap, couldn't read image " << buf << endl;
-                     }
+             cout << "loop" << i << endl;
+             if(useCPU){
- -                detectCPU(image, faces, cpu_cascade, scale, i==0?false:true);  
++                detectCPU(image, faces, cpu_cascade, scale, i==0?false:true);
+             }
+             else{
+                 detect(image, faces, cascade, scale, i==0?false:true);
+                 if(i == 0){
+                     detectCPU(image, ref_rst, cpu_cascade, scale, false);
+                     accuracy = checkRectSimilarity(image.size(), ref_rst, faces);
- -                }                    
+ +                }
-                 fclose(f);
+             }
+             if (i == LOOP_NUM)
+             {
+                 if (useCPU)
+                     cout << "average CPU time (noCamera) : ";
+                 else
+                     cout << "average GPU time (noCamera) : ";
+                 cout << getTime() / LOOP_NUM << " ms" << endl;
+                 cout << "accuracy value: " << accuracy <<endl;
               }
           }
+         Draw(image, faces, scale);
+         waitKey(0);
       }
   
       cvDestroyWindow("result");
@@@ -188,25 -184,14 +193,14 @@@
       return 0;
   }
   
- void detectAndDraw( Mat& img,
-     cv::ocl::OclCascadeClassifier& cascade, CascadeClassifier&,
-     double scale)
- -void detect( Mat& img, vector<Rect>& faces, 
- -    cv::ocl::OclCascadeClassifierBuf& cascade, 
++void detect( Mat& img, vector<Rect>& faces,
++    cv::ocl::OclCascadeClassifierBuf& cascade,
+     double scale, bool calTime)
   {
-     int i = 0;
-     double t = 0;
-     vector<Rect> faces;
-     const static Scalar colors[] =  { CV_RGB(0,0,255),
-         CV_RGB(0,128,255),
-         CV_RGB(0,255,255),
-         CV_RGB(0,255,0),
-         CV_RGB(255,128,0),
-         CV_RGB(255,255,0),
-         CV_RGB(255,0,0),
-         CV_RGB(255,0,255)} ;
       cv::ocl::oclMat image(img);
       cv::ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
- 
+     if(calTime) workBegin();
- -    cv::ocl::cvtColor( image, gray, CV_BGR2GRAY );
+ +    cv::ocl::cvtColor( image, gray, COLOR_BGR2GRAY );
       cv::ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
       cv::ocl::equalizeHist( smallImg, smallImg );
   
@@@ -217,15 -199,29 +208,29 @@@
           3, 0
           |CV_HAAR_SCALE_IMAGE
           , Size(30,30), Size(0, 0) );
-     vector<CvAvgComp> vecAvgComp;
-     Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-     faces.resize(vecAvgComp.size());
-     std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
-     t = (double)cvGetTickCount() - t;
-     printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
+     if(calTime) workEnd();
+ }
+ 
- -void detectCPU( Mat& img, vector<Rect>& faces, 
- -    CascadeClassifier& cascade, 
++void detectCPU( Mat& img, vector<Rect>& faces,
++    CascadeClassifier& cascade,
+     double scale, bool calTime)
+ {
+     if(calTime) workBegin();
+     Mat cpu_gray, cpu_smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
+     cvtColor(img, cpu_gray, CV_BGR2GRAY);
+     resize(cpu_gray, cpu_smallImg, cpu_smallImg.size(), 0, 0, INTER_LINEAR);
+     equalizeHist(cpu_smallImg, cpu_smallImg);
+     cascade.detectMultiScale(cpu_smallImg, faces, 1.1,
+         3, 0 | CV_HAAR_SCALE_IMAGE,
+         Size(30, 30), Size(0, 0));
- -    if(calTime) workEnd(); 
++    if(calTime) workEnd();
+ }
+ 
+ void Draw(Mat& img, vector<Rect>& faces, double scale)
+ {
+     int i = 0;
       for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
       {
-         Mat smallImgROI;
           Point center;
           Scalar color = colors[i%8];
           int radius;
@@@ -236,4 -232,42 +241,43 @@@
       }
       cv::imshow( "result", img );
   }
- -        {      
+ 
+ double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
+ {
+     double final_test_result = 0.0;
+     size_t sz1 = ob1.size();
+     size_t sz2 = ob2.size();
+ 
+     if(sz1 != sz2)
+         return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+     else
+     {
+         cv::Mat cpu_result(sz, CV_8UC1);
+         cpu_result.setTo(0);
+ 
+         for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
++        {
+             cv::Mat cpu_result_roi(cpu_result, *r);
+             cpu_result_roi.setTo(1);
+             cpu_result.copyTo(cpu_result);
+         }
+         int cpu_area = cv::countNonZero(cpu_result > 0);
+ 
+         cv::Mat gpu_result(sz, CV_8UC1);
+         gpu_result.setTo(0);
+         for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
+         {
+             cv::Mat gpu_result_roi(gpu_result, *r2);
+             gpu_result_roi.setTo(1);
+             gpu_result.copyTo(gpu_result);
+         }
+ 
+         cv::Mat result_;
+         multiply(cpu_result, gpu_result, result_);
+         int result = cv::countNonZero(result_ > 0);
+ 
+         final_test_result = 1.0 - (double)result/(double)cpu_area;
+     }
+     return final_test_result;
+ }
+ +#endif
diff --cc samples/ocl/hog.cpp

index 8a310e4,28be6fa..daff267
--- 1/samples/ocl/hog.cpp
--- 2/samples/ocl/hog.cpp
+++ b/samples/ocl/hog.cpp
@@@ -317,7 -323,18 +324,18 @@@ void App::run(
                   gpu_img.upload(img);
                   gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
                                            Size(0, 0), scale, gr_threshold);
-             }
+                 if (!verify)
+                 {
+                     // verify if GPU output same objects with CPU at 1st run
+                     verify = true;
+                     vector<Rect> ref_rst;
- -                    cvtColor(img, img, CV_BGRA2BGR);
++                    cvtColor(img, img, COLOR_BGRA2BGR);
+                     cpu_hog.detectMultiScale(img, ref_rst, hit_threshold, win_stride,
+                                               Size(0, 0), scale, gr_threshold-2);
+                     double accuracy = checkRectSimilarity(img.size(), ref_rst, found);
+                     cout << "\naccuracy value: " << accuracy << endl;           
+                 } 
+            }
               else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
                                             Size(0, 0), scale, gr_threshold);
               hogWorkEnd();
diff --cc samples/ocl/pyrlk_optical_flow.cpp

index 0000000,1b2b1d3..392d455

mode 000000,100644..100644
--- /dev/null
--- 2/samples/ocl/pyrlk_optical_flow.cpp
+++ b/samples/ocl/pyrlk_optical_flow.cpp
@@@ -1,0 -1,290 +1,287 @@@
- -        "{ h            | help           | false | print help message }"
- -        "{ l            | left           |       | specify left image }"
- -        "{ r            | right          |       | specify right image }"
- -        "{ c            | camera         | 0     | enable camera capturing }"
- -        "{ s            | use_cpu        | false | use cpu or gpu to process the image }"
- -        "{ v            | video          |       | use video as input }"
- -        "{ points       | points         | 1000  | specify points count [GoodFeatureToTrack] }"
- -        "{ min_dist     | min_dist       | 0     | specify minimal distance between points [GoodFeatureToTrack] }";
+ #include <iostream>
+ #include <vector>
+ #include <iomanip>
+ 
++#include "opencv2/core/utility.hpp"
+ #include "opencv2/highgui/highgui.hpp"
+ #include "opencv2/ocl/ocl.hpp"
+ #include "opencv2/video/video.hpp"
+ 
+ using namespace std;
+ using namespace cv;
+ using namespace cv::ocl;
+ 
+ typedef unsigned char uchar;
+ #define LOOP_NUM 10 
+ int64 work_begin = 0;
+ int64 work_end = 0;
+ 
+ static void workBegin() 
+ { 
+     work_begin = getTickCount();
+ }
+ static void workEnd()
+ {
+     work_end += (getTickCount() - work_begin);
+ }
+ static double getTime(){
+     return work_end * 1000. / getTickFrequency();
+ }
+ 
+ static void download(const oclMat& d_mat, vector<Point2f>& vec)
+ {
+     vec.resize(d_mat.cols);
+     Mat mat(1, d_mat.cols, CV_32FC2, (void*)&vec[0]);
+     d_mat.download(mat);
+ }
+ 
+ static void download(const oclMat& d_mat, vector<uchar>& vec)
+ {
+     vec.resize(d_mat.cols);
+     Mat mat(1, d_mat.cols, CV_8UC1, (void*)&vec[0]);
+     d_mat.download(mat);
+ }
+ 
+ static void drawArrows(Mat& frame, const vector<Point2f>& prevPts, const vector<Point2f>& nextPts, const vector<uchar>& status, Scalar line_color = Scalar(0, 0, 255))
+ {
+     for (size_t i = 0; i < prevPts.size(); ++i)
+     {
+         if (status[i])
+         {
+             int line_thickness = 1;
+ 
+             Point p = prevPts[i];
+             Point q = nextPts[i];
+ 
+             double angle = atan2((double) p.y - q.y, (double) p.x - q.x);
+ 
+             double hypotenuse = sqrt( (double)(p.y - q.y)*(p.y - q.y) + (double)(p.x - q.x)*(p.x - q.x) );
+ 
+             if (hypotenuse < 1.0)
+                 continue;
+ 
+             // Here we lengthen the arrow by a factor of three.
+             q.x = (int) (p.x - 3 * hypotenuse * cos(angle));
+             q.y = (int) (p.y - 3 * hypotenuse * sin(angle));
+ 
+             // Now we draw the main line of the arrow.
+             line(frame, p, q, line_color, line_thickness);
+ 
+             // Now draw the tips of the arrow. I do some scaling so that the
+             // tips look proportional to the main line of the arrow.
+ 
+             p.x = (int) (q.x + 9 * cos(angle + CV_PI / 4));
+             p.y = (int) (q.y + 9 * sin(angle + CV_PI / 4));
+             line(frame, p, q, line_color, line_thickness);
+ 
+             p.x = (int) (q.x + 9 * cos(angle - CV_PI / 4));
+             p.y = (int) (q.y + 9 * sin(angle - CV_PI / 4));
+             line(frame, p, q, line_color, line_thickness);
+         }
+     }
+ }
+ 
+ 
+ int main(int argc, const char* argv[])
+ {
+     static std::vector<Info> ocl_info;
+     ocl::getDevice(ocl_info);
+     //if you want to use undefault device, set it here
+     setDevice(ocl_info[0]);
+ 
+     //set this to save kernel compile time from second time you run
+     ocl::setBinpath("./");
+     const char* keys =
- -    if (cmd.get<bool>("help"))
++        "{ help h           | false | print help message }"
++        "{ left l           |       | specify left image }"
++        "{ right r          |       | specify right image }"
++        "{ camera c         | 0     | enable camera capturing }"
++        "{ use_cpu s        | false | use cpu or gpu to process the image }"
++        "{ video v          |       | use video as input }"
++        "{ points           | 1000  | specify points count [GoodFeatureToTrack] }"
++        "{ min_dist         | 0     | specify minimal distance between points [GoodFeatureToTrack] }";
+ 
+     CommandLineParser cmd(argc, argv, keys);
+ 
- -        cout << "Usage: pyrlk_optical_flow [options]" << endl;
- -        cout << "Avaible options:" << endl;
- -        cmd.printParams();
++    if (cmd.has("help"))
+     {
- -    bool useCPU = cmd.get<bool>("s");
- -    bool useCamera = cmd.get<bool>("c");
++        cmd.printMessage();
+         return 0;
+     }
+ 
+     bool defaultPicturesFail = false;
+     string fname0 = cmd.get<string>("left");
+     string fname1 = cmd.get<string>("right");
+     string vdofile = cmd.get<string>("video");
+     int points = cmd.get<int>("points");
+     double minDist = cmd.get<double>("min_dist");
- -        CvCapture* capture = 0;
- -        capture = cvCaptureFromCAM( inputName );
- -        if (!capture)
++    bool useCPU = cmd.has("s");
++    bool useCamera = cmd.has("c");
+     int inputName = cmd.get<int>("c");
+     oclMat d_nextPts, d_status;
+ 
+     Mat frame0 = imread(fname0, cv::IMREAD_GRAYSCALE);
+     Mat frame1 = imread(fname1, cv::IMREAD_GRAYSCALE);
+     PyrLKOpticalFlow d_pyrLK;
+     vector<cv::Point2f> pts;
+     vector<cv::Point2f> nextPts;
+     vector<unsigned char> status;
+     vector<float> err;
+ 
+     if (frame0.empty() || frame1.empty())
+     {
+         useCamera = true;
+         defaultPicturesFail = true;
- -        CvCapture* capture = 0;
++        VideoCapture capture(inputName);
++        if (!capture.isOpened())
+         {
+             cout << "Can't load input images" << endl;
+             return -1;
+         }
+     }
+ 
+     cout << "Points count : " << points << endl << endl;
+ 
+     if (useCamera)
+     {
- -            capture = cvCaptureFromCAM( inputName );
++        VideoCapture capture;
+         Mat frame, frameCopy;
+         Mat frame0Gray, frame1Gray;
+         Mat ptr0, ptr1;
+ 
+         if(vdofile == "")
- -            capture = cvCreateFileCapture(vdofile.c_str());
++            capture.open( inputName );
+         else
- -        if(!capture)
++            capture.open(vdofile.c_str());
+ 
+         int c = inputName ;
- -            frame = cvQueryFrame( capture );
- -            if( frame.empty() )
++        if(!capture.isOpened())
+         {
+             if(vdofile == "")
+                 cout << "Capture from CAM " << c << " didn't work" << endl;
+             else
+                 cout << "Capture from file " << vdofile << " failed" <<endl;
+             if (defaultPicturesFail)
+             {
+                 return -1;
+             }
+             goto nocamera;
+         }
+ 
+         cout << "In capture ..." << endl;
+         for(int i = 0;; i++)
+         {
- -        cvReleaseCapture( &capture );
++            if( !capture.read(frame) )
+                 break;
+ 
+             if (i == 0)
+             {
+                 frame.copyTo( frame0 );
+                 cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+             }
+             else
+             {
+                 if (i%2 == 1)
+                 {
+                     frame.copyTo(frame1);
+                     cvtColor(frame1, frame1Gray, COLOR_BGR2GRAY);
+                     ptr0 = frame0Gray;
+                     ptr1 = frame1Gray;
+                 }
+                 else
+                 {
+                     frame.copyTo(frame0);
+                     cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+                     ptr0 = frame1Gray;
+                     ptr1 = frame0Gray;
+                 }
+ 
+                 pts.clear();
+ 
+                 cv::goodFeaturesToTrack(ptr0, pts, points, 0.01, 0.0);
+ 
+                 if (pts.size() == 0)
+                 {
+                     continue;
+                 }
+ 
+                 if (useCPU)
+                 {
+                     cv::calcOpticalFlowPyrLK(ptr0, ptr1, pts, nextPts, status, err);
+                 }
+                 else
+                 {
+                     oclMat d_prevPts(1, points, CV_32FC2, (void*)&pts[0]);
+ 
+                     d_pyrLK.sparse(oclMat(ptr0), oclMat(ptr1), d_prevPts, d_nextPts, d_status);
+ 
+                     download(d_prevPts, pts);
+                     download(d_nextPts, nextPts);
+                     download(d_status, status);
+ 
+                 }
+                 if (i%2 == 1)
+                     frame1.copyTo(frameCopy);
+                 else
+                     frame0.copyTo(frameCopy);
+                 drawArrows(frameCopy, pts, nextPts, status, Scalar(255, 0, 0));
+                 imshow("PyrLK [Sparse]", frameCopy);
+             }
+ 
+             if( waitKey( 10 ) >= 0 )
+                 goto _cleanup_;
+         }
+ 
+         waitKey(0);
+ 
+ _cleanup_:
++        capture.release();
+     }
+     else
+     {
+ nocamera:
+         for(int i = 0; i <= LOOP_NUM;i ++) 
+         {
+             cout << "loop" << i << endl;
+             if (i > 0) workBegin();
+ 
+             cv::goodFeaturesToTrack(frame0, pts, points, 0.01, minDist);
+ 
+             if (useCPU)
+             {
+                 cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+             }
+             else
+             {
+                 oclMat d_prevPts(1, points, CV_32FC2, (void*)&pts[0]);
+ 
+                 d_pyrLK.sparse(oclMat(frame0), oclMat(frame1), d_prevPts, d_nextPts, d_status);
+ 
+                 download(d_prevPts, pts);
+                 download(d_nextPts, nextPts);
+                 download(d_status, status);
+             }
+ 
+             if (i > 0 && i <= LOOP_NUM)
+                 workEnd();
+ 
+             if (i == LOOP_NUM)
+             {
+                 if (useCPU)
+                     cout << "average CPU time (noCamera) : ";
+                 else
+                     cout << "average GPU time (noCamera) : ";
+ 
+                 cout << getTime() / LOOP_NUM << " ms" << endl;
+ 
+                 drawArrows(frame0, pts, nextPts, status, Scalar(255, 0, 0));
+ 
+                 imshow("PyrLK [Sparse]", frame0);
+             }
+         }
+     }
+ 
+     waitKey();
+ 
+     return 0;
+ }
diff --cc samples/ocl/stereo_match.cpp

index 0000000,7ac2c9a..8cc6530

mode 000000,100644..100644
--- /dev/null
--- 2/samples/ocl/stereo_match.cpp
+++ b/samples/ocl/stereo_match.cpp
@@@ -1,0 -1,419 +1,421 @@@
- -    cvtColor(left_src, left, CV_BGR2GRAY);
- -    cvtColor(right_src, right, CV_BGR2GRAY);
+ #include <iostream>
+ #include <string>
+ #include <sstream>
+ #include <iomanip>
+ #include <stdexcept>
++
++#include "opencv2/core/utility.hpp"
+ #include "opencv2/ocl/ocl.hpp"
+ #include "opencv2/highgui/highgui.hpp"
+ 
+ using namespace cv;
+ using namespace std;
+ using namespace ocl;
+ 
+ bool help_showed = false;
+ 
+ struct Params
+ {
+     Params();
+     static Params read(int argc, char** argv);
+ 
+     string left;
+     string right;
+ 
+     string method_str() const
+     {
+         switch (method)
+         {
+         case BM: return "BM";
+         case BP: return "BP";
+         case CSBP: return "CSBP";
+         }
+         return "";
+     }
+     enum {BM, BP, CSBP} method;
+     int ndisp; // Max disparity + 1
+     enum {GPU, CPU} type;
+ };
+ 
+ 
+ struct App
+ {
+     App(const Params& p);
+     void run();
+     void handleKey(char key);
+     void printParams() const;
+ 
+     void workBegin() { work_begin = getTickCount(); }
+     void workEnd()
+     {
+         int64 d = getTickCount() - work_begin;
+         double f = getTickFrequency();
+         work_fps = f / d;
+     }
+ 
+     string text() const
+     {
+         stringstream ss;
+         ss << "(" << p.method_str() << ") FPS: " << setiosflags(ios::left)
+             << setprecision(4) << work_fps;
+         return ss.str();
+     }
+ private:
+     Params p;
+     bool running;
+ 
+     Mat left_src, right_src;
+     Mat left, right;
+     oclMat d_left, d_right;
+ 
+     StereoBM_OCL bm;
+     StereoBeliefPropagation bp;
+     StereoConstantSpaceBP csbp;
+ 
+     int64 work_begin;
+     double work_fps;
+ };
+ 
+ static void printHelp()
+ {
+     cout << "Usage: stereo_match_gpu\n"
+         << "\t--left <left_view> --right <right_view> # must be rectified\n"
+         << "\t--method <stereo_match_method> # BM | BP | CSBP\n"
+         << "\t--ndisp <number> # number of disparity levels\n"
+         << "\t--type <device_type> # cpu | CPU | gpu | GPU\n";
+     help_showed = true;
+ }
+ 
+ int main(int argc, char** argv)
+ {
+     try
+     {
+         if (argc < 2)
+         {
+             printHelp();
+             return 1;
+         }
+ 
+         Params args = Params::read(argc, argv);
+         if (help_showed)
+             return -1;
+ 
+         int flags[2] = { CVCL_DEVICE_TYPE_GPU, CVCL_DEVICE_TYPE_CPU };
+         vector<Info> info;
+ 
+         if(getDevice(info, flags[args.type]) == 0)
+         {
+             throw runtime_error("Error: Did not find a valid OpenCL device!");
+         }
+         cout << "Device name:" << info[0].DeviceName[0] << endl;
+ 
+         App app(args);
+         app.run();
+     }
+     catch (const exception& e)
+     {
+         cout << "error: " << e.what() << endl;
+     }
+     return 0;
+ }
+ 
+ 
+ Params::Params()
+ {
+     method = BM;
+     ndisp = 64;
+     type = GPU;
+ }
+ 
+ 
+ Params Params::read(int argc, char** argv)
+ {
+     Params p;
+ 
+     for (int i = 1; i < argc; i++)
+     {
+         if (string(argv[i]) == "--left") p.left = argv[++i];
+         else if (string(argv[i]) == "--right") p.right = argv[++i];
+         else if (string(argv[i]) == "--method")
+         {
+             if (string(argv[i + 1]) == "BM") p.method = BM;
+             else if (string(argv[i + 1]) == "BP") p.method = BP;
+             else if (string(argv[i + 1]) == "CSBP") p.method = CSBP;
+             else throw runtime_error("unknown stereo match method: " + string(argv[i + 1]));
+             i++;
+         }
+         else if (string(argv[i]) == "--ndisp") p.ndisp = atoi(argv[++i]);
+         else if (string(argv[i]) == "--type")
+         {
+             string t(argv[++i]);
+             if (t == "cpu" || t == "CPU")
+             {
+                 p.type = CPU;
+             } 
+             else if (t == "gpu" || t == "GPU")
+             {
+                 p.type = GPU;
+             }
+             else throw runtime_error("unknown device type: " + t);
+         }
+         else if (string(argv[i]) == "--help") printHelp();
+         else throw runtime_error("unknown key: " + string(argv[i]));
+     }
+ 
+     return p;
+ }
+ 
+ 
+ App::App(const Params& params)
+     : p(params), running(false)
+ {
+     cout << "stereo_match_ocl sample\n";
+     cout << "\nControls:\n"
+         << "\tesc - exit\n"
+         << "\tp - print current parameters\n"
+         << "\tg - convert source images into gray\n"
+         << "\tm - change stereo match method\n"
+         << "\ts - change Sobel prefiltering flag (for BM only)\n"
+         << "\t1/q - increase/decrease maximum disparity\n"
+         << "\t2/w - increase/decrease window size (for BM only)\n"
+         << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
+         << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
+ }
+ 
+ 
+ void App::run()
+ {
+     // Load images
+     left_src = imread(p.left);
+     right_src = imread(p.right);
+     if (left_src.empty()) throw runtime_error("can't open file \"" + p.left + "\"");
+     if (right_src.empty()) throw runtime_error("can't open file \"" + p.right + "\"");
+ 
- -                cvtColor(left_src, left, CV_BGR2GRAY);
- -                cvtColor(right_src, right, CV_BGR2GRAY);
++    cvtColor(left_src, left, COLOR_BGR2GRAY);
++    cvtColor(right_src, right, COLOR_BGR2GRAY);
+ 
+     d_left.upload(left);
+     d_right.upload(right);
+ 
+     imshow("left", left);
+     imshow("right", right);
+ 
+     // Set common parameters
+     bm.ndisp = p.ndisp;
+     bp.ndisp = p.ndisp;
+     csbp.ndisp = p.ndisp;
+ 
+     cout << endl;
+     printParams();
+ 
+     running = true;
+     while (running)
+     {
+ 
+         // Prepare disparity map of specified type
+         Mat disp;
+         oclMat d_disp;
+         workBegin();
+         switch (p.method)
+         {
+         case Params::BM:
+             if (d_left.channels() > 1 || d_right.channels() > 1)
+             {
+                 cout << "BM doesn't support color images\n";
- -            cvtColor(left_src, left, CV_BGR2GRAY);
- -            cvtColor(right_src, right, CV_BGR2GRAY);
++                cvtColor(left_src, left, COLOR_BGR2GRAY);
++                cvtColor(right_src, right, COLOR_BGR2GRAY);
+                 cout << "image_channels: " << left.channels() << endl;
+                 d_left.upload(left);
+                 d_right.upload(right);
+                 imshow("left", left);
+                 imshow("right", right);
+             }
+             bm(d_left, d_right, d_disp);
+             break;
+         case Params::BP:
+             bp(d_left, d_right, d_disp);
+             break;
+         case Params::CSBP:
+             csbp(d_left, d_right, d_disp);
+             break;
+         }
+         ocl::finish();
+         workEnd();
+ 
+         // Show results
+         d_disp.download(disp);
+         if (p.method != Params::BM)
+         {
+             disp.convertTo(disp, 0);
+         }
+         putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
+         imshow("disparity", disp);
+ 
+         handleKey((char)waitKey(3));
+     }
+ }
+ 
+ 
+ void App::printParams() const // Dump the current image and matcher settings to cout.
+ {
+     cout << "--- Parameters ---\n";
+     cout << "image_size: (" << left.cols << ", " << left.rows << ")\n"; // reported as (cols, rows) of `left`
+     cout << "image_channels: " << left.channels() << endl;
+     cout << "method: " << p.method_str() << endl
+         << "ndisp: " << p.ndisp << endl;
+     switch (p.method) // only the settings of the currently selected method are listed
+     {
+     case Params::BM:
+         cout << "win_size: " << bm.winSize << endl;
+         cout << "prefilter_sobel: " << bm.preset << endl; // streams the value of bm.preset as-is
+         break;
+     case Params::BP:
+         cout << "iter_count: " << bp.iters << endl;
+         cout << "level_count: " << bp.levels << endl;
+         break;
+     case Params::CSBP:
+         cout << "iter_count: " << csbp.iters << endl;
+         cout << "level_count: " << csbp.levels << endl;
+         break;
+     }
+     cout << endl;
+ }
+ 
+ 
+ void App::handleKey(char key) // Apply one keyboard command; unrecognised keys match no case and do nothing.
+ {
+     switch (key)
+     {
+     case 27: // 27 is the ASCII code of the Esc key
+         running = false; // clears the flag tested by `while (running)` in the main loop
+         break;
+     case 'p': case 'P': // print all current parameters
+         printParams();
+         break;
+     case 'g': case 'G': // toggle between the source images and their grayscale versions
+         if (left.channels() == 1 && p.method != Params::BM) // BM never leaves grayscale: it always takes the else branch
+         {
+             left = left_src; // go back to the source images as loaded
+             right = right_src;
+         }
+         else
+         {
++            cvtColor(left_src, left, COLOR_BGR2GRAY);
++            cvtColor(right_src, right, COLOR_BGR2GRAY);
+         }
+         d_left.upload(left); // refresh d_left/d_right so the matchers see the switched images
+         d_right.upload(right);
+         cout << "image_channels: " << left.channels() << endl;
+         imshow("left", left);
+         imshow("right", right);
+         break;
+     case 'm': case 'M': // cycle the method: BM -> BP -> CSBP -> BM
+         switch (p.method)
+         {
+         case Params::BM:
+             p.method = Params::BP;
+             break;
+         case Params::BP:
+             p.method = Params::CSBP;
+             break;
+         case Params::CSBP:
+             p.method = Params::BM;
+             break;
+         }
+         cout << "method: " << p.method_str() << endl;
+         break;
+     case 's': case 'S': // BM only: flip the preset between BASIC_PRESET and PREFILTER_XSOBEL
+         if (p.method == Params::BM)
+         {
+             switch (bm.preset)
+             {
+             case StereoBM_OCL::BASIC_PRESET:
+                 bm.preset = StereoBM_OCL::PREFILTER_XSOBEL;
+                 break;
+             case StereoBM_OCL::PREFILTER_XSOBEL:
+                 bm.preset = StereoBM_OCL::BASIC_PRESET;
+                 break;
+             }
+             cout << "prefilter_sobel: " << bm.preset << endl;
+         }
+         break;
+     case '1': // raise ndisp by 8
+         p.ndisp = p.ndisp == 1 ? 8 : p.ndisp + 8; // from the floor of 1 (see 'q') step to 8 rather than 9
+         cout << "ndisp: " << p.ndisp << endl;
+         bm.ndisp = p.ndisp; // keep all three matchers on the same ndisp
+         bp.ndisp = p.ndisp;
+         csbp.ndisp = p.ndisp;
+         break;
+     case 'q': case 'Q': // lower ndisp by 8, never below 1
+         p.ndisp = max(p.ndisp - 8, 1);
+         cout << "ndisp: " << p.ndisp << endl;
+         bm.ndisp = p.ndisp; // keep all three matchers on the same ndisp
+         bp.ndisp = p.ndisp;
+         csbp.ndisp = p.ndisp;
+         break;
+     case '2': // BM only: grow the window size by 1, capped at 51
+         if (p.method == Params::BM)
+         {
+             bm.winSize = min(bm.winSize + 1, 51);
+             cout << "win_size: " << bm.winSize << endl;
+         }
+         break;
+     case 'w': case 'W': // BM only: shrink the window size by 1, never below 2
+         if (p.method == Params::BM)
+         {
+             bm.winSize = max(bm.winSize - 1, 2);
+             cout << "win_size: " << bm.winSize << endl;
+         }
+         break;
+     case '3': // BP/CSBP: one more iteration (no upper limit is applied here)
+         if (p.method == Params::BP)
+         {
+             bp.iters += 1;
+             cout << "iter_count: " << bp.iters << endl;
+         }
+         else if (p.method == Params::CSBP)
+         {
+             csbp.iters += 1;
+             cout << "iter_count: " << csbp.iters << endl;
+         }
+         break;
+     case 'e': case 'E': // BP/CSBP: one fewer iteration, never below 1
+         if (p.method == Params::BP)
+         {
+             bp.iters = max(bp.iters - 1, 1);
+             cout << "iter_count: " << bp.iters << endl;
+         }
+         else if (p.method == Params::CSBP)
+         {
+             csbp.iters = max(csbp.iters - 1, 1);
+             cout << "iter_count: " << csbp.iters << endl;
+         }
+         break;
+     case '4': // BP/CSBP: one more level (no upper limit is applied here)
+         if (p.method == Params::BP)
+         {
+             bp.levels += 1;
+             cout << "level_count: " << bp.levels << endl;
+         }
+         else if (p.method == Params::CSBP)
+         {
+             csbp.levels += 1;
+             cout << "level_count: " << csbp.levels << endl;
+         }
+         break;
+     case 'r': case 'R': // BP/CSBP: one fewer level, never below 1
+         if (p.method == Params::BP)
+         {
+             bp.levels = max(bp.levels - 1, 1);
+             cout << "level_count: " << bp.levels << endl;
+         }
+         else if (p.method == Params::CSBP)
+         {
+             csbp.levels = max(csbp.levels - 1, 1);
+             cout << "level_count: " << csbp.levels << endl;
+         }
+         break;
+     }
+ }
+ 
+
author	Roman Donchenko <roman.donchenko@itseez.com>
	Tue, 4 Jun 2013 14:31:51 +0000 (18:31 +0400)
committer	Roman Donchenko <roman.donchenko@itseez.com>
	Wed, 5 Jun 2013 11:42:07 +0000 (15:42 +0400)
		1	2
CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
apps/traincascade/boost.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/calib3d/src/solvepnp.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/core/include/opencv2/core/cuda/detail/color_detail.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/core/include/opencv2/core/cuda/functional.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/core/include/opencv2/core/cuda/utility.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/features2d/src/detectors.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/gpu/CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
modules/gpu/src/calib3d.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/gpu/src/cuda/calib3d.cu	patch \|	diff1 \|	diff2 \|	blob \| history
modules/gpu/src/cuda/canny.cu	patch \|	diff1 \|	diff2 \|	blob \| history
modules/gpu/src/cuda/element_operations.cu	patch \|	diff1 \|	diff2 \|	blob \| history
modules/gpu/src/matrix_reductions.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/gpu/test/test_core.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/imgproc/doc/miscellaneous_transformations.rst	patch \|	diff1 \|	diff2 \|	blob \| history
modules/imgproc/src/color.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/imgproc/src/distransform.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/imgproc/src/histogram.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/imgproc/src/morph.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ml/src/gbt.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ml/src/svm.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/nonfree/src/surf.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/objdetect/src/cascadedetect.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/objdetect/src/latentsvm.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/include/opencv2/ocl.hpp	patch \|	diff1 \|	\|	blob \| history
modules/ocl/include/opencv2/ocl/private/util.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/perf/perf_canny.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/perf/perf_color.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/perf/perf_haar.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/perf/perf_imgproc.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/perf/perf_match_template.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/perf/precomp.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/perf/precomp.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/arithm.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/brute_force_matcher.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/canny.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/filtering.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/gfft.cpp	patch \|	\|	diff2 \|	blob \| history
modules/ocl/src/haar.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/imgproc.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/mcwutil.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/opencl/haarobjectdetect.cl	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/precomp.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/pyrlk.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/src/safe_call.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/ocl/test/test_haar.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/photo/src/denoising.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/photo/src/fast_nlmeans_denoising_invoker.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/photo/src/precomp.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/stitching/src/matchers.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/stitching/src/precomp.hpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/video/src/bgfg_gaussmix2.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
modules/video/src/lkpyramid.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
platforms/scripts/camera_build.conf	patch \|	diff1 \|	diff2 \|	blob \| history
platforms/scripts/cmake_android_all_cameras.py	patch \|	diff1 \|	diff2 \|	blob \| history
samples/ocl/facedetect.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
samples/ocl/hog.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
samples/ocl/pyrlk_optical_flow.cpp	patch \|	\|	diff2 \|	blob \| history
samples/ocl/stereo_match.cpp	patch \|	\|	diff2 \|	blob \| history