update some of the functions in ocl module to the latest version
author yao <bitwangyaoyao@gmail.com>
Mon, 3 Sep 2012 09:03:37 +0000 (17:03 +0800)
committer yao <bitwangyaoyao@gmail.com>
Mon, 3 Sep 2012 09:03:37 +0000 (17:03 +0800)
24 files changed:
modules/ocl/include/opencv2/ocl/ocl.hpp
modules/ocl/perf/test_blend.cpp [new file with mode: 0644]
modules/ocl/perf/test_canny.cpp [new file with mode: 0644]
modules/ocl/perf/test_columnsum.cpp [new file with mode: 0644]
modules/ocl/perf/test_fft.cpp [new file with mode: 0644]
modules/ocl/perf/test_gemm.cpp [new file with mode: 0644]
modules/ocl/perf/test_hog.cpp [new file with mode: 0644]
modules/ocl/perf/test_match_template.cpp [new file with mode: 0644]
modules/ocl/perf/test_pyrdown.cpp [new file with mode: 0644]
modules/ocl/perf/test_pyrup.cpp [new file with mode: 0644]
modules/ocl/src/canny.cpp
modules/ocl/src/columnsum.cpp
modules/ocl/src/hog.cpp
modules/ocl/src/kernels/blend_linear.cl
modules/ocl/src/kernels/imgproc_canny.cl
modules/ocl/src/kernels/nonfree_surf.cl
modules/ocl/src/kernels/objdetect_hog.cl
modules/ocl/src/match_template.cpp
modules/ocl/src/pyrdown.cpp
modules/ocl/src/pyrup.cpp
modules/ocl/src/surf.cpp
modules/ocl/test/test_columnsum.cpp
modules/ocl/test/test_pyrdown.cpp
modules/ocl/test/test_pyrup.cpp

index a86f373..3eff7ef 100644 (file)
@@ -877,32 +877,32 @@ namespace cv
                // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
                CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf);
 
-               ///////////////////////////////////////////// Canny /////////////////////////////////////////////
-               struct CV_EXPORTS CannyBuf;
-
-               //! compute edges of the input image using Canny operator
-               // Support CV_8UC1 only
-               CV_EXPORTS void Canny(const oclMat& image, oclMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-               CV_EXPORTS void Canny(const oclMat& image, CannyBuf& buf, oclMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-               CV_EXPORTS void Canny(const oclMat& dx, const oclMat& dy, oclMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-               CV_EXPORTS void Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
-
-               struct CV_EXPORTS CannyBuf
-               {
-                       CannyBuf() {}
-                       explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
-                       CannyBuf(const oclMat& dx_, const oclMat& dy_);
-
-                       void create(const Size& image_size, int apperture_size = 3);
-
-                       void release();
-
-                       oclMat dx, dy;
-                       oclMat dx_buf, dy_buf;
-                       oclMat edgeBuf;
-                       oclMat trackBuf1, trackBuf2;
-                       oclMat counter;
-                       Ptr<FilterEngine_GPU> filterDX, filterDY;
+               ///////////////////////////////////////////// Canny /////////////////////////////////////////////\r
+               struct CV_EXPORTS CannyBuf;\r
+\r
+               //! compute edges of the input image using Canny operator\r
+               // Support CV_8UC1 only\r
+               CV_EXPORTS void Canny(const oclMat& image, oclMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);\r
+               CV_EXPORTS void Canny(const oclMat& image, CannyBuf& buf, oclMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);\r
+               CV_EXPORTS void Canny(const oclMat& dx, const oclMat& dy, oclMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);\r
+               CV_EXPORTS void Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);\r
+\r
+               struct CV_EXPORTS CannyBuf\r
+               {\r
+                       CannyBuf() {}\r
+                       explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}\r
+                       CannyBuf(const oclMat& dx_, const oclMat& dy_);\r
+\r
+                       void create(const Size& image_size, int apperture_size = 3);\r
+\r
+                       void release();\r
+\r
+                       oclMat dx, dy;\r
+                       oclMat dx_buf, dy_buf;\r
+                       oclMat edgeBuf;\r
+                       oclMat trackBuf1, trackBuf2;\r
+                       void * counter;\r
+                       Ptr<FilterEngine_GPU> filterDX, filterDY;\r
                };
 
 #ifdef HAVE_CLAMDFFT
@@ -935,154 +935,161 @@ namespace cv
                const oclMat& src3, double beta, oclMat& dst, int flags = 0);
 #endif
 
-        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
-        struct CV_EXPORTS HOGDescriptor
-        {
-            enum { DEFAULT_WIN_SIGMA = -1 };
-            enum { DEFAULT_NLEVELS = 64 };
-            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
-
-            HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
-                          Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
-                          int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
-                          double threshold_L2hys=0.2, bool gamma_correction=true,
-                          int nlevels=DEFAULT_NLEVELS);
-
-            size_t getDescriptorSize() const;
-            size_t getBlockHistogramSize() const;
-
-            void setSVMDetector(const vector<float>& detector);
-
-            static vector<float> getDefaultPeopleDetector();
-            static vector<float> getPeopleDetector48x96();
-            static vector<float> getPeopleDetector64x128();
-
-            void detect(const oclMat& img, vector<Point>& found_locations,
-                        double hit_threshold=0, Size win_stride=Size(),
-                        Size padding=Size());
-
-            void detectMultiScale(const oclMat& img, vector<Rect>& found_locations,
-                                  double hit_threshold=0, Size win_stride=Size(),
-                                  Size padding=Size(), double scale0=1.05,
-                                  int group_threshold=2);
-
-            void getDescriptors(const oclMat& img, Size win_stride,
-                                oclMat& descriptors,
-                                int descr_format=DESCR_FORMAT_COL_BY_COL);
-
-            Size win_size;
-            Size block_size;
-            Size block_stride;
-            Size cell_size;
-            int nbins;
-            double win_sigma;
-            double threshold_L2hys;
-            bool gamma_correction;
-            int nlevels;
-
-        protected:
-            void computeBlockHistograms(const oclMat& img);
-            void computeGradient(const oclMat& img, oclMat& grad, oclMat& qangle);
-
-            double getWinSigma() const;
-            bool checkDetectorSize() const;
-
-            static int numPartsWithin(int size, int part_size, int stride);
-            static Size numPartsWithin(Size size, Size part_size, Size stride);
-
-            // Coefficients of the separating plane
-            float free_coef;
-            oclMat detector;
-
-            // Results of the last classification step
-            oclMat labels;
-            Mat labels_host;
-
-            // Results of the last histogram evaluation step
-            oclMat block_hists;
-
-            // Gradients conputation results
-            oclMat grad, qangle;
-
-            std::vector<oclMat> image_scales;
-        };
-
-        //! Speeded up robust features, port from GPU module.
-        ////////////////////////////////// SURF //////////////////////////////////////////
-        class CV_EXPORTS SURF_OCL
-        {
-        public:
-            enum KeypointLayout
-            {
-                X_ROW = 0,
-                Y_ROW,
-                LAPLACIAN_ROW,
-                OCTAVE_ROW,
-                SIZE_ROW,
-                ANGLE_ROW,
-                HESSIAN_ROW,
-                ROWS_COUNT
-            };
-
-            //! the default constructor
-            SURF_OCL();
-            //! the full constructor taking all the necessary parameters
-            explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
-                int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
-
-            //! returns the descriptor size in float's (64 or 128)
-            int descriptorSize() const;
-            
-            //! upload host keypoints to device memory
-            void uploadKeypoints(const vector<cv::KeyPoint>& keypoints, oclMat& keypointsocl);
-            //! download keypoints from device to host memory
-            void downloadKeypoints(const oclMat& keypointsocl, vector<KeyPoint>& keypoints);
-
-            //! download descriptors from device to host memory
-            void downloadDescriptors(const oclMat& descriptorsocl, vector<float>& descriptors);
-
-            //! finds the keypoints using fast hessian detector used in SURF
-            //! supports CV_8UC1 images
-            //! keypoints will have nFeature cols and 6 rows
-            //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
-            //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
-            //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
-            //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
-            //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
-            //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
-            //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
-            void operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints);
-            //! finds the keypoints and computes their descriptors.
-            //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
-            void operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints, oclMat& descriptors,
-                bool useProvidedKeypoints = false);
-
-            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints);
-            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints, oclMat& descriptors,
-                bool useProvidedKeypoints = false);
-
-            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
-                bool useProvidedKeypoints = false);
-
-            void releaseMemory();
-
-            // SURF parameters
-            float hessianThreshold;
-            int nOctaves;
-            int nOctaveLayers;
-            bool extended;
-            bool upright;
-
-            //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
-            float keypointsRatio;
-
-            oclMat sum, mask1, maskSum, intBuffer;
-
-            oclMat det, trace;
-
-            oclMat maxPosBuffer;
-
-        };
+        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////\r
+        // HOG descriptor / detector working on oclMat images. The public data members\r
+        // mirror the constructor arguments; the protected members hold the SVM detector\r
+        // and the intermediate results described by the comments below.\r
+        struct CV_EXPORTS HOGDescriptor\r
+        {\r
+            enum { DEFAULT_WIN_SIGMA = -1 };\r
+            enum { DEFAULT_NLEVELS = 64 };\r
+            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };\r
+\r
+            HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),\r
+                          Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),\r
+                          int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,\r
+                          double threshold_L2hys=0.2, bool gamma_correction=true,\r
+                          int nlevels=DEFAULT_NLEVELS);\r
+\r
+            size_t getDescriptorSize() const;\r
+            size_t getBlockHistogramSize() const;\r
+\r
+            void setSVMDetector(const vector<float>& detector);\r
+\r
+            static vector<float> getDefaultPeopleDetector();\r
+            static vector<float> getPeopleDetector48x96();\r
+            static vector<float> getPeopleDetector64x128();\r
+\r
+            void detect(const oclMat& img, vector<Point>& found_locations,\r
+                        double hit_threshold=0, Size win_stride=Size(),\r
+                        Size padding=Size());\r
+\r
+            void detectMultiScale(const oclMat& img, vector<Rect>& found_locations,\r
+                                  double hit_threshold=0, Size win_stride=Size(),\r
+                                  Size padding=Size(), double scale0=1.05,\r
+                                  int group_threshold=2);\r
+\r
+            void getDescriptors(const oclMat& img, Size win_stride,\r
+                                oclMat& descriptors,\r
+                                int descr_format=DESCR_FORMAT_COL_BY_COL);\r
+\r
+            Size win_size;\r
+            Size block_size;\r
+            Size block_stride;\r
+            Size cell_size;\r
+            int nbins;\r
+            double win_sigma;\r
+            double threshold_L2hys;\r
+            bool gamma_correction;\r
+            int nlevels;\r
+\r
+        protected:\r
+            // Initialize the internal buffers; this only needs to be done once for multi-scale detection\r
+            void init_buffer(const oclMat& img, Size win_stride);\r
+\r
+            void computeBlockHistograms(const oclMat& img);\r
+            void computeGradient(const oclMat& img, oclMat& grad, oclMat& qangle);\r
+\r
+            double getWinSigma() const;\r
+            bool checkDetectorSize() const;\r
+\r
+            static int numPartsWithin(int size, int part_size, int stride);\r
+            static Size numPartsWithin(Size size, Size part_size, Size stride);\r
+\r
+            // Coefficients of the separating plane\r
+            float free_coef;\r
+            oclMat detector;\r
+\r
+            // Results of the last classification step\r
+            oclMat labels;\r
+            Mat labels_host;\r
+\r
+            // Results of the last histogram evaluation step\r
+            oclMat block_hists;\r
+\r
+            // Gradient computation results\r
+            oclMat grad, qangle;\r
+\r
+            // Scaled image\r
+            oclMat image_scale;\r
+\r
+            // Effective size of the input image (may differ from the original size after scaling)\r
+            Size effect_size;\r
+        };\r
+
+        //! Speeded up robust features, port from GPU module.\r
+        ////////////////////////////////// SURF //////////////////////////////////////////\r
+        class CV_EXPORTS SURF_OCL\r
+        {\r
+        public:\r
+            enum KeypointLayout\r
+            {\r
+                X_ROW = 0,\r
+                Y_ROW,\r
+                LAPLACIAN_ROW,\r
+                OCTAVE_ROW,\r
+                SIZE_ROW,\r
+                ANGLE_ROW,\r
+                HESSIAN_ROW,\r
+                ROWS_COUNT\r
+            };\r
+\r
+            //! the default constructor\r
+            SURF_OCL();\r
+            //! the full constructor taking all the necessary parameters\r
+            explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,\r
+                int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);\r
+\r
+            //! returns the descriptor size in floats (64 or 128)\r
+            int descriptorSize() const;\r
+            \r
+            //! upload host keypoints to device memory\r
+            void uploadKeypoints(const vector<cv::KeyPoint>& keypoints, oclMat& keypointsocl);\r
+            //! download keypoints from device to host memory\r
+            void downloadKeypoints(const oclMat& keypointsocl, vector<KeyPoint>& keypoints);\r
+\r
+            //! download descriptors from device to host memory\r
+            void downloadDescriptors(const oclMat& descriptorsocl, vector<float>& descriptors);\r
+\r
+            //! finds the keypoints using fast hessian detector used in SURF\r
+            //! supports CV_8UC1 images\r
+            //! keypoints will have nFeature cols and ROWS_COUNT (7) rows, one row per KeypointLayout entry\r
+            //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature\r
+            //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature\r
+            //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature\r
+            //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature\r
+            //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature\r
+            //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature\r
+            //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature\r
+            void operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints);\r
+            //! finds the keypoints and computes their descriptors.\r
+            //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction\r
+            void operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints, oclMat& descriptors,\r
+                bool useProvidedKeypoints = false);\r
+\r
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints);\r
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints, oclMat& descriptors,\r
+                bool useProvidedKeypoints = false);\r
+\r
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,\r
+                bool useProvidedKeypoints = false);\r
+\r
+            void releaseMemory();\r
+\r
+            // SURF parameters\r
+            float hessianThreshold;\r
+            int nOctaves;\r
+            int nOctaveLayers;\r
+            bool extended;\r
+            bool upright;\r
+\r
+            //! max keypoints = min(keypointsRatio * img.size().area(), 65535)\r
+            float keypointsRatio;\r
+\r
+            oclMat sum, mask1, maskSum, intBuffer;\r
+\r
+            oclMat det, trace;\r
+\r
+            oclMat maxPosBuffer;\r
+\r
+        };\r
     }
 }
 #include "opencv2/ocl/matrix_operations.hpp"
diff --git a/modules/ocl/perf/test_blend.cpp b/modules/ocl/perf/test_blend.cpp
new file mode 100644 (file)
index 0000000..ad5b402
--- /dev/null
@@ -0,0 +1,122 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+// Fixture for the blendLinear performance test, parameterized by
+// (MatType, int): element depth and channel count.
+PARAM_TEST_CASE(Blend, MatType, int)
+{
+       // Element depth (first test parameter); combined with `channels`
+       // through CV_MAKETYPE in the test body.
+       int type;
+       // Channel count (second test parameter).
+       int channels;
+       // Only referenced by the commented-out getDevice() call below.
+       std::vector<cv::ocl::Info> oclinfo;
+       
+       // Copies the two test parameters into the members above.
+       virtual void SetUp()
+       {
+       
+               type = GET_PARAM(0);
+               channels = GET_PARAM(1);
+               //int devnums = getDevice(oclinfo);
+               //CV_Assert(devnums > 0);
+               //cv::ocl::setBinpath(CLBINPATH);
+       }
+};
+
+TEST_P(Blend, Performance)
+{
+       cv::Size size(MWIDTH, MHEIGHT);
+       cv::Mat img1_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
+       cv::Mat img2_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
+       cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
+       cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
+       cv::ocl::oclMat gimg1(size, CV_MAKETYPE(type, channels)), gimg2(size, CV_MAKETYPE(type, channels)), gweights1(size, CV_32F), gweights2(size, CV_32F);
+       cv::ocl::oclMat gdst(size, CV_MAKETYPE(type, channels));
+       
+       
+       double totalgputick_all = 0;
+       double totalgputick_kernel = 0;
+       double t1 = 0;
+       double t2 = 0;
+       
+       for (int j = 0; j < LOOP_TIMES + 1; j ++) //LOOP_TIMES=100
+       {
+               t1 = (double)cvGetTickCount();
+               cv::ocl::oclMat gimg1 = cv::ocl::oclMat(img1_host);
+               cv::ocl::oclMat gimg2 = cv::ocl::oclMat(img2_host);
+               cv::ocl::oclMat gweights1 = cv::ocl::oclMat(weights1);
+               cv::ocl::oclMat gweights2 = cv::ocl::oclMat(weights1);
+               
+               t2 = (double)cvGetTickCount();
+               cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, gdst);
+               t2 = (double)cvGetTickCount() - t2;
+               
+               cv::Mat m;
+               gdst.download(m);
+               t1 = (double)cvGetTickCount() - t1;
+               
+               if (j == 0)
+               {
+                       continue;
+               }
+               
+               totalgputick_all = t1 + totalgputick_all;
+               totalgputick_kernel = t2 + totalgputick_kernel;
+       };
+       
+       cout << "average gpu total  runtime is  " << totalgputick_all / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+       
+       cout << "average gpu runtime without data transfering  is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+       
+}
+
+// Runs Blend for every combination of depth {CV_8U, CV_32F} and channel count {1, 4}.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
+                            Values(CV_8U, CV_32F), Values(1, 4)));
+#endif
\ No newline at end of file
diff --git a/modules/ocl/perf/test_canny.cpp b/modules/ocl/perf/test_canny.cpp
new file mode 100644 (file)
index 0000000..8eff35f
--- /dev/null
@@ -0,0 +1,155 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+#define FILTER_IMAGE "../../../samples/gpu/road.png"
+
+#ifndef MWC_TEST_UTILITY
+#define MWC_TEST_UTILITY
+
+// Param class helper.
+// IMPLEMENT_PARAM_CLASS(name, type) defines a class `name` that wraps a single
+// `type` value, converts implicitly from and to `type`, and provides a PrintTo
+// overload that prints `name(value)` via testing::PrintToString.
+// The macro and the Channels class below are only defined here if the macro
+// has not already been defined.
+#ifndef IMPLEMENT_PARAM_CLASS
+#define IMPLEMENT_PARAM_CLASS(name, type) \
+class name \
+       { \
+       public: \
+       name ( type arg = type ()) : val_(arg) {} \
+       operator type () const {return val_;} \
+       private: \
+       type val_; \
+       }; \
+       inline void PrintTo( name param, std::ostream* os) \
+       { \
+       *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
+       }
+
+IMPLEMENT_PARAM_CLASS(Channels, int)
+#endif // IMPLEMENT_PARAM_CLASS
+#endif // MWC_TEST_UTILITY
+
+////////////////////////////////////////////////////////
+// Canny1
+
+IMPLEMENT_PARAM_CLASS(AppertureSize, int);
+IMPLEMENT_PARAM_CLASS(L2gradient, bool);
+
+// Fixture for the Canny performance test, parameterized by
+// (AppertureSize, L2gradient).
+PARAM_TEST_CASE(Canny1, AppertureSize, L2gradient)
+{
+       // Aperture size forwarded to cv::ocl::Canny (first test parameter).
+       int apperture_size;
+       // L2gradient flag forwarded to cv::ocl::Canny (second test parameter).
+       bool useL2gradient;
+       //std::vector<cv::ocl::Info> oclinfo;
+
+       // Copies the two test parameters into the members above.
+       virtual void SetUp()
+       {
+               apperture_size = GET_PARAM(0);
+               useL2gradient = GET_PARAM(1);
+               
+               //int devnums = getDevice(oclinfo);
+               //CV_Assert(devnums > 0);
+       }
+};
+
+TEST_P(Canny1, Performance)
+{
+       cv::Mat img = readImage(FILTER_IMAGE,cv::IMREAD_GRAYSCALE);
+       ASSERT_FALSE(img.empty());
+
+       double low_thresh = 100.0;
+       double high_thresh = 150.0;
+
+       cv::Mat edges_gold;
+       cv::ocl::oclMat edges;
+
+    double totalgputick=0;
+       double totalgputick_kernel=0;
+       
+       double t1=0;
+       double t2=0;
+       for(int j = 0; j < LOOP_TIMES+1; j ++)
+       {
+
+               t1 = (double)cvGetTickCount();//gpu start1              
+                       
+               cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);//upload
+                       
+               t2=(double)cvGetTickCount();//kernel
+               cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
+               t2 = (double)cvGetTickCount() - t2;//kernel
+                       
+               cv::Mat cpu_dst;
+               edges.download (cpu_dst);//download
+                       
+               t1 = (double)cvGetTickCount() - t1;//gpu end1
+
+               if(j == 0)
+                       continue;
+
+               totalgputick=t1+totalgputick;
+
+               totalgputick_kernel=t2+totalgputick_kernel;     
+
+       }
+
+       cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+
+
+}
+
+// Runs Canny1 for aperture sizes {3, 5} combined with L2gradient {false, true}.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny1, testing::Combine(
+                                               testing::Values(AppertureSize(3), AppertureSize(5)),
+                                               testing::Values(L2gradient(false), L2gradient(true))));
+
+
+
+#endif  //Have opencl
\ No newline at end of file
diff --git a/modules/ocl/perf/test_columnsum.cpp b/modules/ocl/perf/test_columnsum.cpp
new file mode 100644 (file)
index 0000000..c1f23fc
--- /dev/null
@@ -0,0 +1,120 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//        Fangfang Bai fangfang@multicorewareinc.com
+//    
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+///////////////////////////////////////////////////////////////////////////////
+/// ColumnSum
+
+#ifdef HAVE_OPENCL
+
+////////////////////////////////////////////////////////////////////////
+// ColumnSum
+
+// Fixture for the columnSum benchmark. It is declared through PARAM_TEST_CASE
+// with no parameter types and is used with TEST_F below.
+PARAM_TEST_CASE(ColumnSum)
+{
+       // NOTE(review): the test body declares its own local `src`, so this member
+       // does not appear to be read -- confirm before removing.
+       cv::Mat src;
+       //std::vector<cv::ocl::Info> oclinfo;
+
+       // Currently a no-op: the device query below is commented out.
+       virtual void SetUp()
+       {
+               //int devnums = getDevice(oclinfo);
+               //CV_Assert(devnums > 0);
+       }
+};
+
+TEST_F(ColumnSum, Performance)
+{
+       // Benchmarks cv::ocl::columnSum on a random MWIDTH x MHEIGHT CV_32FC1 matrix.
+       const cv::Size matSize(MWIDTH, MHEIGHT);
+       cv::Mat hostSrc = randomMat(matSize, CV_32FC1);
+       cv::ocl::oclMat d_dst;
+
+       double ticksWithTransfer = 0;
+       double ticksKernelOnly   = 0;
+
+       // Iteration 0 runs but is excluded from the totals
+       // (presumably a warm-up pass -- confirm).
+       for (int iter = 0; iter <= LOOP_TIMES; ++iter)
+       {
+               const double wholeStart = (double)cvGetTickCount();
+
+               cv::ocl::oclMat d_src(hostSrc); // upload
+
+               const double kernelStart = (double)cvGetTickCount();
+               cv::ocl::columnSum(d_src, d_dst);
+               const double kernelTicks = (double)cvGetTickCount() - kernelStart;
+
+               cv::Mat hostDst;
+               d_dst.download(hostDst); // download
+
+               const double wholeTicks = (double)cvGetTickCount() - wholeStart;
+
+               if (iter != 0)
+               {
+                       ticksWithTransfer += wholeTicks;
+                       ticksKernelOnly   += kernelTicks;
+               }
+       }
+
+       cout << "average gpu runtime is  "
+            << ticksWithTransfer / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  "
+            << ticksKernelOnly / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+}
+
+
+
+#endif 
\ No newline at end of file
diff --git a/modules/ocl/perf/test_fft.cpp b/modules/ocl/perf/test_fft.cpp
new file mode 100644 (file)
index 0000000..6b929f4
--- /dev/null
@@ -0,0 +1,126 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfangbai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+using namespace std;
+#ifdef HAVE_CLAMDFFT
+////////////////////////////////////////////////////////////////////////////
+// Dft
+// Fixture for the DFT tests, parameterised on (transform size, row-wise flag).
+PARAM_TEST_CASE(Dft, cv::Size, bool)
+{
+       cv::Size dft_size;                  // parameter 0
+       bool     dft_rows;                  // parameter 1
+       std::vector<cv::ocl::Info> info;    // filled by cv::ocl::getDevice()
+
+       // Unpacks the test parameters, then queries the OpenCL devices.
+       // The value returned by getDevice() is not checked here.
+       virtual void SetUp()
+       {
+               dft_size = GET_PARAM(0);
+               dft_rows = GET_PARAM(1);
+
+               cv::ocl::getDevice(info);
+       }
+};
+
+TEST_P(Dft, C2C)
+{
+       // Benchmarks cv::ocl::dft on a random CV_32FC2 matrix of size dft_size.
+       cv::Mat hostSrc = randomMat(dft_size, CV_32FC2, 0.0, 10.0);
+
+       // DFT_ROWS is requested only when the fixture parameter asks for it.
+       const int flags = dft_rows ? cv::DFT_ROWS : 0;
+
+       cv::ocl::oclMat d_b;
+
+       double ticksWithTransfer = 0;
+       double ticksKernelOnly   = 0;
+
+       // Iteration 0 runs but is excluded from the totals
+       // (presumably a warm-up pass -- confirm).
+       for (int iter = 0; iter <= LOOP_TIMES; ++iter)
+       {
+               const double wholeStart = (double)cvGetTickCount();
+
+               cv::ocl::oclMat deviceSrc = cv::ocl::oclMat(hostSrc); // upload
+
+               const double kernelStart = (double)cvGetTickCount();
+               cv::ocl::dft(deviceSrc, d_b, hostSrc.size(), flags);
+               const double kernelTicks = (double)cvGetTickCount() - kernelStart;
+
+               cv::Mat hostDst;
+               d_b.download(hostDst); // download
+
+               const double wholeTicks = (double)cvGetTickCount() - wholeStart;
+
+               if (iter != 0)
+               {
+                       ticksWithTransfer += wholeTicks;
+                       ticksKernelOnly   += kernelTicks;
+               }
+       }
+
+       cout << "average gpu runtime is  "
+            << ticksWithTransfer / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  "
+            << ticksKernelOnly / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+}
+
+
+
+TEST_P(Dft, R2CthenC2R)
+{
+       // Round trip: forward DFT of a real matrix, then an inverse DFT with real
+       // output, and a comparison of the result against the original input.
+       cv::Mat hostSrc = randomMat(dft_size, CV_32FC1, 0.0, 10.0);
+
+       // Row-wise mode is deliberately not requested here:
+       //   flags |= dft_rows ? cv::DFT_ROWS : 0; // not supported yet
+       const int forwardFlags = 0;
+       const int inverseFlags = forwardFlags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT;
+
+       cv::ocl::oclMat d_b;
+       cv::ocl::oclMat d_c;
+
+       cv::ocl::dft(cv::ocl::oclMat(hostSrc), d_b, hostSrc.size(), forwardFlags);
+       cv::ocl::dft(d_b, d_c, hostSrc.size(), inverseFlags);
+
+       // Tolerance grows with the number of elements in the matrix.
+       EXPECT_MAT_NEAR(hostSrc, d_c, hostSrc.size().area() * 1e-4, "");
+}
+
+//INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine(
+//                                             testing::Values(cv::Size(1280, 1024), cv::Size(1920, 1080),cv::Size(1800, 1500)),
+//                                             testing::Values(false, true)));
+
+#endif // HAVE_CLAMDFFT
diff --git a/modules/ocl/perf/test_gemm.cpp b/modules/ocl/perf/test_gemm.cpp
new file mode 100644 (file)
index 0000000..6cdbc47
--- /dev/null
@@ -0,0 +1,113 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "precomp.hpp"
+using namespace std;
+#ifdef HAVE_CLAMDBLAS
+////////////////////////////////////////////////////////////////////////////
+// GEMM
+// Fixture for the GEMM benchmark, parameterised on (matrix type, size, gemm flags).
+PARAM_TEST_CASE(Gemm, int, cv::Size, int)
+{
+       int      type;                      // parameter 0
+       cv::Size mat_size;                  // parameter 1
+       int      flags;                     // parameter 2
+       std::vector<cv::ocl::Info> info;    // filled by cv::ocl::getDevice()
+
+       // Unpacks the test parameters, then queries the OpenCL devices.
+       // The value returned by getDevice() is not checked here.
+       virtual void SetUp()
+       {
+               type     = GET_PARAM(0);
+               mat_size = GET_PARAM(1);
+               flags    = GET_PARAM(2);
+
+               cv::ocl::getDevice(info);
+       }
+};
+
+TEST_P(Gemm, Performance)
+{
+       // Benchmarks cv::ocl::gemm on three random matrices of identical size and
+       // type, with both scale factors set to 1.0.
+       cv::Mat hostA = randomMat(mat_size, type, 0.0, 10.0);
+       cv::Mat hostB = randomMat(mat_size, type, 0.0, 10.0);
+       cv::Mat hostC = randomMat(mat_size, type, 0.0, 10.0);
+       cv::ocl::oclMat ocl_dst;
+
+       double ticksWithTransfer = 0;
+       double ticksKernelOnly   = 0;
+
+       // Iteration 0 runs but is excluded from the totals
+       // (presumably a warm-up pass -- confirm).
+       for (int iter = 0; iter <= LOOP_TIMES; ++iter)
+       {
+               const double wholeStart = (double)cvGetTickCount();
+
+               // upload all three operands
+               cv::ocl::oclMat deviceA = cv::ocl::oclMat(hostA);
+               cv::ocl::oclMat deviceB = cv::ocl::oclMat(hostB);
+               cv::ocl::oclMat deviceC = cv::ocl::oclMat(hostC);
+
+               const double kernelStart = (double)cvGetTickCount();
+               cv::ocl::gemm(deviceA, deviceB, 1.0, deviceC, 1.0, ocl_dst, flags);
+               const double kernelTicks = (double)cvGetTickCount() - kernelStart;
+
+               cv::Mat hostDst;
+               ocl_dst.download(hostDst); // download
+
+               const double wholeTicks = (double)cvGetTickCount() - wholeStart;
+
+               if (iter != 0)
+               {
+                       ticksWithTransfer += wholeTicks;
+                       ticksKernelOnly   += kernelTicks;
+               }
+       }
+
+       cout << "average gpu runtime is  "
+            << ticksWithTransfer / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  "
+            << ticksKernelOnly / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+}
+
+
+// Run Gemm for every (type, size, transpose flags) combination below.
+// The 64-bit float types are left commented out.
+INSTANTIATE_TEST_CASE_P(
+       ocl_gemm, Gemm,
+       testing::Combine(
+               testing::Values(CV_32FC1, CV_32FC2 /* , CV_64FC1, CV_64FC2*/),
+               testing::Values(cv::Size(512, 512), cv::Size(1024, 1024)),
+               testing::Values(0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_1_T + cv::GEMM_2_T)));
+#endif
\ No newline at end of file
diff --git a/modules/ocl/perf/test_hog.cpp b/modules/ocl/perf/test_hog.cpp
new file mode 100644 (file)
index 0000000..9abcb20
--- /dev/null
@@ -0,0 +1,218 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//             Fangfang BAI, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencv2/core/core.hpp"
+#include <iomanip>
+using namespace std;
+
+
+
+#ifdef HAVE_OPENCL
+
+
+// Fixture for the HOG benchmarks, parameterised on (detection window size, image type).
+PARAM_TEST_CASE(HOG, cv::Size, int)
+{
+       cv::Size winSize;                       // parameter 0
+       int      type;                          // parameter 1
+       std::vector<cv::ocl::Info> oclinfo;     // filled by getDevice()
+
+       // Unpacks the test parameters and asserts that getDevice() reports
+       // at least one device.
+       virtual void SetUp()
+       {
+               winSize = GET_PARAM(0);
+               type    = GET_PARAM(1);
+
+               const int deviceCount = getDevice(oclinfo);
+               CV_Assert(deviceCount > 0);
+       }
+};
+
+TEST_P(HOG, GetDescriptors)
+{
+       // Benchmarks cv::ocl::HOGDescriptor::getDescriptors with a default-constructed
+       // descriptor; the fixture's winSize parameter is not used in this test.
+       // NOTE(review): "D:road.png" looks like a machine-specific path -- confirm.
+       cv::Mat colorImage = readImage("D:road.png");
+       ASSERT_FALSE(colorImage.empty());
+
+       // CV_8UC1 selects a grey image; every other type value gets BGRA.
+       cv::Mat img;
+       if (type == CV_8UC1)
+               cv::cvtColor(colorImage, img, CV_BGR2GRAY);
+       else
+               cv::cvtColor(colorImage, img, CV_BGR2BGRA);
+
+       cv::ocl::HOGDescriptor ocl_hog;
+       ocl_hog.gamma_correction = true;
+
+       cv::ocl::oclMat d_descriptors;
+
+       double ticksWithTransfer = 0;
+       double ticksKernelOnly   = 0;
+
+       // Iteration 0 runs but is excluded from the totals
+       // (presumably a warm-up pass -- confirm).
+       for (int iter = 0; iter <= LOOP_TIMES; ++iter)
+       {
+               const double wholeStart = (double)cvGetTickCount();
+
+               cv::ocl::oclMat d_img = cv::ocl::oclMat(img); // upload
+
+               const double kernelStart = (double)cvGetTickCount();
+               ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
+               const double kernelTicks = (double)cvGetTickCount() - kernelStart;
+
+               cv::Mat hostDescriptors;
+               d_descriptors.download(hostDescriptors); // download
+
+               const double wholeTicks = (double)cvGetTickCount() - wholeStart;
+
+               if (iter != 0)
+               {
+                       ticksWithTransfer += wholeTicks;
+                       ticksKernelOnly   += kernelTicks;
+               }
+       }
+
+       cout << "average gpu runtime is  "
+            << ticksWithTransfer / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  "
+            << ticksKernelOnly / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+}
+
+
+TEST_P(HOG, Detect)
+{
+       // Benchmarks cv::ocl::HOGDescriptor::detect for the window size and image
+       // type supplied by the fixture. Only the OpenCL detector is exercised, so
+       // no CPU cv::HOGDescriptor is built.
+
+       // Load image
+       // NOTE(review): "D:road.png" looks like a machine-specific path -- confirm.
+       cv::Mat img_rgb = readImage("D:road.png");
+       ASSERT_FALSE(img_rgb.empty());
+
+       // Convert image: CV_8UC1 selects grey, every other type value gets BGRA.
+       cv::Mat img;
+       switch (type)
+       {
+       case CV_8UC1:
+               cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+               break;
+       case CV_8UC4:
+       default:
+               cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+               break;
+       }
+
+       // Only 48x96 and 64x128 windows are handled; anything else becomes 64x128.
+       if ((winSize != cv::Size(48, 96)) && (winSize != cv::Size(64, 128)))
+               winSize = cv::Size(64, 128);
+       cv::ocl::HOGDescriptor ocl_hog(winSize);
+       ocl_hog.gamma_correction = true;
+
+       // After the clamp above winSize is one of exactly two values, so a
+       // two-way choice covers every case.
+       if (winSize.width == 48 && winSize.height == 96)
+       {
+               // daimler's base
+               ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector48x96());
+       }
+       else
+       {
+               ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector64x128());
+       }
+
+       // OpenCL detection
+       std::vector<cv::Point> d_v_locations;
+
+       double totalgputick=0;
+       double totalgputick_kernel=0;
+       double t1=0;
+       double t2=0;
+
+       // Iteration 0 runs but is excluded from the totals
+       // (presumably a warm-up pass -- confirm).
+       for(int j = 0; j < LOOP_TIMES+1; j ++)
+       {
+
+               t1 = (double)cvGetTickCount();//gpu start1
+
+               cv::ocl::oclMat d_img=cv::ocl::oclMat(img);//upload
+
+               t2=(double)cvGetTickCount();//kernel
+               ocl_hog.detect(d_img, d_v_locations, 0);
+               t2 = (double)cvGetTickCount() - t2;//kernel
+
+               // There is no separate download step in this test: the results
+               // arrive in a host-side std::vector.
+               t1 = (double)cvGetTickCount() - t1;//gpu end1
+               if(j == 0)
+                       continue;
+               totalgputick=t1+totalgputick;
+               totalgputick_kernel=t2+totalgputick_kernel;
+
+       }
+
+       cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+
+}
+
+
+// Run the HOG tests for every (window size, image type) combination below.
+INSTANTIATE_TEST_CASE_P(
+       OCL_ObjDetect, HOG,
+       testing::Combine(
+               testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
+               testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
+
+
+#endif //HAVE_OPENCL
diff --git a/modules/ocl/perf/test_match_template.cpp b/modules/ocl/perf/test_match_template.cpp
new file mode 100644 (file)
index 0000000..1e6b0f7
--- /dev/null
@@ -0,0 +1,232 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+// Local test helpers, guarded so they are only defined once per translation unit.
+#ifndef MWC_TEST_UTILITY
+#define MWC_TEST_UTILITY
+//////// Utility
+// Drop any earlier DIFFERENT_SIZES definition so the one below takes effect.
+#ifndef DIFFERENT_SIZES
+#else
+#undef DIFFERENT_SIZES
+#endif
+#define DIFFERENT_SIZES testing::Values(cv::Size(256, 256), cv::Size(3000, 3000))
+
+// Param class
+// IMPLEMENT_PARAM_CLASS(name, type) declares a wrapper class `name` that holds
+// one `type` value and converts back to it implicitly, plus a PrintTo overload
+// that writes the value as "name(value)".
+// NOTE(review): Channels is only declared here when IMPLEMENT_PARAM_CLASS was
+// not already defined; otherwise it presumably comes from wherever the macro
+// was defined -- confirm.
+#ifndef IMPLEMENT_PARAM_CLASS
+#define IMPLEMENT_PARAM_CLASS(name, type) \
+class name \
+{ \
+public: \
+       name ( type arg = type ()) : val_(arg) {} \
+       operator type () const {return val_;} \
+private: \
+       type val_; \
+}; \
+       inline void PrintTo( name param, std::ostream* os) \
+{ \
+       *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
+}
+
+IMPLEMENT_PARAM_CLASS(Channels, int)
+#endif // IMPLEMENT_PARAM_CLASS
+#endif // MWC_TEST_UTILITY
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate
+// All six template-matching methods, each wrapped as a TemplateMethod parameter.
+#define ALL_TEMPLATE_METHODS testing::Values( \
+       TemplateMethod(cv::TM_SQDIFF), \
+       TemplateMethod(cv::TM_CCORR), \
+       TemplateMethod(cv::TM_CCOEFF), \
+       TemplateMethod(cv::TM_SQDIFF_NORMED), \
+       TemplateMethod(cv::TM_CCORR_NORMED), \
+       TemplateMethod(cv::TM_CCOEFF_NORMED))
+
+IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
+
+// Printable names, indexed by the integer method value in the tests below.
+// NOTE(review): assumes the cv::TM_* constants are numbered 0..5 in exactly
+// this order -- confirm against the OpenCV headers.
+const char* TEMPLATE_METHOD_NAMES[6] = {
+       "TM_SQDIFF",
+       "TM_SQDIFF_NORMED",
+       "TM_CCORR",
+       "TM_CCORR_NORMED",
+       "TM_CCOEFF",
+       "TM_CCOEFF_NORMED"
+};
+
+// Shared fixture for the matchTemplate benchmarks, parameterised on
+// (image size, template size, channel count, matching method).
+PARAM_TEST_CASE(MatchTemplate, cv::Size, TemplateSize, Channels, TemplateMethod)
+{
+       cv::Size size;          // parameter 0
+       cv::Size templ_size;    // parameter 1
+       int      cn;            // parameter 2
+       int      method;        // parameter 3
+       //vector<cv::ocl::Info> oclinfo;
+
+       // Unpacks the four test parameters; the device query is commented out.
+       virtual void SetUp()
+       {
+               size       = GET_PARAM(0);
+               templ_size = GET_PARAM(1);
+               cn         = GET_PARAM(2);
+               method     = GET_PARAM(3);
+               //int devnums = getDevice(oclinfo);
+               //CV_Assert(devnums > 0);
+       }
+};
+// 8-bit variant of the matchTemplate benchmark; reuses the shared fixture as is.
+struct MatchTemplate8U : MatchTemplate {};
+
+TEST_P(MatchTemplate8U, Performance)
+{
+       // Describe the configuration being measured.
+       std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
+       std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
+       std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
+       std::cout << "Channels: " << cn << std::endl;
+
+       const int matType = CV_MAKETYPE(CV_8U, cn);
+       cv::Mat image = randomMat(size, matType);
+       cv::Mat templ = randomMat(templ_size, matType);
+       cv::Mat dst_gold;       // not referenced anywhere in this benchmark
+       cv::ocl::oclMat dst;
+
+       double ticksWithTransfer = 0;
+       double ticksKernelOnly   = 0;
+
+       // Iteration 0 runs but is excluded from the totals
+       // (presumably a warm-up pass -- confirm).
+       for (int iter = 0; iter <= LOOP_TIMES; ++iter)
+       {
+               const double wholeStart = (double)cvGetTickCount();
+
+               // upload image and template
+               cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);
+               cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);
+
+               const double kernelStart = (double)cvGetTickCount();
+               cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
+               const double kernelTicks = (double)cvGetTickCount() - kernelStart;
+
+               cv::Mat hostDst;
+               dst.download(hostDst); // download
+
+               const double wholeTicks = (double)cvGetTickCount() - wholeStart;
+
+               if (iter != 0)
+               {
+                       ticksWithTransfer += wholeTicks;
+                       ticksKernelOnly   += kernelTicks;
+               }
+       }
+
+       cout << "average gpu runtime is  "
+            << ticksWithTransfer / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  "
+            << ticksKernelOnly / ((double)cvGetTickFrequency() * LOOP_TIMES * 1000.)
+            << "ms" << endl;
+}
+
+
+// 32-bit float variant of the matchTemplate benchmark; reuses the shared fixture.
+struct MatchTemplate32F : MatchTemplate {};
+TEST_P(MatchTemplate32F, Performance)
+{
+       // Describe the configuration being measured.
+       std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
+       std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
+       std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
+       std::cout << "Channels: " << cn << std::endl;
+       cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
+       cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
+
+       cv::ocl::oclMat dst;
+
+       double totalgputick=0;
+       double totalgputick_kernel=0;
+
+       double t1=0;
+       double t2=0;
+       // Run LOOP_TIMES+1 iterations and discard iteration 0, exactly as the 8U
+       // benchmark does, so that both averages cover LOOP_TIMES measured runs
+       // and the first pass is not folded into the result.
+       for(int j = 0; j < LOOP_TIMES+1; j ++)
+       {
+
+               t1 = (double)cvGetTickCount();//gpu start1
+
+               cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
+               cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
+
+               t2=(double)cvGetTickCount();//kernel
+               cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
+               t2 = (double)cvGetTickCount() - t2;//kernel
+
+               cv::Mat cpu_dst;
+               dst.download (cpu_dst);//download
+
+               t1 = (double)cvGetTickCount() - t1;//gpu end1
+
+               if(j == 0)
+                       continue;
+
+               totalgputick=t1+totalgputick;
+               totalgputick_kernel=t2+totalgputick_kernel;
+
+       }
+
+       cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+
+}
+
+
+// 8-bit benchmark: three image sizes x two template sizes x two channel counts
+// x all six matching methods. The 30x30 template and 3-channel cases stay disabled.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, testing::Combine(
+       testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
+       testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+       testing::Values(Channels(1), Channels(4)/*, Channels(3)*/),
+       ALL_TEMPLATE_METHODS));
+
+// 32-bit float benchmark: same sizes and channel counts as the 8-bit case, but
+// only the TM_SQDIFF and TM_CCORR methods.
+INSTANTIATE_TEST_CASE_P(
+       GPU_ImgProc, MatchTemplate32F,
+       testing::Combine(
+               testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
+               testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+               testing::Values(Channels(1), Channels(4) /*, Channels(3)*/),
+               testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
+
+#endif //HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/ocl/perf/test_pyrdown.cpp b/modules/ocl/perf/test_pyrdown.cpp
new file mode 100644 (file)
index 0000000..5d92a21
--- /dev/null
@@ -0,0 +1,137 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    fangfang bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+
+#ifdef HAVE_OPENCL
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+PARAM_TEST_CASE(PyrDown, MatType, int) // fixture parameterised by (MatType, int)
+{
+       int type; // set from test parameter 0 in SetUp()
+       int channels; // set from test parameter 1 in SetUp()
+       // host-side matrices
+       cv::Mat mat1;
+       cv::Mat dst;
+       
+       //std::vector<cv::ocl::Info> oclinfo;
+       // device-side (cv::ocl) matrices
+       
+       cv::ocl::oclMat gmat1;
+       cv::ocl::oclMat gdst;
+       
+       // Copies the two test parameters into the members above; the device query below is disabled.
+       virtual void SetUp()
+       {
+               type = GET_PARAM(0);
+               channels = GET_PARAM(1);
+               //int devnums = getDevice(oclinfo);
+               //CV_Assert(devnums > 0);
+       }
+       
+       
+};
+
+#define VARNAME(A) string(#A) // stringifies A; no trailing ';' so the macro can be used inside an expression
+
+////////////////////////////////PyrDown/////////////////////////////////////////////////
+TEST_P(PyrDown, Mat)
+{
+       // Measures the average time of upload + pyrDown + download, and of the pyrDown call alone.
+       cv::Size imageSize(MWIDTH, MHEIGHT);
+       cv::RNG &generator = TS::ptr()->get_rng();
+       mat1 = randomMat(generator, imageSize, CV_MAKETYPE(type, channels), 5, 16, false);
+
+       // Device-side destination, reused by every pass.
+       cv::ocl::oclMat gdst;
+
+       // Tick totals over the counted passes: whole round trip, and pyrDown call only.
+       double roundTripTicks = 0;
+       double kernelTicks = 0;
+
+       // Pass 0 is executed but left out of the totals, so LOOP_TIMES passes are counted.
+       for (int pass = 0; pass < LOOP_TIMES + 1; ++pass)
+       {
+               const double roundTripStart = (double)cvGetTickCount();
+
+               // Build the device-side source from the host matrix.
+               cv::ocl::oclMat gmat1(mat1);
+
+               const double kernelStart = (double)cvGetTickCount();
+               cv::ocl::pyrDown(gmat1, gdst);
+               const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+               // Download the result so the copy back to the host is part of the round trip.
+               cv::Mat cpu_dst;
+               gdst.download(cpu_dst);
+
+               const double roundTripElapsed = (double)cvGetTickCount() - roundTripStart;
+
+               if (pass > 0)
+               {
+                       roundTripTicks += roundTripElapsed;
+                       kernelTicks += kernelElapsed;
+               }
+       }
+
+       // Divisor: tick frequency * counted passes * 1000; results are printed with an "ms" suffix.
+       const double divisor = (double)cvGetTickFrequency()* LOOP_TIMES * 1000.;
+
+       cout << "average gpu runtime is  " << roundTripTicks / divisor << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  " << kernelTicks / divisor << "ms" << endl;
+}
+
+// Run the PyrDown fixture for CV_8U and CV_32F, each combined with 1 and 4 as the int parameter.
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
+                            Values(CV_8U, CV_32F), Values(1, 4)));
+
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/test_pyrup.cpp b/modules/ocl/perf/test_pyrup.cpp
new file mode 100644 (file)
index 0000000..5cefba7
--- /dev/null
@@ -0,0 +1,122 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    fangfang bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/core/core.hpp"
+#include "precomp.hpp"
+#include <iomanip>
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+
+PARAM_TEST_CASE(PyrUp, MatType, int) // fixture parameterised by (MatType, int)
+{
+       int type; // set from test parameter 0 in SetUp()
+       int channels; // set from test parameter 1 in SetUp()
+       //std::vector<cv::ocl::Info> oclinfo;
+       // Copies the two test parameters into the members above; the device query below is disabled.
+       virtual void SetUp()
+       {
+               type = GET_PARAM(0);
+               channels = GET_PARAM(1);
+               //int devnums = getDevice(oclinfo);
+               //CV_Assert(devnums > 0);
+       }
+};
+
+TEST_P(PyrUp, Performance)
+{
+       // Reports the average time of upload + pyrUp + download, and of the pyrUp call alone.
+       cv::Size size(MWIDTH, MHEIGHT);
+       cv::Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+       cv::ocl::oclMat dst;
+
+       // Tick totals over the counted iterations.
+       double totalgputick = 0;
+       double totalgputick_kernel = 0;
+
+       double t1 = 0; // whole iteration, transfers included
+       double t2 = 0; // pyrUp call only
+
+       for (int j = 0; j < LOOP_TIMES + 1; j ++)
+       {
+               t1 = (double)cvGetTickCount();//gpu start1
+
+               cv::ocl::oclMat srcMat = cv::ocl::oclMat(src);//upload
+
+               t2 = (double)cvGetTickCount(); //kernel
+               cv::ocl::pyrUp(srcMat, dst);
+               t2 = (double)cvGetTickCount() - t2;//kernel
+
+               cv::Mat cpu_dst;
+               dst.download(cpu_dst); //download
+
+               t1 = (double)cvGetTickCount() - t1;//gpu end1
+
+               // Iteration 0 runs but is not counted, which is why the loop makes LOOP_TIMES + 1 passes.
+               if (j == 0)
+               {
+                       continue;
+               }
+
+               totalgputick = t1 + totalgputick;
+
+               totalgputick_kernel = t2 + totalgputick_kernel;
+
+       }
+
+       // Divisor: tick frequency * counted iterations * 1000; results are printed with an "ms" suffix.
+       cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+       cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, Combine( // CV_8U and CV_32F, each combined with 1 and 4 as the int parameter
+                            Values(CV_8U, CV_32F), Values(1, 4)));
+
+#endif // HAVE_OPENCL
\ No newline at end of file
index f41d953..5881a4c 100644 (file)
@@ -59,11 +59,11 @@ void cv::ocl::Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& e
 
 namespace cv
 {
-       namespace ocl
-       {
-               ///////////////////////////OpenCL kernel strings///////////////////////////
-               extern const char *imgproc_canny;
-       }
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char *imgproc_canny; // its address is what the launchers in this file pass to openCLExecuteKernel
+    }
 }
 
 cv::ocl::CannyBuf::CannyBuf(const oclMat& dx_, const oclMat& dy_) : dx(dx_), dy(dy_)
@@ -75,32 +75,35 @@ cv::ocl::CannyBuf::CannyBuf(const oclMat& dx_, const oclMat& dy_) : dx(dx_), dy(
 
 void cv::ocl::CannyBuf::create(const Size& image_size, int apperture_size)
 {
-       dx.create(image_size, CV_32SC1);
-       dy.create(image_size, CV_32SC1);
-
-       if(apperture_size == 3)
-       {
-               dx_buf.create(image_size, CV_32SC1);
-               dy_buf.create(image_size, CV_32SC1);
-       }
-       else if(apperture_size > 0)
+    dx.create(image_size, CV_32SC1);
+    dy.create(image_size, CV_32SC1);
+
+    if(apperture_size == 3)
+    {
+        dx_buf.create(image_size, CV_32SC1);
+        dy_buf.create(image_size, CV_32SC1);
+    }
+    else if(apperture_size > 0)
     {
-               Mat kx, ky;
+        Mat kx, ky;
         if (!filterDX)
-               {
-                       filterDX = createDerivFilter_GPU(CV_32F, CV_32F, 1, 0, apperture_size, BORDER_REPLICATE);
-               }
+        {
+            filterDX = createDerivFilter_GPU(CV_8U, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
+        }
         if (!filterDY)
-               {
-            filterDY = createDerivFilter_GPU(CV_32F, CV_32F, 0, 1, apperture_size, BORDER_REPLICATE);
-               }
+        {
+            filterDY = createDerivFilter_GPU(CV_8U, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+        }
     }
-       edgeBuf.create(image_size.height + 2, image_size.width + 2, CV_32FC1);
-       
-       trackBuf1.create(1, image_size.width * image_size.height, CV_16UC2);
-       trackBuf2.create(1, image_size.width * image_size.height, CV_16UC2);
+    edgeBuf.create(image_size.height + 2, image_size.width + 2, CV_32FC1);
 
-       counter.create(1,1, CV_32SC1);
+    trackBuf1.create(1, image_size.width * image_size.height, CV_16UC2);
+    trackBuf2.create(1, image_size.width * image_size.height, CV_16UC2);
+
+    float counter_f [1] = { 0 };
+    int err = 0;
+    counter = clCreateBuffer( Context::getContext()->impl->clContext, CL_MEM_COPY_HOST_PTR, sizeof(float), counter_f, &err );
+    openCLSafeCall(err);
 }
 
 void cv::ocl::CannyBuf::release()
@@ -112,12 +115,12 @@ void cv::ocl::CannyBuf::release()
     edgeBuf.release();
     trackBuf1.release();
     trackBuf2.release();
-       counter.release();
+    openCLFree(counter);
 }
 
 namespace cv { namespace ocl {
-       namespace canny
-       {
+    namespace canny
+    {
         void calcSobelRowPass_gpu(const oclMat& src, oclMat& dx_buf, oclMat& dy_buf, int rows, int cols);
 
         void calcMagnitude_gpu(const oclMat& dx_buf, const oclMat& dy_buf, oclMat& dx, oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad);
@@ -125,12 +128,12 @@ namespace cv { namespace ocl {
 
         void calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int rows, int cols, float low_thresh, float high_thresh);
 
-        void edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, oclMat& counter, int rows, int cols);
+        void edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, void * counter, int rows, int cols);
 
-        void edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, oclMat& counter, int rows, int cols);
+        void edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, void * counter, int rows, int cols);
 
         void getEdges_gpu(oclMat& map, oclMat& dst, int rows, int cols);
-       }
+    }
 }}// cv::ocl
 
 namespace
@@ -164,11 +167,10 @@ void cv::ocl::Canny(const oclMat& src, CannyBuf& buf, oclMat& dst, double low_th
         std::swap( low_thresh, high_thresh );
 
     dst.create(src.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
+    //dst.setTo(Scalar::all(0));
 
     buf.create(src.size(), apperture_size);
-    buf.edgeBuf.setTo(Scalar::all(0));
-       buf.counter.setTo(Scalar::all(0));
+    //buf.edgeBuf.setTo(Scalar::all(0));
 
     if (apperture_size == 3)
     {
@@ -178,17 +180,8 @@ void cv::ocl::Canny(const oclMat& src, CannyBuf& buf, oclMat& dst, double low_th
     }
     else
     {
-               // FIXME:
-               // current ocl implementation requires the src and dst having same type
-               // convertTo is time consuming so this may be optimized later.
-               oclMat src_omat32f = src;
-               src.convertTo(src_omat32f, CV_32F); // FIXME
-
-        buf.filterDX->apply(src_omat32f, buf.dx);
-        buf.filterDY->apply(src_omat32f, buf.dy);
-
-               buf.dx.convertTo(buf.dx, CV_32S); // FIXME
-               buf.dy.convertTo(buf.dy, CV_32S); // FIXME
+        buf.filterDX->apply(src, buf.dx);
+        buf.filterDY->apply(src, buf.dy);
 
         calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
     }
@@ -210,12 +203,11 @@ void cv::ocl::Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& d
         std::swap( low_thresh, high_thresh);
 
     dst.create(dx.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
+    //dst.setTo(Scalar::all(0));
 
     buf.dx = dx; buf.dy = dy;
     buf.create(dx.size(), -1);
-    buf.edgeBuf.setTo(Scalar::all(0));
-       buf.counter.setTo(Scalar::all(0));
+    //buf.edgeBuf.setTo(Scalar::all(0));
     calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
 
     CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
@@ -223,197 +215,197 @@ void cv::ocl::Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& d
 
 void canny::calcSobelRowPass_gpu(const oclMat& src, oclMat& dx_buf, oclMat& dy_buf, int rows, int cols)
 {
-       Context *clCxt = src.clCxt;
-       string kernelName = "calcSobelRowPass";
-       vector< pair<size_t, const void *> > args;
-
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.offset));
-
-       size_t globalThreads[3] = {cols, rows, 1};
-       size_t localThreads[3]  = {16, 16, 1};
-       openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    Context *clCxt = src.clCxt;
+    string kernelName = "calcSobelRowPass";
+    vector< pair<size_t, const void *> > args;
+    // Each entry is (byte size, pointer to value): the three buffers, then rows/cols, then step/offset of src, dx_buf and dy_buf.
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.offset));
+    // Global range is cols x rows with a 16x16 local size.
+    size_t globalThreads[3] = {cols, rows, 1};
+    size_t localThreads[3]  = {16, 16, 1};
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
 void canny::calcMagnitude_gpu(const oclMat& dx_buf, const oclMat& dy_buf, oclMat& dx, oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad)
 {
-       Context *clCxt = dx_buf.clCxt;
-       string kernelName = "calcMagnitude_buf";
-       vector< pair<size_t, const void *> > args;
-
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
-
-       size_t globalThreads[3] = {cols, rows, 1};
-       size_t localThreads[3]  = {16, 16, 1};
-
-       char build_options [15] = "";
-       if(L2Grad)
-       {
-               strcat(build_options, "-D L2GRAD");
-       }
-       openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+    Context *clCxt = dx_buf.clCxt;
+    string kernelName = "calcMagnitude_buf";
+    vector< pair<size_t, const void *> > args;
+    // Each entry is (byte size, pointer to value): five buffers, then rows/cols, then step/offset of dx_buf, dy_buf, dx, dy and mag.
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
+    // Global range is cols x rows with a 16x16 local size.
+    size_t globalThreads[3] = {cols, rows, 1};
+    size_t localThreads[3]  = {16, 16, 1};
+    // "-D L2GRAD" is added to the build options only when L2Grad is set; the 15-byte buffer holds that one option.
+    char build_options [15] = "";
+    if(L2Grad)
+    {
+        strcat(build_options, "-D L2GRAD");
+    }
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
 }
 void canny::calcMagnitude_gpu(const oclMat& dx, const oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad)
 {
-       Context *clCxt = dx.clCxt;
-       string kernelName = "calcMagnitude";
-       vector< pair<size_t, const void *> > args;
-
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
-
-       size_t globalThreads[3] = {cols, rows, 1};
-       size_t localThreads[3]  = {16, 16, 1};
-
-       char build_options [15] = "";
-       if(L2Grad)
-       {
-               strcat(build_options, "-D L2GRAD");
-       }
-       openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+    Context *clCxt = dx.clCxt;
+    string kernelName = "calcMagnitude";
+    vector< pair<size_t, const void *> > args;
+    // Each entry is (byte size, pointer to value): the three buffers, then rows/cols, then step/offset of dx, dy and mag.
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
+    // Global range is cols x rows with a 16x16 local size.
+    size_t globalThreads[3] = {cols, rows, 1};
+    size_t localThreads[3]  = {16, 16, 1};
+    // "-D L2GRAD" is added to the build options only when L2Grad is set; the 15-byte buffer holds that one option.
+    char build_options [15] = "";
+    if(L2Grad)
+    {
+        strcat(build_options, "-D L2GRAD");
+    }
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
 }
 
 void canny::calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int rows, int cols, float low_thresh, float high_thresh)
 {
-       Context *clCxt = dx.clCxt;
-       
-       vector< pair<size_t, const void *> > args;
-
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
-       args.push_back( make_pair( sizeof(cl_float), (void *)&low_thresh));
-       args.push_back( make_pair( sizeof(cl_float), (void *)&high_thresh));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+    Context *clCxt = dx.clCxt;
+    // Collects the arguments for the "calcMap" or "calcMap_2" kernel; CALCMAP_FIXED below picks which one is launched.
+    vector< pair<size_t, const void *> > args;
+    // Order: dx, dy, mag, map buffers; rows, cols; low/high thresholds; then step/offset of each of the four matrices.
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+    args.push_back( make_pair( sizeof(cl_float), (void *)&low_thresh));
+    args.push_back( make_pair( sizeof(cl_float), (void *)&high_thresh));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
 
 #if CALCMAP_FIXED
-       size_t globalThreads[3] = {cols, rows, 1};
-       string kernelName = "calcMap";
-       size_t localThreads[3]  = {16, 16, 1};
+    size_t globalThreads[3] = {cols, rows, 1};
+    string kernelName = "calcMap";
+    size_t localThreads[3]  = {16, 16, 1};
 #else
-       size_t globalThreads[3] = {cols, rows, 1};
-       string kernelName = "calcMap_2";
-       size_t localThreads[3]  = {256, 1, 1};
+    size_t globalThreads[3] = {cols, rows, 1};
+    string kernelName = "calcMap_2";
+    size_t localThreads[3]  = {256, 1, 1};
 #endif
-       openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
-void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, oclMat& counter, int rows, int cols)
+void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, void * counter, int rows, int cols)
 {
-       Context *clCxt = map.clCxt;
-       string kernelName = "edgesHysteresisLocal";
-       vector< pair<size_t, const void *> > args;
-
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
-
-       size_t globalThreads[3] = {cols, rows, 1};
-       size_t localThreads[3]  = {16, 16, 1};
-
-       openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    Context *clCxt = map.clCxt;
+    string kernelName = "edgesHysteresisLocal";
+    vector< pair<size_t, const void *> > args;
+    // counter is already a raw handle, so its own address is passed; the oclMat arguments pass the address of their data member.
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&counter));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+    // Global range is cols x rows with a 16x16 local size.
+    size_t globalThreads[3] = {cols, rows, 1};
+    size_t localThreads[3]  = {16, 16, 1};
+    // Launch the "edgesHysteresisLocal" kernel from the imgproc_canny source.
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
-void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, oclMat& counter, int rows, int cols)
+void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, void * counter, int rows, int cols)
 {
-       unsigned int count = Mat(counter).at<unsigned int>(0);
-
-       Context *clCxt = map.clCxt;
-       string kernelName = "edgesHysteresisGlobal";
-       vector< pair<size_t, const void *> > args;
-       size_t localThreads[3]  = {128, 1, 1};
+    unsigned int count = 0; // value read back from the counter buffer; drives both the loop and the launch size
+    openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, CL_TRUE, 0, sizeof(count), &count, 0, NULL, NULL)); // blocking read
+    Context *clCxt = map.clCxt;
+    string kernelName = "edgesHysteresisGlobal";
+    vector< pair<size_t, const void *> > args;
+    size_t localThreads[3]  = {128, 1, 1};
 
 #define DIVUP(a, b) ((a)+(b)-1)/(b)
 
-       while(count > 0)
-       {
-               counter.setTo(0);
-               args.clear();
-               size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&count));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
-
-               openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
-               count = Mat(counter).at<unsigned int>(0);
-               std::swap(st1, st2);
-       }
+    while(count > 0)
+    {
+        // NOTE(review): nothing on the host zeroes the counter before this launch; confirm the kernel resets it.
+        args.clear();
+        size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&counter));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&count));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+        // Launch, then block on reading the counter back to decide whether another pass is needed.
+        openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+        openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, CL_TRUE, 0, sizeof(count), &count, 0, NULL, NULL));
+        std::swap(st1, st2); // st1 and st2 trade places for the next pass
+    }
 #undef DIVUP
 }
 
 void canny::getEdges_gpu(oclMat& map, oclMat& dst, int rows, int cols)
 {
-       Context *clCxt = map.clCxt;
-       string kernelName = "getEdges";
-       vector< pair<size_t, const void *> > args;
-
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
-       args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
-       args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
-
-       size_t globalThreads[3] = {cols, rows, 1};
-       size_t localThreads[3]  = {16, 16, 1};
-
-       openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    Context *clCxt = map.clCxt;
+    string kernelName = "getEdges";
+    vector< pair<size_t, const void *> > args;
+    // Each entry is (byte size, pointer to value): map and dst buffers, then rows/cols, then step/offset of map and dst.
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
+    // Global range is cols x rows with a 16x16 local size.
+    size_t globalThreads[3] = {cols, rows, 1};
+    size_t localThreads[3]  = {16, 16, 1};
+    // Launch the "getEdges" kernel from the imgproc_canny source.
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
 #endif // HAVE_OPENCL
index e789d38..245082a 100644 (file)
@@ -67,7 +67,9 @@ namespace cv
 
 void cv::ocl::columnSum(const oclMat& src,oclMat& dst)
 {
-       CV_Assert(src.type() == CV_32FC1 && dst.type() == CV_32FC1 && src.size() == dst.size());
+       CV_Assert(src.type() == CV_32FC1);
+
+       dst.create(src.size(), src.type());
 
        Context *clCxt = src.clCxt;                                        
                       
index 1f8a925..ea0ba68 100644 (file)
@@ -119,6 +119,8 @@ namespace cv { namespace ocl { namespace device
                                     float angle_scale, cv::ocl::oclMat& grad, cv::ocl::oclMat& qangle, bool correct_gamma);
         void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat& img,
                                     float angle_scale, cv::ocl::oclMat& grad, cv::ocl::oclMat& qangle, bool correct_gamma);
+
+        void resize( const oclMat &src, oclMat &dst, const Size sz);
     }
 }}}
 
@@ -150,6 +152,8 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
 
     cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
     hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
+
+    effect_size = Size(0, 0);
 }
 
 size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
@@ -199,22 +203,37 @@ void cv::ocl::HOGDescriptor::setSVMDetector(const vector<float>& _detector)
     CV_Assert(checkDetectorSize());
 }
 
-void cv::ocl::HOGDescriptor::computeGradient(const oclMat& img, oclMat& grad, oclMat& qangle)
+void cv::ocl::HOGDescriptor::init_buffer(const oclMat& img, Size win_stride) // Allocates grad, qangle, block_hists and labels sized for img; does nothing once image_scale is non-empty.
 {
-    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
+    if (!image_scale.empty()) // NOTE(review): buffers are never re-created after image_scale exists, even if a later img is larger - confirm this is intended
+        return;
 
-    grad.create(img.size(), CV_32FC2);
+    if (effect_size == Size(0, 0)) // only the initial (0, 0) value is replaced; NOTE(review): a later call with a different image size keeps the old effect_size - confirm
+        effect_size = img.size();
 
+    grad.create(img.size(), CV_32FC2);
     qangle.create(img.size(), CV_8UC2);
 
+    const size_t block_hist_size = getBlockHistogramSize();
+    const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
+    block_hists.create(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F); // single row holding block_hist_size floats per block of img
+
+    Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
+    labels.create(1, wins_per_img.area(), CV_8U); // one byte per detection window of img
+}
+
+void cv::ocl::HOGDescriptor::computeGradient(const oclMat& img, oclMat& grad, oclMat& qangle) // Dispatches to the 8UC1 or 8UC4 gradient routine; grad and qangle are no longer allocated here.
+{
+    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
+
     float angleScale = (float)(nbins / CV_PI);
     switch (img.type())
     {
     case CV_8UC1:
-        hog::compute_gradients_8UC1(img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction);
+        hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction); // processed extent is effect_size, not img.size()
         break;
     case CV_8UC4:
-        hog::compute_gradients_8UC4(img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction);
+        hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction); // processed extent is effect_size, not img.size()
         break;
     }
 }
@@ -224,14 +243,11 @@ void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat& img)
 { // Computes gradients, then block histograms, then normalizes them, all over the effect_size extent.
     computeGradient(img, grad, qangle);
 
-    size_t block_hist_size = getBlockHistogramSize();
-    Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
-
-    block_hists.create(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F);
-
-    hog::compute_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols, grad, qangle, (float)getWinSigma(), block_hists);
+    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width, 
+        grad, qangle, (float)getWinSigma(), block_hists); // block_hists is not created here any more; presumably init_buffer ran first - verify against callers
 
-    hog::normalize_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols, block_hists, (float)threshold_L2hys);
+    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width, 
+        block_hists, (float)threshold_L2hys); // normalizes block_hists in place
 }
 
 
@@ -239,11 +255,13 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat& img, Size win_stride,
 {
     CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
 
+    init_buffer(img, win_stride);
+
     computeBlockHistograms(img);
 
     const size_t block_hist_size = getBlockHistogramSize();
     Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
-    Size wins_per_img   = numPartsWithin(img.size(), win_size, win_stride);
+    Size wins_per_img   = numPartsWithin(effect_size, win_size, win_stride);
 
     descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
 
@@ -251,11 +269,11 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat& img, Size win_stride,
     {
     case DESCR_FORMAT_ROW_BY_ROW:
         hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, img.rows, img.cols, block_hists, descriptors);
+            win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
         break;
     case DESCR_FORMAT_COL_BY_COL:
         hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, img.rows, img.cols, block_hists, descriptors);
+            win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
         break;
     default:
         CV_Error(CV_StsBadArg, "Unknown descriptor format");
@@ -272,22 +290,21 @@ void cv::ocl::HOGDescriptor::detect(const oclMat& img, vector<Point>& hits, doub
     if (detector.empty())
         return;
 
-    computeBlockHistograms(img);
-
     if (win_stride == Size())
         win_stride = block_stride;
     else
         CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+    init_buffer(img, win_stride);
 
-    Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
-    labels.create(1, wins_per_img.area(), CV_8U);
+    computeBlockHistograms(img);
 
     hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                        win_stride.height, win_stride.width, img.rows, img.cols, block_hists,
+                        win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists,
                         detector, (float)free_coef, (float)hit_threshold, labels);
 
     labels.download(labels_host);
     unsigned char* vec = labels_host.ptr();
+    Size wins_per_img = numPartsWithin(effect_size, win_size, win_stride);
     for (int i = 0; i < wins_per_img.area(); i++)
     {
         int y = i / wins_per_img.width;
@@ -303,6 +320,7 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat& img, vector<Rect>& f
                                               Size win_stride, Size padding, double scale0, int group_threshold)
 {
     CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
+    CV_Assert(scale0 > 1);
 
     vector<double> level_scale;
     double scale = 1.;
@@ -318,27 +336,30 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat& img, vector<Rect>& f
     }
     levels = std::max(levels, 1);
     level_scale.resize(levels);
-    image_scales.resize(levels);
 
     std::vector<Rect> all_candidates;
     vector<Point> locations;
 
+    if (win_stride == Size())
+        win_stride = block_stride;
+    else
+        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+    init_buffer(img, win_stride);
+    image_scale.create(img.size(), img.type());
+
     for (size_t i = 0; i < level_scale.size(); i++)
     {
         scale = level_scale[i];
-        Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
-        oclMat smaller_img;
-
-        if (sz == img.size())
-            smaller_img = img;
+        effect_size = Size(cvRound(img.cols / scale), cvRound(img.rows / scale));
+        if (effect_size == img.size())
+        {
+            detect(img, locations, hit_threshold, win_stride, padding);
+        }
         else
         {
-            image_scales[i].create(sz, img.type());
-            resize(img, image_scales[i], image_scales[i].size(), 0, 0, INTER_LINEAR);
-            smaller_img = image_scales[i];
+            hog::resize( img, image_scale, effect_size);
+            detect(image_scale, locations, hit_threshold, win_stride, padding);
         }
-
-        detect(smaller_img, locations, hit_threshold, win_stride, padding);
         Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
         for (size_t j = 0; j < locations.size(); j++)
             all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, scaled_win_size));
@@ -1784,4 +1805,36 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
     openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
+void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
+{
+    CV_Assert( (src.channels() == dst.channels()) );
+    Context *clCxt = Context::getContext();
+
+    string kernelName = (src.type() == CV_8UC1) ? "resize_8UC1_kernel" : "resize_8UC4_kernel";
+    size_t blkSizeX = 16, blkSizeY = 16;
+    size_t glbSizeX = sz.width % blkSizeX == 0 ? sz.width : (sz.width / blkSizeX + 1) * blkSizeX;
+    size_t glbSizeY = sz.height % blkSizeY == 0 ? sz.height : (sz.height / blkSizeY + 1) * blkSizeY;
+    size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
+    size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
+
+    float ifx = (float)src.cols / sz.width;
+    float ify = (float)src.rows / sz.height;
+
+    vector< pair<size_t, const void *> > args;
+    args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
+    args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
+    args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
+    args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
+    args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
+    args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
+    args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
+    args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
+    args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width));
+    args.push_back( make_pair(sizeof(cl_int), (void *)&sz.height));
+    args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
+    args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
+
+    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
 #endif
index bf73357..6b47811 100644 (file)
@@ -67,32 +67,6 @@ __kernel void BlendLinear_C1_D0(
        }
 }
 
-__kernel void BlendLinear_C3_D0(
-       __global uchar *dst,
-       __global uchar *img1,
-       __global uchar *img2,
-       __global float *weight1,
-       __global float *weight2,
-       int rows,
-       int cols,
-       int istep,
-       int wstep
-       )
-{
-       int idx = get_global_id(0);
-       int idy = get_global_id(1);
-       int x = idx / 3;
-       int y = idy;
-       if (x < cols && y < rows)
-       {
-               int pos = idy * istep + idx;
-               int wpos = idy * (wstep /sizeof(float)) + x;
-               float w1 = weight1[wpos];
-               float w2 = weight2[wpos];
-               dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
-       }
-}
-
 __kernel void BlendLinear_C4_D0(
        __global uchar *dst,
        __global uchar *img1,
@@ -143,32 +117,6 @@ __kernel void BlendLinear_C1_D5(
        }
 }
 
-__kernel void BlendLinear_C3_D5(
-       __global float *dst,
-       __global float *img1,
-       __global float *img2,
-       __global float *weight1,
-       __global float *weight2,
-       int rows,
-       int cols,
-       int istep,
-       int wstep
-       )
-{
-       int idx = get_global_id(0);
-       int idy = get_global_id(1);
-       int x = idx / 3;
-       int y = idy;
-       if (x < cols && y < rows)
-       {
-               int pos = idy * (istep / sizeof(float)) + idx;
-               int wpos = idy * (wstep /sizeof(float)) + x;
-               float w1 = weight1[wpos];
-               float w2 = weight2[wpos];
-               dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
-       }
-}
-
 __kernel void BlendLinear_C4_D5(
        __global float *dst,
        __global float *img1,
@@ -194,3 +142,4 @@ __kernel void BlendLinear_C4_D5(
                dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
        }
 }
+
index da64d15..e32e928 100644 (file)
 #ifdef L2GRAD
 inline float calc(int x, int y)
 {
-       return sqrt((float)(x * x + y * y));
+    return sqrt((float)(x * x + y * y)); // L2GRAD build: Euclidean magnitude sqrt(x^2 + y^2)
 }
 #else
 inline float calc(int x, int y)
 {
-       return (float)abs(x) + abs(y);
+    return (float)abs(x) + abs(y); // default build: sum of absolute values |x| + |y|
 }
 #endif // L2GRAD
 
@@ -70,53 +70,53 @@ inline float calc(int x, int y)
 // dx_buf      output dx buffer
 // dy_buf      output dy buffer
 __kernel 
-void calcSobelRowPass
-(
-       __global const uchar * src,
-       __global int * dx_buf,
-       __global int * dy_buf,
-       int rows,
-       int cols,
-       int src_step,
-       int src_offset,
-       int dx_buf_step,
-       int dx_buf_offset,
-       int dy_buf_step,
-       int dy_buf_offset
-)
+    void calcSobelRowPass
+    (
+    __global const uchar * src,
+    __global int * dx_buf,
+    __global int * dy_buf,
+    int rows,
+    int cols,
+    int src_step,
+    int src_offset,
+    int dx_buf_step,
+    int dx_buf_offset,
+    int dy_buf_step,
+    int dy_buf_offset
+    )
 {
-       //src_step   /= sizeof(*src);
-       //src_offset /= sizeof(*src);
-       dx_buf_step   /= sizeof(*dx_buf);
-       dx_buf_offset /= sizeof(*dx_buf);
-       dy_buf_step   /= sizeof(*dy_buf);
-       dy_buf_offset /= sizeof(*dy_buf);
-
-       int gidx = get_global_id(0);
-       int gidy = get_global_id(1);
-
-       int lidx = get_local_id(0);
-       int lidy = get_local_id(1);
-
-       __local int smem[16][18];
-
-       if(gidy < rows)
-       {
-               smem[lidy][lidx + 1] = src[gidx + gidy * src_step + src_offset]; 
-               if(lidx == 0)
-               {
-                       smem[lidy][0]  = src[max(gidx - 1,  0)        + gidy * src_step + src_offset];
-                       smem[lidy][17] = src[min(gidx + 16, cols - 1) + gidy * src_step + src_offset]; 
-               }
-               barrier(CLK_LOCAL_MEM_FENCE);
-               if(gidx < cols)
-               {
-                       dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
-                               -smem[lidy][lidx] + smem[lidy][lidx + 2];
-                       dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
-                                smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
-               }
-       }
+    //src_step   /= sizeof(*src);
+    //src_offset /= sizeof(*src);
+    dx_buf_step   /= sizeof(*dx_buf);
+    dx_buf_offset /= sizeof(*dx_buf);
+    dy_buf_step   /= sizeof(*dy_buf);
+    dy_buf_offset /= sizeof(*dy_buf);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    __local int smem[16][18];
+
+    if(gidy < rows)
+    {
+        smem[lidy][lidx + 1] = src[gidx + gidy * src_step + src_offset]; 
+        if(lidx == 0)
+        {
+            smem[lidy][0]  = src[max(gidx - 1,  0)        + gidy * src_step + src_offset];
+            smem[lidy][17] = src[min(gidx + 16, cols - 1) + gidy * src_step + src_offset]; 
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(gidx < cols)
+        {
+            dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
+                -smem[lidy][lidx] + smem[lidy][lidx + 2];
+            dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
+                smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
+        }
+    }
 }
 
 // calculate the magnitude of the filter pass combining both x and y directions
@@ -128,72 +128,72 @@ void calcSobelRowPass
 // dy                  derivative in y direction output
 // mag                 magnitude derivative of xy output
 __kernel
-void calcMagnitude_buf
-(
-       __global const int * dx_buf,
-       __global const int * dy_buf,
-       __global int * dx,
-       __global int * dy,
-       __global float * mag,
-       int rows,
-       int cols,
-       int dx_buf_step,
-       int dx_buf_offset,
-       int dy_buf_step,
-       int dy_buf_offset,
-       int dx_step,
-       int dx_offset,
-       int dy_step,
-       int dy_offset,
-       int mag_step,
-       int mag_offset
-)
+    void calcMagnitude_buf
+    (
+    __global const int * dx_buf,
+    __global const int * dy_buf,
+    __global int * dx,
+    __global int * dy,
+    __global float * mag,
+    int rows,
+    int cols,
+    int dx_buf_step,
+    int dx_buf_offset,
+    int dy_buf_step,
+    int dy_buf_offset,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset
+    )
 {
-       dx_buf_step    /= sizeof(*dx_buf);
-       dx_buf_offset  /= sizeof(*dx_buf);
-       dy_buf_step    /= sizeof(*dy_buf);
-       dy_buf_offset  /= sizeof(*dy_buf);
-       dx_step    /= sizeof(*dx);
-       dx_offset  /= sizeof(*dx);
-       dy_step    /= sizeof(*dy);
-       dy_offset  /= sizeof(*dy);
-       mag_step   /= sizeof(*mag);
-       mag_offset /= sizeof(*mag);
-
-       int gidx = get_global_id(0);
-       int gidy = get_global_id(1);
-
-       int lidx = get_local_id(0);
-       int lidy = get_local_id(1);
-
-       __local int sdx[18][16];
-       __local int sdy[18][16];
-
-       if(gidx < cols)
-       {
-               sdx[lidy + 1][lidx] = dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset];
-               sdy[lidy + 1][lidx] = dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset];
-               if(lidy == 0)
-               {
-                       sdx[0][lidx]  = dx_buf[gidx + max(gidy - 1,  0)        * dx_buf_step + dx_buf_offset];
-                       sdx[17][lidx] = dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset];
-
-                       sdy[0][lidx]  = dy_buf[gidx + max(gidy - 1,  0)        * dy_buf_step + dy_buf_offset];
-                       sdy[17][lidx] = dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset];
-               }
-               barrier(CLK_LOCAL_MEM_FENCE);
-
-               if(gidy < rows)
-               {
-                       int x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
+    dx_buf_step    /= sizeof(*dx_buf);
+    dx_buf_offset  /= sizeof(*dx_buf);
+    dy_buf_step    /= sizeof(*dy_buf);
+    dy_buf_offset  /= sizeof(*dy_buf);
+    dx_step    /= sizeof(*dx);
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    __local int sdx[18][16];
+    __local int sdy[18][16];
+
+    if(gidx < cols)
+    {
+        sdx[lidy + 1][lidx] = dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset];
+        sdy[lidy + 1][lidx] = dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset];
+        if(lidy == 0)
+        {
+            sdx[0][lidx]  = dx_buf[gidx + max(gidy - 1,  0)        * dx_buf_step + dx_buf_offset];
+            sdx[17][lidx] = dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset];
+
+            sdy[0][lidx]  = dy_buf[gidx + max(gidy - 1,  0)        * dy_buf_step + dy_buf_offset];
+            sdy[17][lidx] = dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if(gidy < rows)
+        {
+            int x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
             int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
-                       
-                       dx[gidx + gidy * dx_step + dx_offset] = x;
-                       dy[gidx + gidy * dy_step + dy_offset] = y;
 
-                       mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
-               }
-       }
+            dx[gidx + gidy * dx_step + dx_offset] = x;
+            dy[gidx + gidy * dy_step + dy_offset] = y;
+
+            mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
+        }
+    }
 }
 
 // calculate the magnitude of the filter pass combining both x and y directions
@@ -205,39 +205,39 @@ void calcMagnitude_buf
 // dy                  derivative in y direction output
 // mag                 magnitude derivative of xy output
 __kernel
-void calcMagnitude
-(
-       __global const int * dx,
-       __global const int * dy,
-       __global float * mag,
-       int rows,
-       int cols,
-       int dx_step,
-       int dx_offset,
-       int dy_step,
-       int dy_offset,
-       int mag_step,
-       int mag_offset
-)
+    void calcMagnitude
+    (
+    __global const int * dx,
+    __global const int * dy,
+    __global float * mag,
+    int rows,
+    int cols,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset
+    )
 {
-       dx_step    /= sizeof(*dx);
-       dx_offset  /= sizeof(*dx);
-       dy_step    /= sizeof(*dy);
-       dy_offset  /= sizeof(*dy);
-       mag_step   /= sizeof(*mag);
-       mag_offset /= sizeof(*mag);
-
-       int gidx = get_global_id(0);
-       int gidy = get_global_id(1);
-
-       if(gidy < rows && gidx < cols)
-       {
-               mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = 
-                       calc(
-                               dx[gidx + gidy * dx_step + dx_offset], 
-                               dy[gidx + gidy * dy_step + dy_offset]
-                       );
-       }
+    dx_step    /= sizeof(*dx); // steps and offsets are divided by the element size, i.e. they arrive in bytes
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+
+    int gidx = get_global_id(0); // column handled by this work-item
+    int gidy = get_global_id(1); // row handled by this work-item
+
+    if(gidy < rows && gidx < cols) // work-items outside the image do nothing
+    {
+        mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = 
+            calc(
+            dx[gidx + gidy * dx_step + dx_offset], 
+            dy[gidx + gidy * dy_step + dy_offset]
+        ); // result is stored one row and one column in from the origin of mag
+    }
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
@@ -261,284 +261,284 @@ void calcMagnitude
 // mag                 magnitudes calculated from calcMagnitude function
 // map                 output containing raw edge types
 __kernel
-void calcMap
-(
-       __global const int * dx,
-       __global const int * dy, 
-       __global const float * mag,
-       __global int * map,
-       int rows,
-       int cols,
-       float low_thresh,
-       float high_thresh,
-       int dx_step,
-       int dx_offset,
-       int dy_step,
-       int dy_offset,
-       int mag_step,
-       int mag_offset,
-       int map_step,
-       int map_offset
-)
+    void calcMap
+    (
+    __global const int * dx,
+    __global const int * dy, 
+    __global const float * mag,
+    __global int * map,
+    int rows,
+    int cols,
+    float low_thresh,
+    float high_thresh,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset,
+    int map_step,
+    int map_offset
+    )
 {
-       dx_step    /= sizeof(*dx);
-       dx_offset  /= sizeof(*dx);
-       dy_step    /= sizeof(*dy);
-       dy_offset  /= sizeof(*dy);
-       mag_step   /= sizeof(*mag);
-       mag_offset /= sizeof(*mag);
-       map_step   /= sizeof(*map);
-       map_offset /= sizeof(*map);
-
-       __local float smem[18][18];
-
-       int gidx = get_global_id(0);
-       int gidy = get_global_id(1);
-
-       int lidx = get_local_id(0);
-       int lidy = get_local_id(1);
-
-       int grp_idx = get_global_id(0) & 0xFFFFF0;
-       int grp_idy = get_global_id(1) & 0xFFFFF0;
-
-       int tid = lidx + lidy * 16;
-       int lx = tid % 18;
-       int ly = tid / 18;
-       if(ly < 14)
-       {
-               smem[ly][lx] = mag[grp_idx + lx + (grp_idy + ly) * mag_step];
-       }
-       if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
-       {
-               smem[ly + 14][lx] = mag[grp_idx + lx + (grp_idy + ly + 14) * mag_step];
-       }
-
-       barrier(CLK_LOCAL_MEM_FENCE);
-
-       if(gidy < rows && gidx < cols)
-       {
-               int x = dx[gidx + gidy * dx_step];
-               int y = dy[gidx + gidy * dy_step];
-               const int s = (x ^ y) < 0 ? -1 : 1;
-               const float m = smem[lidy + 1][lidx + 1];
-               x = abs(x);
-               y = abs(y);
-
-               // 0 - the pixel can not belong to an edge
-               // 1 - the pixel might belong to an edge
-               // 2 - the pixel does belong to an edge
-               int edge_type = 0;
-               if(m > low_thresh)
-               {
-                       const int tg22x = x * TG22;
-                       const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
-                       y <<= CANNY_SHIFT;
-                       if(y < tg22x)
-                       {
-                               if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-                       else if (y > tg67x)
-                       {
-                               if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-                       else
-                       {
-                               if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-               }
-               map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
-       }
+    dx_step    /= sizeof(*dx); // steps and offsets are divided by the element size, i.e. they arrive in bytes
+    dx_offset  /= sizeof(*dx); // NOTE(review): none of the *_offset values is used in the indexing below - confirm the buffers always start at offset 0
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    __local float smem[18][18]; // 18x18 tile of mag: the 16x16 work-group area plus one element on every side
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    int grp_idx = get_global_id(0) & 0xFFFFF0; // global id with the low 4 bits cleared: first column of this 16-wide group
+    int grp_idy = get_global_id(1) & 0xFFFFF0; // first row of this 16-high group
+
+    int tid = lidx + lidy * 16; // linear index 0..255 within the 16x16 group
+    int lx = tid % 18; // the 256 work-items are re-mapped onto 18-wide tile rows
+    int ly = tid / 18;
+    if(ly < 14) // first pass fills tile rows 0..13 (tid 0..251)
+    {
+        smem[ly][lx] = mag[grp_idx + lx + (grp_idy + ly) * mag_step]; // NOTE(review): this load has no bounds check - confirm mag is padded enough for edge groups
+    }
+    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) // second pass fills tile rows 14..17 (tid 0..71), skipped where the guard fails
+    {
+        smem[ly + 14][lx] = mag[grp_idx + lx + (grp_idy + ly + 14) * mag_step];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // reached by every work-item: it is outside all conditionals
+
+    if(gidy < rows && gidx < cols)
+    {
+        int x = dx[gidx + gidy * dx_step];
+        int y = dy[gidx + gidy * dy_step];
+        const int s = (x ^ y) < 0 ? -1 : 1; // -1 when x and y have opposite signs, otherwise 1
+        const float m = smem[lidy + 1][lidx + 1]; // magnitude at this pixel (tile is shifted by one in both directions)
+        x = abs(x);
+        y = abs(y);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+        if(m > low_thresh)
+        {
+            const int tg22x = x * TG22; // TG22 and CANNY_SHIFT are defined outside this block; presumably fixed-point tan(22.5 deg) - TODO confirm
+            const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+            y <<= CANNY_SHIFT;
+            if(y < tg22x) // compare with the left and right neighbours
+            {
+                if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else if (y > tg67x) // compare with the neighbours above and below
+            {
+                if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else // compare with the two diagonal neighbours selected by s
+            {
+                if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+        }
+        map[gidx + 1 + (gidy + 1) * map_step] = edge_type; // stored one row and one column in from the origin of map
+    }
 }
 
 // non local memory version
 __kernel
-void calcMap_2 
-(
-       __global const int * dx,
-       __global const int * dy, 
-       __global const float * mag,
-       __global int * map,
-       int rows,
-       int cols,
-       float low_thresh,
-       float high_thresh,
-       int dx_step,
-       int dx_offset,
-       int dy_step,
-       int dy_offset,
-       int mag_step,
-       int mag_offset,
-       int map_step,
-       int map_offset
-)
+    void calcMap_2 
+    (
+    __global const int * dx,
+    __global const int * dy, 
+    __global const float * mag,
+    __global int * map,
+    int rows,
+    int cols,
+    float low_thresh,
+    float high_thresh,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset,
+    int map_step,
+    int map_offset
+    )
 {
-       dx_step    /= sizeof(*dx);
-       dx_offset  /= sizeof(*dx);
-       dy_step    /= sizeof(*dy);
-       dy_offset  /= sizeof(*dy);
-       mag_step   /= sizeof(*mag);
-       mag_offset /= sizeof(*mag);
-       map_step   /= sizeof(*map);
-       map_offset /= sizeof(*map);
-
-
-       int gidx = get_global_id(0);
-       int gidy = get_global_id(1);
-
-       if(gidy < rows && gidx < cols)
-       {
-               int x = dx[gidx + gidy * dx_step];
-               int y = dy[gidx + gidy * dy_step];
-               const int s = (x ^ y) < 0 ? -1 : 1;
-               const float m = mag[gidx + 1 + (gidy + 1) * mag_step];
-               x = abs(x);
-               y = abs(y);
-
-               // 0 - the pixel can not belong to an edge
-               // 1 - the pixel might belong to an edge
-               // 2 - the pixel does belong to an edge
-               int edge_type = 0;
-               if(m > low_thresh)
-               {
-                       const int tg22x = x * TG22;
-                       const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
-                       y <<= CANNY_SHIFT;
-                       if(y < tg22x)
-                       {
-                               if(m > mag[gidx + (gidy + 1) * mag_step] && m >= mag[gidx + 2 + (gidy + 1) * mag_step])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-                       else if (y > tg67x)
-                       {
-                               if(m > mag[gidx + 1 + gidy* mag_step] && m >= mag[gidx + 1 + (gidy + 2) * mag_step])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-                       else
-                       {
-                               if(m > mag[gidx + 1 - s + gidy * mag_step] && m > mag[gidx + 1 + s + (gidy + 2) * mag_step])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-               }
-               map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
-       }
+    dx_step    /= sizeof(*dx);
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    if(gidy < rows && gidx < cols)
+    {
+        int x = dx[gidx + gidy * dx_step];
+        int y = dy[gidx + gidy * dy_step];
+        const int s = (x ^ y) < 0 ? -1 : 1;
+        const float m = mag[gidx + 1 + (gidy + 1) * mag_step];
+        x = abs(x);
+        y = abs(y);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+        if(m > low_thresh)
+        {
+            const int tg22x = x * TG22;
+            const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+            y <<= CANNY_SHIFT;
+            if(y < tg22x)
+            {
+                if(m > mag[gidx + (gidy + 1) * mag_step] && m >= mag[gidx + 2 + (gidy + 1) * mag_step])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else if (y > tg67x)
+            {
+                if(m > mag[gidx + 1 + gidy* mag_step] && m >= mag[gidx + 1 + (gidy + 2) * mag_step])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else
+            {
+                if(m > mag[gidx + 1 - s + gidy * mag_step] && m > mag[gidx + 1 + s + (gidy + 2) * mag_step])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+        }
+        map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
+    }
 }
 
 // [256, 1, 1] threaded, local memory version
 __kernel
-void calcMap_3
-(
-       __global const int * dx,
-       __global const int * dy, 
-       __global const float * mag,
-       __global int * map,
-       int rows,
-       int cols,
-       float low_thresh,
-       float high_thresh,
-       int dx_step,
-       int dx_offset,
-       int dy_step,
-       int dy_offset,
-       int mag_step,
-       int mag_offset,
-       int map_step,
-       int map_offset
-)
+    void calcMap_3 // Non-maximum suppression + double threshold: writes an edge type (0/1/2) per pixel into map
+    (
+    __global const int * dx, // x gradient, read at [gidx + gidy * dx_step]
+    __global const int * dy, // y gradient, read at [gidx + gidy * dy_step]
+    __global const float * mag, // gradient magnitude; pixel (x, y) is read at (x + 1, y + 1)
+    __global int * map, // output edge-type map; pixel (x, y) is written at (x + 1, y + 1)
+    int rows, // pixels with gidy >= rows are not classified
+    int cols, // pixels with gidx >= cols are not classified
+    float low_thresh, // magnitudes <= low_thresh always get type 0
+    float high_thresh, // local maxima above high_thresh get type 2, other local maxima type 1
+    int dx_step, // every *_step / *_offset is divided by the element size below, i.e. it arrives in bytes
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset,
+    int map_step,
+    int map_offset
+    )
 {
-       dx_step    /= sizeof(*dx);
-       dx_offset  /= sizeof(*dx);
-       dy_step    /= sizeof(*dy);
-       dy_offset  /= sizeof(*dy);
-       mag_step   /= sizeof(*mag);
-       mag_offset /= sizeof(*mag);
-       map_step   /= sizeof(*map);
-       map_offset /= sizeof(*map);
-
-       __local float smem[18][18];
-
-       int lidx = get_local_id(0) % 16;
-       int lidy = get_local_id(0) / 16;
-       
-       int grp_pix = get_global_id(0); // identifies which pixel is processing currently in the target block
-       int grp_ind = get_global_id(1); // identifies which block of pixels is currently processing
-
-       int grp_idx = (grp_ind % (cols/16)) * 16;
-       int grp_idy = (grp_ind / (cols/16)) * 16; //(grp_ind / (cols/16)) * 16
-
-       int gidx = grp_idx + lidx;
-       int gidy = grp_idy + lidy;
-
-       int tid = get_global_id(0) % 256;
-       int lx = tid % 18;
-       int ly = tid / 18;
-       if(ly < 14)
-       {
-               smem[ly][lx] = mag[grp_idx + lx + (grp_idy + ly) * mag_step];
-       }
-       if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
-       {
-               smem[ly + 14][lx] = mag[grp_idx + lx + (grp_idy + ly + 14) * mag_step];
-       }
-
-       barrier(CLK_LOCAL_MEM_FENCE);
-
-       if(gidy < rows && gidx < cols)
-       {
-               int x = dx[gidx + gidy * dx_step];
-               int y = dy[gidx + gidy * dy_step];
-               const int s = (x ^ y) < 0 ? -1 : 1;
-               const float m = smem[lidy + 1][lidx + 1];
-               x = abs(x);
-               y = abs(y);
-
-               // 0 - the pixel can not belong to an edge
-               // 1 - the pixel might belong to an edge
-               // 2 - the pixel does belong to an edge
-               int edge_type = 0;
-               if(m > low_thresh)
-               {
-                       const int tg22x = x * TG22;
-                       const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
-                       y <<= CANNY_SHIFT;
-                       if(y < tg22x)
-                       {
-                               if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-                       else if (y > tg67x)
-                       {
-                               if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-                       else
-                       {
-                               if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
-                               {
-                                       edge_type = 1 + (int)(m > high_thresh);
-                               }
-                       }
-               }
-               map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
-       }
+    dx_step    /= sizeof(*dx); // bytes -> elements
+    dx_offset  /= sizeof(*dx); // NOTE(review): none of the converted *_offset values is used further down
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    __local float smem[18][18]; // 16x16 tile of mag plus one extra row/column on every side
+
+    int lidx = get_local_id(0) % 16; // the linear local id is unfolded into a 16-wide tile position
+    int lidy = get_local_id(0) / 16;
+
+    int grp_pix = get_global_id(0); // index of the pixel inside the target block; NOTE(review): not used below
+    int grp_ind = get_global_id(1); // linear index of the 16x16 block being processed
+
+    int grp_idx = (grp_ind % (cols/16)) * 16; // x origin of the block; NOTE(review): cols/16 truncates and is 0 for cols < 16 - confirm the host guarantees a suitable cols
+    int grp_idy = (grp_ind / (cols/16)) * 16; // y origin of the block
+
+    int gidx = grp_idx + lidx; // image coordinates of the pixel this work-item classifies
+    int gidy = grp_idy + lidy;
+
+    int tid = get_global_id(0) % 256; // NOTE(review): equals the local id only for a work-group size of 256 - confirm against the host launch
+    int lx = tid % 18; // the same 256 work-items are re-indexed over the 18-wide tile for loading
+    int ly = tid / 18;
+    if(ly < 14) // first pass fills tile rows 0..13
+    {
+        smem[ly][lx] = mag[grp_idx + lx + (grp_idy + ly) * mag_step]; // NOTE(review): unlike the second pass, this load has no rows/cols guard
+    }
+    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) // second pass fills tile rows 14..17 where they are still inside the guarded range
+    {
+        smem[ly + 14][lx] = mag[grp_idx + lx + (grp_idy + ly + 14) * mag_step];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the whole tile must be loaded before any neighbour is read
+
+    if(gidy < rows && gidx < cols)
+    {
+        int x = dx[gidx + gidy * dx_step];
+        int y = dy[gidx + gidy * dy_step];
+        const int s = (x ^ y) < 0 ? -1 : 1; // -1 when the two gradient components have opposite signs
+        const float m = smem[lidy + 1][lidx + 1]; // magnitude of this pixel; +1 skips the extra tile row/column
+        x = abs(x);
+        y = abs(y);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+        if(m > low_thresh)
+        {
+            const int tg22x = x * TG22; // TG22 and CANNY_SHIFT are defined outside this view; presumably tan(22.5 deg) in CANNY_SHIFT-bit fixed point - confirm
+            const int tg67x = tg22x + (x << (1 + CANNY_SHIFT)); // tg22x + 2 * |dx| in the same fixed-point scale
+            y <<= CANNY_SHIFT; // bring |dy| to the scale of the two thresholds
+            if(y < tg22x) // |dy| small relative to |dx|: compare with the left and right neighbours
+            {
+                if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
+                {
+                    edge_type = 1 + (int)(m > high_thresh); // 1 for a candidate, 2 when also above high_thresh
+                }
+            }
+            else if (y > tg67x) // |dy| large relative to |dx|: compare with the upper and lower neighbours
+            {
+                if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else // in between: compare along a diagonal, s picks which one
+            {
+                if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+        }
+        map[gidx + 1 + (gidy + 1) * map_step] = edge_type; // stored one row and one column in from the map origin
+    }
 }
 
 #undef CANNY_SHIFT
@@ -556,96 +556,96 @@ void calcMap_3
 // st          the potiential edge points found in this kernel call
 // counter     the number of potiential edge points
 __kernel
-void edgesHysteresisLocal
-(
-       __global int * map,
-       __global ushort2 * st, 
-       volatile __global unsigned int * counter,
-       int rows,
-       int cols,
-       int map_step,
-       int map_offset
-)
+    void edgesHysteresisLocal // Tile-local hysteresis: promotes candidates (1) that touch a strong edge (2), then lists strong pixels that still touch a candidate
+    (
+    __global int * map, // edge-type map (0/1/2), read into a tile and updated in place
+    __global ushort2 * st, // output list of (x + 1, y + 1) positions
+    volatile __global unsigned int * counter, // incremented atomically once per entry appended to st
+    int rows,
+    int cols,
+    int map_step, // divided by sizeof(int) below, i.e. it arrives in bytes
+    int map_offset // likewise converted from bytes to ints
+    )
 {
-       map_step   /= sizeof(*map);
-       map_offset /= sizeof(*map);
-
-       __local int smem[18][18];
-
-       int gidx = get_global_id(0);
-       int gidy = get_global_id(1);
-
-       int lidx = get_local_id(0);
-       int lidy = get_local_id(1);
-
-       int grp_idx = get_global_id(0) & 0xFFFFF0;
-       int grp_idy = get_global_id(1) & 0xFFFFF0;
-
-       int tid = lidx + lidy * 16;
-       int lx = tid % 18;
-       int ly = tid / 18;
-       if(ly < 14)
-       {
-               smem[ly][lx] = map[grp_idx + lx + (grp_idy + ly) * map_step + map_offset];
-       }
-       if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
-       {
-               smem[ly + 14][lx] = map[grp_idx + lx + (grp_idy + ly + 14) * map_step + map_offset];
-       }
-
-       barrier(CLK_LOCAL_MEM_FENCE);
-
-       if(gidy < rows && gidx < cols)
-       {
-               int n;
-
-               #pragma unroll
-               for (int k = 0; k < 16; ++k)
-               {
-                       n = 0;
-
-                       if (smem[lidy + 1][lidx + 1] == 1)
-                       {
-                               n += smem[lidy    ][lidx    ] == 2;
-                               n += smem[lidy    ][lidx + 1] == 2;
-                               n += smem[lidy    ][lidx + 2] == 2;
-
-                               n += smem[lidy + 1][lidx    ] == 2;
-                               n += smem[lidy + 1][lidx + 2] == 2;
-
-                               n += smem[lidy + 2][lidx    ] == 2;
-                               n += smem[lidy + 2][lidx + 1] == 2;
-                               n += smem[lidy + 2][lidx + 2] == 2;
-                       }
-
-                       if (n > 0)
-                               smem[lidy + 1][lidx + 1] = 2;
-               }
-
-               const int e = smem[lidy + 1][lidx + 1];
-               map[gidx + 1 + (gidy + 1) * map_step] = e;
-
-               n = 0;
-               if(e == 2)
-               {
-                       n += smem[lidy    ][lidx    ] == 1;
-                       n += smem[lidy    ][lidx + 1] == 1;
-                       n += smem[lidy    ][lidx + 2] == 1;
-
-                       n += smem[lidy + 1][lidx    ] == 1;
-                       n += smem[lidy + 1][lidx + 2] == 1;
-
-                       n += smem[lidy + 2][lidx    ] == 1;
-                       n += smem[lidy + 2][lidx + 1] == 1;
-                       n += smem[lidy + 2][lidx + 2] == 1;
-               }
-
-               if(n > 0)
-               {
-                       unsigned int ind = atomic_inc(counter);
-                       st[ind] = (ushort2)(gidx + 1, gidy + 1);
-               }
-       }
+    map_step   /= sizeof(*map); // bytes -> ints
+    map_offset /= sizeof(*map);
+
+    __local int smem[18][18]; // 16x16 tile of map plus one extra row/column on every side
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    int grp_idx = get_global_id(0) & 0xFFFFF0; // clears the low 4 bits: global id rounded down to a multiple of 16
+    int grp_idy = get_global_id(1) & 0xFFFFF0; // presumably the tile origin for a 16x16 work-group - confirm against the host launch
+
+    int tid = lidx + lidy * 16; // linear index of the work-item, assuming 16 work-items per row
+    int lx = tid % 18; // re-indexed over the 18-wide tile for loading
+    int ly = tid / 18;
+    if(ly < 14) // first pass fills tile rows 0..13
+    {
+        smem[ly][lx] = map[grp_idx + lx + (grp_idy + ly) * map_step + map_offset]; // NOTE(review): unlike the second pass, this load has no rows/cols guard
+    }
+    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) // second pass fills tile rows 14..17 where they are still inside the guarded range
+    {
+        smem[ly + 14][lx] = map[grp_idx + lx + (grp_idy + ly + 14) * map_step + map_offset];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the whole tile must be loaded before any neighbour is read
+
+    if(gidy < rows && gidx < cols)
+    {
+        int n; // number of matching neighbours among the 8 surrounding tile cells
+
+#pragma unroll
+        for (int k = 0; k < 16; ++k) // 16 propagation rounds; NOTE(review): neighbours read and write smem with no barrier between rounds
+        {
+            n = 0;
+
+            if (smem[lidy + 1][lidx + 1] == 1) // only candidates can be promoted
+            {
+                n += smem[lidy    ][lidx    ] == 2;
+                n += smem[lidy    ][lidx + 1] == 2;
+                n += smem[lidy    ][lidx + 2] == 2;
+
+                n += smem[lidy + 1][lidx    ] == 2;
+                n += smem[lidy + 1][lidx + 2] == 2;
+
+                n += smem[lidy + 2][lidx    ] == 2;
+                n += smem[lidy + 2][lidx + 1] == 2;
+                n += smem[lidy + 2][lidx + 2] == 2;
+            }
+
+            if (n > 0) // at least one strong neighbour: the candidate becomes a strong edge
+                smem[lidy + 1][lidx + 1] = 2;
+        }
+
+        const int e = smem[lidy + 1][lidx + 1]; // final edge type of this pixel within the tile
+        map[gidx + 1 + (gidy + 1) * map_step] = e; // NOTE(review): map_offset is added on the loads above but not on this store
+
+        n = 0;
+        if(e == 2) // for a strong pixel, count the candidates that remain around it
+        {
+            n += smem[lidy    ][lidx    ] == 1;
+            n += smem[lidy    ][lidx + 1] == 1;
+            n += smem[lidy    ][lidx + 2] == 1;
+
+            n += smem[lidy + 1][lidx    ] == 1;
+            n += smem[lidy + 1][lidx + 2] == 1;
+
+            n += smem[lidy + 2][lidx    ] == 1;
+            n += smem[lidy + 2][lidx + 1] == 1;
+            n += smem[lidy + 2][lidx + 2] == 1;
+        }
+
+        if(n > 0) // strong pixel that still touches a candidate: append its position to st
+        {
+            unsigned int ind = atomic_inc(counter); // atomic_inc returns the previous value, which is this entry's slot
+            st[ind] = (ushort2)(gidx + 1, gidy + 1);
+        }
+    }
 }
 
 __constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
@@ -653,116 +653,122 @@ __constant c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 
 #define stack_size 512
 __kernel
-void edgesHysteresisGlobal
-(
-       __global int * map,
-       __global ushort2 * st1, 
-       __global ushort2 * st2, 
-       volatile __global int * counter,
-       int rows,
-       int cols,
-       int count,
-       int map_step,
-       int map_offset
-)
+    void edgesHysteresisGlobal
+    (
+    __global int * map,
+    __global ushort2 * st1, 
+    __global ushort2 * st2, 
+    volatile __global int * counter,
+    int rows,
+    int cols,
+    int count,
+    int map_step,
+    int map_offset
+    )
 {
 
-       map_step   /= sizeof(*map);
-       map_offset /= sizeof(*map);
-
-       int gidx = get_global_id(0);
-       int gidy = get_global_id(1);
-
-       int lidx = get_local_id(0);
-       int lidy = get_local_id(1);
-
-       int grp_idx = get_group_id(0);
-       int grp_idy = get_group_id(1);
-
-       volatile __local unsigned int s_counter;
-       __local unsigned int s_ind;
-
-       __local ushort2 s_st[stack_size];
-
-       if(lidx == 0)
-       {
-               s_counter = 0;
-       }
-       barrier(CLK_LOCAL_MEM_FENCE);
-
-       int ind = grp_idy * get_num_groups(0) + grp_idx;
-       
-       if(ind < count)
-       {
-               ushort2 pos = st1[ind];
-               if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
-               {
-                       if (lidx < 8)
-                       {
-                               pos.x += c_dx[lidx];
-                               pos.y += c_dy[lidx];
-
-                               if (map[pos.x + pos.y * map_step] == 1)
-                               {
-                                       map[pos.x + pos.y * map_step] = 2;
-
-                                       ind = atomic_inc(&s_counter);
-
-                                       s_st[ind] = pos;
-                               }
-                       }
-                       barrier(CLK_LOCAL_MEM_FENCE);
-
-                       while (s_counter > 0 && s_counter <= stack_size - get_num_groups(0))
-                       {
-                               const int subTaskIdx = lidx >> 3;
-                               const int portion = min(s_counter, get_num_groups(0) >> 3);
-
-                               pos.x = pos.y = 0;
-
-                               if (subTaskIdx < portion)
-                                       pos = s_st[s_counter - 1 - subTaskIdx];
-                               barrier(CLK_LOCAL_MEM_FENCE);
-
-                               if (lidx == 0)
-                                       s_counter -= portion;
-                               barrier(CLK_LOCAL_MEM_FENCE);
-
-                               if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
-                               {
-                                       pos.x += c_dx[lidx & 7];
-                                       pos.y += c_dy[lidx & 7];
-
-                                       if (map[pos.x + map_offset + pos.y * map_step] == 1)
-                                       {
-                                               map[pos.x + map_offset + pos.y * map_step] = 2;
-
-                                               ind = atomic_inc(&s_counter);
-
-                                               s_st[ind] = pos;
-                                       }
-                               }
-                               barrier(CLK_LOCAL_MEM_FENCE);
-                       }
-
-                       if (s_counter > 0)
-                       {
-                               if (lidx == 0)
-                               {
-                                       ind = atomic_add(counter, s_counter);
-                                       s_ind = ind - s_counter;
-                               }
-                               barrier(CLK_LOCAL_MEM_FENCE);
-
-                               ind = s_ind;
-
-                               for (int i = lidx; i < s_counter; i += get_num_groups(0))
-                               {
-                                       st2[ind + i] = s_st[i];
-                               }
-                       }
-               }
-       }
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    int grp_idx = get_group_id(0);
+    int grp_idy = get_group_id(1);
+
+    volatile __local unsigned int s_counter;
+    __local unsigned int s_ind;
+
+    __local ushort2 s_st[stack_size];
+
+    if(gidx + gidy == 0)
+    {
+        *counter = 0;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
+    if(lidx == 0)
+    {
+        s_counter = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int ind = grp_idy * get_num_groups(0) + grp_idx;
+
+    if(ind < count)
+    {
+        ushort2 pos = st1[ind];
+        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+        {
+            if (lidx < 8)
+            {
+                pos.x += c_dx[lidx];
+                pos.y += c_dy[lidx];
+
+                if (map[pos.x + pos.y * map_step] == 1)
+                {
+                    map[pos.x + pos.y * map_step] = 2;
+
+                    ind = atomic_inc(&s_counter);
+
+                    s_st[ind] = pos;
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            while (s_counter > 0 && s_counter <= stack_size - get_num_groups(0))
+            {
+                const int subTaskIdx = lidx >> 3;
+                const int portion = min(s_counter, get_num_groups(0) >> 3);
+
+                pos.x = pos.y = 0;
+
+                if (subTaskIdx < portion)
+                    pos = s_st[s_counter - 1 - subTaskIdx];
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (lidx == 0)
+                    s_counter -= portion;
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                {
+                    pos.x += c_dx[lidx & 7];
+                    pos.y += c_dy[lidx & 7];
+
+                    if (map[pos.x + map_offset + pos.y * map_step] == 1)
+                    {
+                        map[pos.x + map_offset + pos.y * map_step] = 2;
+
+                        ind = atomic_inc(&s_counter);
+
+                        s_st[ind] = pos;
+                    }
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            if (s_counter > 0)
+            {
+                if (lidx == 0)
+                {
+                    ind = atomic_add(counter, s_counter);
+                    s_ind = ind - s_counter;
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                ind = s_ind;
+
+                for (int i = lidx; i < s_counter; i += get_num_groups(0))
+                {
+                    st2[ind + i] = s_st[i];
+                }
+            }
+        }
+    }
 }
 #undef stack_size
 
@@ -770,29 +776,29 @@ void edgesHysteresisGlobal
 // map         edge type mappings
 // dst         edge output
 __kernel
-void getEdges
-(
-       __global const int * map,
-       __global uchar * dst,
-       int rows,
-       int cols,
-       int map_step,
-       int map_offset,
-       int dst_step,
-       int dst_offset
-)
+    void getEdges
+    (
+    __global const int * map,
+    __global uchar * dst,
+    int rows,
+    int cols,
+    int map_step,
+    int map_offset,
+    int dst_step,
+    int dst_offset
+    )
 {
-       map_step   /= sizeof(*map);
-       map_offset /= sizeof(*map);
-       //dst_step   /= sizeof(*dst);
-       //dst_offset /= sizeof(*dst);
-
-       int gidx = get_global_id(0);
-       int gidy = get_global_id(1);
-
-       if(gidy < rows && gidx < cols)
-       {
-               //dst[gidx + gidy * dst_step] = map[gidx + 1 + (gidy + 1) * map_step] == 2 ? 255: 0;
-               dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] / 2));
-       }
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+    //dst_step   /= sizeof(*dst);
+    //dst_offset /= sizeof(*dst);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    if(gidy < rows && gidx < cols)
+    {
+        //dst[gidx + gidy * dst_step] = map[gidx + 1 + (gidy + 1) * map_step] == 2 ? 255: 0;
+        dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] / 2));
+    }
 }
index 16cd4be..ae694d5 100644 (file)
 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
 
+#define FLT_EPSILON (1e-15) // NOTE(review): OpenCL C already provides FLT_EPSILON, so this redefines it; 1e-15 is a double literal far below float spacing near 360, so the fabs(x - 360.f) < FLT_EPSILON tests in this file can only match exact equality - confirm intent
 #define CV_PI_F 3.14159265f
 
-// print greyscale image to show image layout
-__kernel void printImage(image2d_t img)
-{
-    printf("(%d, %d) - %3d \n", 
-        get_global_id(0), 
-        get_global_id(1), 
-        read_imageui(img, (int2)(get_global_id(0), get_global_id(1))).x
-        );
-}
 
 // Use integral image to calculate haar wavelets.
 // N = 2
@@ -444,7 +436,6 @@ __kernel
         float val0 = N9[localLin];
         if (val0 > c_hessianThreshold)
         {
-            //printf(\"(%3d, %3d) N9[%3d]=%7.1f val0=%7.1f\\n\", l_x, l_y, localLin - zoff, N9[localLin], val0);
             // Coordinates for the start of the wavelet in the sum image. There
             // is some integer division involved, so don't try to simplify this
             // (cancel out sampleStep) without checking the result is the same
@@ -726,6 +717,7 @@ __kernel
     __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
     __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
 
+
     volatile __local  float s_X[128];
     volatile __local  float s_Y[128];
     volatile __local  float s_angle[128];
@@ -737,6 +729,7 @@ __kernel
     and building the keypoint descriptor are defined relative to 's' */
     const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f;
 
+
     /* To find the dominant orientation, the gradients in x and y are
     sampled in a circle of radius 6s using wavelets of size 4s.
     We ensure the gradient wavelet size is even to ensure the
@@ -765,16 +758,18 @@ __kernel
             Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
 
             angle = atan2(Y, X);
+            
             if (angle < 0)
                 angle += 2.0f * CV_PI_F;
             angle *= 180.0f / CV_PI_F;
+
         }
     }
     s_X[tid] = X;
     s_Y[tid] = Y;
     s_angle[tid] = angle;
     barrier(CLK_LOCAL_MEM_FENCE);
-
+    
     float bestx = 0, besty = 0, best_mod = 0;
 
 #pragma unroll
@@ -807,7 +802,6 @@ __kernel
             sumx += s_X[get_local_id(0) + 96];
             sumy += s_Y[get_local_id(0) + 96];
         }
-
         reduce_32_sum(s_sumx + get_local_id(1) * 32, sumx, get_local_id(0));
         reduce_32_sum(s_sumy + get_local_id(1) * 32, sumy, get_local_id(0));
 
@@ -818,10 +812,8 @@ __kernel
             bestx = sumx;
             besty = sumy;
         }
-
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-
     if (get_local_id(0) == 0)
     {
         s_X[get_local_id(1)] = bestx;
@@ -846,6 +838,10 @@ __kernel
             kp_dir += 2.0f * CV_PI_F;
         kp_dir *= 180.0f / CV_PI_F;
 
+        kp_dir = 360.0f - kp_dir;
+        if (fabs(kp_dir - 360.f) < FLT_EPSILON)
+            kp_dir = 0.f;
+
         featureDir[get_group_id(0)] = kp_dir;
     }
 }
@@ -940,7 +936,10 @@ void calc_dx_dy(
     const float centerX = featureX[get_group_id(0)];
     const float centerY = featureY[get_group_id(0)];
     const float size = featureSize[get_group_id(0)];
-    const float descriptor_dir = featureDir[get_group_id(0)] * (float)(CV_PI_F / 180.0f);
+    float descriptor_dir = 360.0f - featureDir[get_group_id(0)];
+    if (fabs(descriptor_dir - 360.f) < FLT_EPSILON)
+        descriptor_dir = 0.f;
+    descriptor_dir *= (float)(CV_PI_F / 180.0f);
 
     /* The sampling intervals and wavelet sized for selecting an orientation
     and building the keypoint descriptor are defined relative to 's' */
index 4a950fa..e1e3c1b 100644 (file)
@@ -448,3 +448,42 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
         grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
     }
 }
+
+//----------------------------------------------------------------------------
+// Resize
+
+__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
+                                 int dst_offset, int src_offset, int dst_step, int src_step, 
+                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
+{
+    // Nearest-neighbour resize: each work-item copies one destination pixel.
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+    if(col >= dst_cols || row >= dst_rows)
+        return;
+    // Round col * ifx and row * ify to a source pixel, clamped to the last column/row.
+    const int src_col = min((int)floor(col*ifx+0.5f), src_cols-1);
+    const int src_row = min((int)floor(row*ify+0.5f), src_rows-1);
+
+    // >>2 divides offsets and steps by 4, presumably converting bytes to uchar4 elements.
+    const int src_index = (src_offset>>2) + src_row * (src_step>>2) + src_col;
+    dst[(dst_offset>>2) + row * (dst_step>>2) + col] = src[src_index];
+}
+
+__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
+                                 int dst_offset, int src_offset, int dst_step, int src_step, 
+                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
+{
+    // Nearest-neighbour resize: each work-item copies one destination pixel.
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+    if(col >= dst_cols || row >= dst_rows)
+        return;
+    // Round col * ifx and row * ify to a source pixel, clamped to the last column/row.
+    const int src_col = min((int)floor(col*ifx+0.5f), src_cols-1);
+    const int src_row = min((int)floor(row*ify+0.5f), src_rows-1);
+
+    // Single-byte pixels: offsets and steps are used as given, without scaling.
+    const int src_index = src_offset + src_row * src_step + src_col;
+    dst[dst_offset + row * dst_step + col] = src[src_index];
+}
\ No newline at end of file
index 4b50903..289ba12 100644 (file)
@@ -51,510 +51,458 @@ using namespace cv;
 using namespace cv::ocl;
 using namespace std;
 
-#define EXT_FP64 0
-
 #if !defined (HAVE_OPENCL)
 // Stub compiled when HAVE_OPENCL is not defined; it only calls throw_nogpu().
 // NOTE(review): this parameter list has no method / buffer arguments - confirm it matches a declared overload.
 void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); }
 #else
 //helper routines
 namespace cv
 {
-       namespace ocl
-       {
-               ///////////////////////////OpenCL kernel strings///////////////////////////
-               extern const char *match_template;
-       }
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        // Defined elsewhere; the routines in this file pass its address to openCLExecuteKernel.
+        extern const char *match_template;
+    }
 }
 
 namespace cv { namespace ocl
 {
-       void matchTemplate_SQDIFF(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
-
-       void matchTemplate_SQDIFF_NORMED(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
-
-       void matchTemplate_CCORR(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
-
-       void matchTemplate_CCORR_NORMED(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
-
-       void matchTemplate_CCOFF(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
-
-       void matchTemplate_CCOFF_NORMED(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
-
-
-       void matchTemplateNaive_SQDIFF(
-               const oclMat& image, const oclMat& templ, oclMat& result, int cn);
-
-       void matchTemplateNaive_CCORR(
-               const oclMat& image, const oclMat& templ, oclMat& result, int cn);
-
-       // Evaluates optimal template's area threshold. If 
-       // template's area is less  than the threshold, we use naive match 
-       // template version, otherwise FFT-based (if available)
-       int getTemplateThreshold(int method, int depth)
-       {
-               switch (method)
-               {
-               case CV_TM_CCORR: 
-                       if (depth == CV_32F) return 250;
-                       if (depth == CV_8U) return 300;
-                       break;
-               case CV_TM_SQDIFF:
-                       if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F
-                       if (depth == CV_8U) return 300;
-                       break;
-               }
-               CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
-               return 0;
-       }
-
-
-       //////////////////////////////////////////////////////////////////////
-       // SQDIFF
-       void matchTemplate_SQDIFF(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
-       {
-               result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-               if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
-               {
-                       matchTemplateNaive_SQDIFF(image, templ, result, image.channels());
-                       return;
-               }
-               else
-               {
-                       // TODO
-                       CV_Error(CV_StsBadArg, "Not supported yet for this size template");
-               }
-       }
-
-       void matchTemplate_SQDIFF_NORMED(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
-       {
-               matchTemplate_CCORR(image,templ,result,buf);
-               buf.image_sums.resize(1);
-               buf.image_sqsums.resize(1);
-
-               integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
-
-#if EXT_FP64 && SQRSUM_FIXED
-               unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-#else
-               Mat sqr_mat = templ.reshape(1);
-               unsigned long long templ_sqsum = (unsigned long long)sum(sqr_mat.mul(sqr_mat))[0];
-#endif
+    // Forward declarations of the per-method implementations. Each takes the
+    // source image, the template, the output result matrix and a MatchTemplateBuf
+    // that holds intermediate matrices.
+    void matchTemplate_SQDIFF(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+    void matchTemplate_SQDIFF_NORMED(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+    void matchTemplate_CCORR(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+    void matchTemplate_CCORR_NORMED(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+    void matchTemplate_CCOFF(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+    void matchTemplate_CCOFF_NORMED(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+
+    // Naive kernel variants; callers in this file pass image.channels() as cn.
+    void matchTemplateNaive_SQDIFF(
+        const oclMat& image, const oclMat& templ, oclMat& result, int cn);
+
+    void matchTemplateNaive_CCORR(
+        const oclMat& image, const oclMat& templ, oclMat& result, int cn);
+
+    // Evaluates optimal template's area threshold. If 
+    // template's area is less  than the threshold, we use naive match 
+    // template version, otherwise FFT-based (if available)
+    int getTemplateThreshold(int method, int depth)
+    {
+        switch (method)
+        {
+        case CV_TM_CCORR: 
+            if (depth == CV_32F) return 250;
+            if (depth == CV_8U) return 300;
+            break;
+        case CV_TM_SQDIFF:
+            if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F
+            if (depth == CV_8U) return 300;
+            break;
+        }
+        CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
+        return 0;
+    }
+
+    //////////////////////////////////////////////////////////////////////
+    // SQDIFF
+    void matchTemplate_SQDIFF(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+    {
+        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+        {
+            matchTemplateNaive_SQDIFF(image, templ, result, image.channels());
+            return;
+        }
+        else
+        {
+            // TODO
+            CV_Error(CV_StsBadArg, "Not supported yet for this size template");
+        }
+    }
+
+    void matchTemplate_SQDIFF_NORMED(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+    {
+        matchTemplate_CCORR(image,templ,result,buf);
+        buf.image_sums.resize(1);
+
+
+        integral(image.reshape(1), buf.image_sums[0]);
 
-               Context *clCxt = image.clCxt;
-               string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
-               vector< pair<size_t, const void *> > args;
-
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
-               args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
-
-               size_t globalThreads[3] = {result.cols, result.rows, 1};
-               size_t localThreads[3]  = {32, 8, 1};
-               openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
-       }
-
-       void matchTemplateNaive_SQDIFF(
-               const oclMat& image, const oclMat& templ, oclMat& result, int cn)
-       {
-               CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
-                       || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
-               CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
-               CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
-
-               Context *clCxt = image.clCxt;
-               string kernelName = "matchTemplate_Naive_SQDIFF";
-
-               vector< pair<size_t, const void *> > args;
-
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
-
-               size_t globalThreads[3] = {result.cols, result.rows, 1};
-               size_t localThreads[3]  = {32, 8, 1};
-               openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
-       }
-
-       //////////////////////////////////////////////////////////////////////
-       // CCORR
-       void matchTemplate_CCORR(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
-       {
-               result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-               if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
-               {
-                       matchTemplateNaive_CCORR(image, templ, result, image.channels());
-                       return;
-               }
-               else
-               {
-                       CV_Error(CV_StsBadArg, "Not supported yet for this size template");
-                       if(image.depth() == CV_8U && templ.depth() == CV_8U)
-                       {
-                               image.convertTo(buf.imagef, CV_32F);
-                               templ.convertTo(buf.templf, CV_32F);
-                       }
-                       CV_Assert(image.channels() == 1);
-                       oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels()));
-                       filter2D(buf.imagef,o_result,CV_32F,buf.templf, Point(0,0));
-                       result = o_result(Rect(0,0,image.rows - templ.rows + 1, image.cols - templ.cols + 1));
-               }
-       }
-
-       void matchTemplate_CCORR_NORMED(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
-       {
-               matchTemplate_CCORR(image,templ,result,buf);
-               buf.image_sums.resize(1);
-               buf.image_sqsums.resize(1);
-
-               integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
-#if EXT_FP64 && SQRSUM_FIXED
-               unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-#elif EXT_FP64
-               oclMat templ_c1 = templ.reshape(1);
-               multiply(templ_c1, templ_c1, templ_c1);
-               unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0];
+#if SQRSUM_FIXED
+        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 #else
-               Mat m_templ_c1 = templ.reshape(1);
-               multiply(m_templ_c1, m_templ_c1, m_templ_c1);
-               unsigned long long templ_sqsum = (unsigned long long)sum(m_templ_c1)[0];
+        Mat sqr_mat = templ.reshape(1);
+        unsigned long long templ_sqsum = (unsigned long long)sum(sqr_mat.mul(sqr_mat))[0];
 #endif
-               Context *clCxt = image.clCxt;
-               string kernelName = "normalizeKernel";
-               vector< pair<size_t, const void *> > args;
-
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
-               args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
-
-               size_t globalThreads[3] = {result.cols, result.rows, 1};
-               size_t localThreads[3]  = {32, 8, 1};
-               openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
-       }
-
-       void matchTemplateNaive_CCORR(
-               const oclMat& image, const oclMat& templ, oclMat& result, int cn)
-       {
-               CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
-                       || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
-               CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
-               CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
-
-               Context *clCxt = image.clCxt;
-               string kernelName = "matchTemplate_Naive_CCORR";
-
-               vector< pair<size_t, const void *> > args;
-
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
-
-               size_t globalThreads[3] = {result.cols, result.rows, 1};
-               size_t localThreads[3]  = {32, 8, 1};
-               openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
-       }
-       //////////////////////////////////////////////////////////////////////
-       // CCOFF
-       void matchTemplate_CCOFF(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
-       {
-               CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
-
-               matchTemplate_CCORR(image,templ,result,buf);
-
-               Context *clCxt = image.clCxt;
-               string kernelName;
-
-               kernelName = "matchTemplate_Prepared_CCOFF";
-               size_t globalThreads[3] = {result.cols, result.rows, 1};
-               size_t localThreads[3]  = {32, 8, 1};
-
-               vector< pair<size_t, const void *> > args;
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
-               // to be continued in the following section
-               if(image.channels() == 1)
-               {
-                       buf.image_sums.resize(1);
-                       // FIXME: temp fix for incorrect integral kernel
-                       oclMat tmp_oclmat;
-                       integral(image, buf.image_sums[0], tmp_oclmat);
-
-                       float templ_sum = 0;
-#if EXT_FP64
-                       templ_sum = (float)sum(templ)[0] / templ.size().area();
+
+        Context *clCxt = image.clCxt;
+        string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
+        vector< pair<size_t, const void *> > args;
+
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+        args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+        size_t globalThreads[3] = {result.cols, result.rows, 1};
+        size_t localThreads[3]  = {32, 8, 1};
+        openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
+    }
+
+    void matchTemplateNaive_SQDIFF(
+        const oclMat& image, const oclMat& templ, oclMat& result, int cn)
+    {
+        CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
+            || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
+        CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
+        CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
+
+        Context *clCxt = image.clCxt;
+        string kernelName = "matchTemplate_Naive_SQDIFF";
+
+        vector< pair<size_t, const void *> > args;
+
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+        size_t globalThreads[3] = {result.cols, result.rows, 1};
+        size_t localThreads[3]  = {32, 8, 1};
+        openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+    }
+
+    //////////////////////////////////////////////////////////////////////
+    // CCORR
+    void matchTemplate_CCORR(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+    {
+        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+        {
+            matchTemplateNaive_CCORR(image, templ, result, image.channels());
+            return;
+        }
+        else
+        {
+            CV_Error(CV_StsBadArg, "Not supported yet for this size template");
+            if(image.depth() == CV_8U && templ.depth() == CV_8U)
+            {
+                image.convertTo(buf.imagef, CV_32F);
+                templ.convertTo(buf.templf, CV_32F);
+            }
+            CV_Assert(image.channels() == 1);
+            oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels()));
+            filter2D(buf.imagef,o_result,CV_32F,buf.templf, Point(0,0));
+            result = o_result(Rect(0,0,image.rows - templ.rows + 1, image.cols - templ.cols + 1));
+        }
+    }
+
+    void matchTemplate_CCORR_NORMED(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+    {
+        matchTemplate_CCORR(image,templ,result,buf);
+        buf.image_sums.resize(1);
+        buf.image_sqsums.resize(1);
+
+        integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
+#if SQRSUM_FIXED
+        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 #else
-                       Mat o_templ = templ;
-                       templ_sum = (float)sum(o_templ)[0] / o_templ.size().area(); // temp fix for non-double supported machine
-#endif
-                       args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
-                       args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
-                       args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                       args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
-               }
-               else
-               {
-                       Vec4f templ_sum = Vec4f::all(0);
-#if EXT_FP64
-                       split(image,buf.images);
-                       templ_sum = sum(templ) / templ.size().area();
-#else 
-                       // temp fix for non-double supported machine
-                       Mat o_templ = templ, o_image = image;
-                       vector<Mat> o_mat_vector;
-                       o_mat_vector.resize(image.channels());
-                       buf.images.resize(image.channels());
-                       split(o_image, o_mat_vector);
-                       for(int i = 0; i < o_mat_vector.size(); i ++)
-                       {
-                               buf.images[i] = oclMat(o_mat_vector[i]);
-                       }
-                       templ_sum = sum(o_templ) / templ.size().area();
+        oclMat templ_c1 = templ.reshape(1);
+        multiply(templ_c1, templ_c1, templ_c1);
+        unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0];
 #endif
-                       buf.image_sums.resize(buf.images.size());
-
-                       for(int i = 0; i < image.channels(); i ++)
-                       {
-                               // FIXME: temp fix for incorrect integral kernel
-                               oclMat omat_temp;
-                               integral(buf.images[i], buf.image_sums[i], omat_temp);
-                       }
-                       switch(image.channels())
-                       {
-                       case 4:
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
-                               args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
-                               args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
-                               break;
-                       default:
-                               CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
-                               break;
-                       }
-               }
-               openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
-       }
-
-       void matchTemplate_CCOFF_NORMED(
-               const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
-       {
-               image.convertTo(buf.imagef, CV_32F);
-               templ.convertTo(buf.templf, CV_32F);
-
-               matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
-               float scale = 1.f/templ.size().area();
-
-               Context *clCxt = image.clCxt;
-               string kernelName;
-
-               kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
-               size_t globalThreads[3] = {result.cols, result.rows, 1};
-               size_t localThreads[3]  = {32, 8, 1};
-
-               vector< pair<size_t, const void *> > args;
-               args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
-               args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
-               args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
-               args.push_back( make_pair( sizeof(cl_float),(void *)&scale) );
-               // to be continued in the following section
-               if(image.channels() == 1)
-               {
-                       buf.image_sums.resize(1);
-                       buf.image_sqsums.resize(1);
-                       integral(image, buf.image_sums[0], buf.image_sqsums[0]);
-                       float templ_sum = 0;
-                       float templ_sqsum = 0;
-#if EXT_FP64
-                       templ_sum   = (float)sum(templ)[0];
+        Context *clCxt = image.clCxt;
+        string kernelName = "normalizeKernel";
+        vector< pair<size_t, const void *> > args;
+
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+        args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+        size_t globalThreads[3] = {result.cols, result.rows, 1};
+        size_t localThreads[3]  = {32, 8, 1};
+        openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
+    }
+
+    void matchTemplateNaive_CCORR(
+        const oclMat& image, const oclMat& templ, oclMat& result, int /*cn*/) // cn is never read; the channel count comes from image.channels()
+    { // Launches the "matchTemplate_Naive_CCORR" kernel over a result.cols x result.rows global range; result is not allocated here.
+        CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
+            || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)); // && grouped explicitly: same evaluation as before, no -Wparentheses ambiguity
+        CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
+        CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1); // result must already be (image - templ + 1) in each dimension
+
+        Context *clCxt = image.clCxt;
+        string kernelName = "matchTemplate_Naive_CCORR";
+
+        vector< pair<size_t, const void *> > args; // (byte size, pointer) pairs in the order handed to openCLExecuteKernel
+
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+        size_t globalThreads[3] = {result.cols, result.rows, 1};
+        size_t localThreads[3]  = {32, 8, 1};
+        openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth()); // image channel count and depth are forwarded to openCLExecuteKernel
+    }
+    //////////////////////////////////////////////////////////////////////
+    // CCOFF: runs matchTemplate_CCORR into result, then launches "matchTemplate_Prepared_CCOFF" with result, the image integral sums and the template mean
+    void matchTemplate_CCOFF(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+    {
+        CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U); // only 8-bit inputs are accepted
+
+        matchTemplate_CCORR(image,templ,result,buf); // result is then handed to the kernel below as its first argument
+
+        Context *clCxt = image.clCxt;
+        string kernelName;
+
+        kernelName = "matchTemplate_Prepared_CCOFF";
+        size_t globalThreads[3] = {result.cols, result.rows, 1};
+        size_t localThreads[3]  = {32, 8, 1};
+
+        vector< pair<size_t, const void *> > args; // (byte size, pointer) pairs in the order handed to openCLExecuteKernel
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+        // the remaining arguments depend on the channel count and are appended below
+        if(image.channels() == 1)
+        {
+            buf.image_sums.resize(1);
+            integral(image, buf.image_sums[0]);
+
+            float templ_sum = 0;
+            templ_sum = (float)sum(templ)[0] / templ.size().area(); // despite the name this is the template mean: sum divided by the template area
+            args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+            args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
+        }
+        else
+        {
+            Vec4f templ_sum = Vec4f::all(0);
+            split(image,buf.images); // one plane per channel, kept in buf.images
+            templ_sum = sum(templ) / templ.size().area(); // per-channel template mean
+            buf.image_sums.resize(buf.images.size());
+
+            for(int i = 0; i < image.channels(); i ++)
+            {
+                integral(buf.images[i], buf.image_sums[i]); // runs for every channel count, before the switch below rejects anything but 4
+            }
+            switch(image.channels())
+            {
+            case 4:
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
+                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) ); // only plane 0's offset and step are passed for all four planes
+                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
+                break;
+            default:
+                CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
+                break;
+            }
+        }
+        openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth()); // image channel count and depth are forwarded to openCLExecuteKernel
+    }
+
+    void matchTemplate_CCOFF_NORMED(
+        const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+    { // Runs matchTemplate_CCORR on CV_32F copies of image/templ, then launches "matchTemplate_Prepared_CCOFF_NORMED" with result, the image integral sums / squared sums and the template statistics.
+        image.convertTo(buf.imagef, CV_32F);
+        templ.convertTo(buf.templf, CV_32F);
+
+        matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
+        float scale = 1.f/templ.size().area(); // reciprocal of the template area
+
+        Context *clCxt = image.clCxt;
+        string kernelName;
+
+        kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
+        size_t globalThreads[3] = {result.cols, result.rows, 1};
+        size_t localThreads[3]  = {32, 8, 1};
+
+        vector< pair<size_t, const void *> > args; // (byte size, pointer) pairs in the order handed to openCLExecuteKernel
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
+        args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+        args.push_back( make_pair( sizeof(cl_float),(void *)&scale) );
+        // the remaining arguments depend on the channel count and are appended below
+        if(image.channels() == 1)
+        {
+            buf.image_sums.resize(1);
+            buf.image_sqsums.resize(1);
+            integral(image, buf.image_sums[0], buf.image_sqsums[0]);
+            float templ_sum = 0;
+            float templ_sqsum = 0;
+            templ_sum   = (float)sum(templ)[0];
 #if SQRSUM_FIXED
-                       templ_sqsum = sqrSum(templ);
+            templ_sqsum = sqrSum(templ)[0];
 #else
-                       oclMat templ_sqr = templ;
-                       multiply(templ,templ, templ_sqr);
-                       templ_sqsum  = sum(templ_sqr)[0];
+            oclMat templ_sqr = templ; // NOTE(review): if an oclMat copy shares templ's buffer, multiply() below writes into templ -- confirm
+            multiply(templ,templ, templ_sqr);
+            templ_sqsum  = sum(templ_sqr)[0];
 #endif //SQRSUM_FIXED
-                       templ_sqsum -= scale * templ_sum * templ_sum;
-                       templ_sum   *= scale;
-#else
-                       // temp fix for non-double supported machine
-                       Mat o_templ = templ;
-                       templ_sum   = (float)sum(o_templ)[0]; 
-                       templ_sqsum = sum(o_templ.mul(o_templ))[0] - scale * templ_sum * templ_sum;
-                       templ_sum  *= scale;
-#endif
-                       args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
-                       args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
-                       args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                       args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
-                       args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
-                       args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
-                       args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
-                       args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum) );
-               }
-               else
-               {
-                       Vec4f templ_sum   = Vec4f::all(0);
-                       Vec4f templ_sqsum = Vec4f::all(0);
-#if EXT_FP64
-                       split(image,buf.images);
-                       templ_sum   = sum(templ);
+            templ_sqsum -= scale * templ_sum * templ_sum; // sum of squares minus sum^2 / area, subtracted exactly once
+            templ_sum   *= scale; // from here on templ_sum holds the template mean
+
+            args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+            args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
+            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
+            args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
+            args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
+            args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum) );
+        }
+        else
+        {
+            Vec4f templ_sum   = Vec4f::all(0);
+            Vec4f templ_sqsum = Vec4f::all(0);
+
+            split(image,buf.images); // one plane per channel, kept in buf.images
+            templ_sum   = sum(templ);
 #if SQRSUM_FIXED
-                       templ_sqsum = sqrSum(templ);
+            templ_sqsum = sqrSum(templ);
 #else
-                       oclMat templ_sqr = templ;
-                       multiply(templ,templ, templ_sqr);
-                       templ_sqsum  = sum(templ_sqr);
+            oclMat templ_sqr = templ; // NOTE(review): if an oclMat copy shares templ's buffer, multiply() below writes into templ -- confirm
+            multiply(templ,templ, templ_sqr);
+            templ_sqsum  = sum(templ_sqr);
 #endif //SQRSUM_FIXED
-                       templ_sqsum -= scale * templ_sum * templ_sum;
-                       
-#else 
-                       // temp fix for non-double supported machine
-                       Mat o_templ = templ, o_image = image;
-                       
-                       vector<Mat> o_mat_vector;
-                       o_mat_vector.resize(image.channels());
-                       buf.images.resize(image.channels());
-                       split(o_image, o_mat_vector);
-                       for(int i = 0; i < o_mat_vector.size(); i ++)
-                       {
-                               buf.images[i] = oclMat(o_mat_vector[i]);
-                       }
-                       templ_sum    = sum(o_templ);
-                       templ_sqsum  = sum(o_templ.mul(o_templ));
-#endif
-                       float templ_sqsum_sum = 0;
-                       for(int i = 0; i < image.channels(); i ++)
-                       {
-                               templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
-                       }
-                       templ_sum   *= scale;
-                       buf.image_sums.resize(buf.images.size());
-                       buf.image_sqsums.resize(buf.images.size());
-                       
-                       for(int i = 0; i < image.channels(); i ++)
-                       {
-                               integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
-                       }
-                       
-                       switch(image.channels())
-                       {
-                       case 4:
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
-                               args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
-                               args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
-                               args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
-                               args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
-                               args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
-                               args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum_sum) );
-                               break;
-                       default:
-                               CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
-                               break;
-                       }
-               }
-               openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
-       }
+            // templ_sqsum deliberately stays the raw per-channel sum of squares here: the scale * sum^2 term is removed once, per channel, in the loop below (it used to be subtracted twice)
+
+            float templ_sqsum_sum = 0;
+            for(int i = 0; i < image.channels(); i ++)
+            {
+                templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i]; // same quantity as the single-channel templ_sqsum, accumulated over channels
+            }
+            templ_sum   *= scale; // from here on templ_sum holds the per-channel template mean
+            buf.image_sums.resize(buf.images.size());
+            buf.image_sqsums.resize(buf.images.size());
+
+            for(int i = 0; i < image.channels(); i ++)
+            {
+                integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]); // runs for every channel count, before the switch below rejects anything but 4
+            }
+
+            switch(image.channels())
+            {
+            case 4:
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
+                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) ); // only plane 0's offset and step are passed for all four planes
+                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
+                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
+                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
+                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
+                args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum_sum) );
+                break;
+            default:
+                CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
+                break;
+            }
+        }
+        openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth()); // depth of the original image, not of the CV_32F copy, is forwarded
+    }
 
 }/*ocl*/} /*cv*/
 
 void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
 {
-       MatchTemplateBuf buf;
-       matchTemplate(image,templ, result, method,buf);
+    MatchTemplateBuf buf; // local scratch buffers; destroyed when this call returns
+    matchTemplate(image,templ, result, method,buf); // forwards to the overload that takes a caller-supplied MatchTemplateBuf
 }
 void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf)
 {
-       CV_Assert(image.type() == templ.type());
-       CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
+    CV_Assert(image.type() == templ.type());
+    CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 
-       typedef void (*Caller)(const oclMat&, const oclMat&, oclMat&, MatchTemplateBuf&);
+    typedef void (*Caller)(const oclMat&, const oclMat&, oclMat&, MatchTemplateBuf&);
 
-       const Caller callers[] = { 
-               ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED, 
-               ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED, 
-               ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
-       };
+    const Caller callers[] = { 
+        ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED, 
+        ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED, 
+        ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
+    };
 
-       Caller caller = callers[method];
-       CV_Assert(caller);
-       caller(image, templ, result, buf);
+    Caller caller = callers[method];
+    CV_Assert(caller);
+    caller(image, templ, result, buf);
 }
 #endif //
index 3f0a241..058d543 100644 (file)
@@ -1,4 +1,49 @@
-
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//             Dachuan Zhao, dachuan@multicorewareinc.com
+//             Yao Wang, yao@multicorewareinc.com
+//    
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
 #include "precomp.hpp"
 
 using namespace cv;
@@ -24,7 +69,6 @@ namespace cv
 template<typename T>
 void pyrdown_run(const oclMat &src, const oclMat &dst)
 {
-    CV_Assert(src.cols / 2 == dst.cols && src.rows / 2 == dst.rows);
 
     CV_Assert(src.type() == dst.type());
     CV_Assert(src.depth() != CV_8S);
@@ -108,7 +152,7 @@ void cv::ocl::pyrDown(const oclMat& src, oclMat& dst)
 
     dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
 
-       //dst.step = dst.rows;
+       dst.download_channels = src.download_channels;
 
     pyrdown_run(src, dst);
 }
index ee0dfe3..0190faa 100644 (file)
@@ -16,6 +16,7 @@
 //
 // @Authors
 //             Zhang Chunpeng chunpeng@multicorewareinc.com
+//             Yao Wang, yao@multicorewareinc.com
 //    
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -61,8 +62,9 @@ namespace cv { namespace ocl
 { 
        extern const char *pyr_up;
        void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst)
-       {
+       {               
                dst.create(src.rows * 2, src.cols * 2, src.type());
+               dst.download_channels=src.download_channels;
                Context *clCxt = src.clCxt;
                
                const std::string kernelName = "pyrUp";
index 7d9798d..af744ff 100644 (file)
@@ -149,8 +149,7 @@ namespace
             //loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
 
             bindImgTex(img);
-            oclMat integral_sqsum;
-            integral(img, surf_.sum, integral_sqsum); // the two argumented integral version is incorrect
+            integral(img, surf_.sum); // the two argumented integral version is incorrect
 
             bindSumTex(surf_.sum);
             maskSumTex = 0;
index c5cbee7..abe113e 100644 (file)
@@ -74,12 +74,10 @@ PARAM_TEST_CASE(ColumnSum, cv::Size, bool )
 TEST_P(ColumnSum, Accuracy)
 {
     cv::Mat src = randomMat(size, CV_32FC1);
-       //cv::Mat src(size,CV_32FC1);
+       cv::ocl::oclMat d_dst;
+       cv::ocl::oclMat d_src(src);     
 
-       //cv::ocl::oclMat d_dst = ::createMat(size,src.type(),useRoi);
-       cv::ocl::oclMat d_dst = loadMat(src,useRoi);
-
-    cv::ocl::columnSum(loadMat(src,useRoi),d_dst);
+    cv::ocl::columnSum(d_src,d_dst);
 
     cv::Mat dst(d_dst);
 
index b19db39..ede1a30 100644 (file)
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Dachuan Zhao, dachuan@multicorewareinc.com
+//    Yao Wang yao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,9 +44,6 @@
 //
 //M*/
 
-//#define PRINT_CPU_TIME 1000
-//#define PRINT_TIME
-
 
 #include "precomp.hpp"
 #include <iomanip>
@@ -58,66 +56,15 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
 
-PARAM_TEST_CASE(PyrDown, MatType, bool)
+PARAM_TEST_CASE(PyrDown, MatType, int)
 {
-    int type;
-    cv::Scalar val;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
+       int type;
+       int channels;
 
     virtual void SetUp()
     {
         type = GET_PARAM(0);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+               channels = GET_PARAM(1);
 
         //int devnums = getDevice(oclinfo);
         //CV_Assert(devnums > 0);
@@ -127,169 +74,36 @@ PARAM_TEST_CASE(PyrDown, MatType, bool)
 
        void Cleanup()
        {
-               mat1.release();
-               mat2.release();
-               mask.release();
-               dst.release();
-               dst1.release();
-               mat1_roi.release();
-               mat2_roi.release();
-               mask_roi.release();
-               dst_roi.release();
-               dst1_roi.release();
-
-               gdst_whole.release();
-               gdst1_whole.release();
-               gmat1.release();
-               gmat2.release();
-               gdst.release();
-               gdst1.release();
-               gmask.release();
        }
 
-    void random_roi()
-    {
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-#ifdef RANDOMROI
-        //randomize ROI
-        roicols = rng.uniform(1, mat1.cols);
-        roirows = rng.uniform(1, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-#else
-        roicols = mat1.cols;
-        roirows = mat1.rows;
-        src1x = 0;
-        src1y = 0;
-        dstx = 0;
-        dsty = 0;
-#endif
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-        src2x   = rng.uniform(0, mat2.cols - roicols);
-        src2y   = rng.uniform(0, mat2.rows - roirows);
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
-
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi; //end
-    }
-
 };
 
-#define VARNAME(A) string(#A);
-
-
-void PrePrint()
-{
-               //for(int i = 0; i < MHEIGHT; i++)
-               //{
-               //      printf("(%d) ", i);
-               //      for(int k = 0; k < MWIDTH; k++)
-               //      {
-               //              printf("%d ", mat1_roi.data[i * MHEIGHT + k]);
-               //      }
-               //      printf("\n");
-               //}
-}
-
-void PostPrint()
-{
-               //dst_roi.convertTo(dst_roi,CV_32S);
-               //cpu_dst.convertTo(cpu_dst,CV_32S);
-               //dst_roi -= cpu_dst;
-               //cpu_dst -= dst_roi;
-               //for(int i = 0; i < MHEIGHT / 2; i++)
-               //{
-               //      printf("(%d) ", i);
-               //      for(int k = 0; k < MWIDTH / 2; k++)
-               //      {
-               //              if(gmat1.depth() == 0)
-               //              {
-               //                      if(gmat1.channels() == 1)
-               //                      {
-               //                              printf("%d ", dst_roi.data[i * MHEIGHT / 2 + k]);
-               //                      }
-               //                      else
-               //                      {
-               //                              printf("%d ", ((unsigned*)dst_roi.data)[i * MHEIGHT / 2 + k]);
-               //                      }
-               //              }
-               //              else if(gmat1.depth() == 5)
-               //              {
-               //                      printf("%.6f ", ((float*)dst_roi.data)[i * MHEIGHT / 2 + k]);
-               //              }
-               //      }
-               //      printf("\n");
-               //}
-               //for(int i = 0; i < MHEIGHT / 2; i++)
-               //{
-               //      printf("(%d) ", i);
-               //      for(int k = 0; k < MWIDTH / 2; k++)
-               //      {
-               //              if(gmat1.depth() == 0)
-               //              {
-               //                      if(gmat1.channels() == 1)
-               //                      {
-               //                              printf("%d ", cpu_dst.data[i * MHEIGHT / 2 + k]);
-               //                      }
-               //                      else
-               //                      {
-               //                              printf("%d ", ((unsigned*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
-               //                      }
-               //              }
-               //              else if(gmat1.depth() == 5)
-               //              {
-               //                      printf("%.6f ", ((float*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
-               //              }
-               //      }
-               //      printf("\n");
-               //}
-}
-
-////////////////////////////////PyrDown/////////////////////////////////////////////////
-//struct PyrDown : ArithmTestBase {};
 
 TEST_P(PyrDown, Mat)
 {
     for(int j = 0; j < LOOP_TIMES; j++)
     {
-        random_roi();
+        cv::Size size(MWIDTH, MHEIGHT); // the same input size is used on every iteration
+               cv::RNG &rng = TS::ptr()->get_rng();
+               cv::Mat src=randomMat(rng, size, CV_MAKETYPE(type, channels), 0, 100, false); // random input of the parameterised depth and channel count; 0 and 100 are presumably the value range -- confirm
 
-               cv::pyrDown(mat1_roi, dst_roi);
-               cv::ocl::pyrDown(gmat1, gdst);
+               cv::ocl::oclMat gsrc(src), gdst; // gsrc is built from the host matrix; gdst is left for pyrDown to fill
+               cv::Mat dst_cpu;
+               cv::pyrDown(src, dst_cpu); // CPU reference result
+               cv::ocl::pyrDown(gsrc, gdst); // result under test
 
-        cv::Mat cpu_dst;
-        gdst.download(cpu_dst);
-        char s[1024];
-        sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
+        cv::Mat dst;
+        gdst.download(dst); // fetch the OpenCL result into a cv::Mat for comparison
+               char s[1024]={0}; // empty message string handed to EXPECT_MAT_NEAR
 
-               EXPECT_MAT_NEAR(dst_roi, cpu_dst, dst_roi.depth() == CV_32F ? 1e-5f : 1.0f, s);
+               EXPECT_MAT_NEAR(dst, dst_cpu, dst.depth() == CV_32F ? 1e-4f : 1.0f, s); // tolerance: 1e-4 for CV_32F, 1.0 for every other depth
 
                Cleanup();
     }
 }
 
-
-
-
-//********test****************
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+                            Values(CV_8U, CV_32F), Values(1, 3, 4))); // depth x channel count; the test body combines them with CV_MAKETYPE
 
 
 #endif // HAVE_OPENCL
index 6bc043e..c50aeb5 100644 (file)
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Zhang Chunpeng chunpeng@multicorewareinc.com
+//    Yao Wang yao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 
 #ifdef HAVE_OPENCL
 
+using namespace cv;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
 
-PARAM_TEST_CASE(PyrUp,cv::Size,int)
+PARAM_TEST_CASE(PyrUp, MatType, int)
 {
-       cv::Size size;
        int type;
+       int channels;
        //std::vector<cv::ocl::Info> oclinfo;
 
        virtual void SetUp()
        {
                //int devnums = cv::ocl::getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
                //CV_Assert(devnums > 0);
-               size = GET_PARAM(0);
-               type = GET_PARAM(1);
+               type = GET_PARAM(0);
+               channels = GET_PARAM(1);
        }
 };
 
 TEST_P(PyrUp,Accuracy)
 {
-       cv::Mat src = randomMat(size,type);
-       
-
-       cv::Mat dst_gold;
-       cv::pyrUp(src,dst_gold);
-
-       cv::ocl::oclMat dst;
-       cv::ocl::oclMat srcMat(src);
-       cv::ocl::pyrUp(srcMat,dst);
-       char s[100]={0};
+       for(int j = 0; j < LOOP_TIMES; j++)
+    {
+               Size size(MWIDTH, MHEIGHT);
+               Mat src = randomMat(size,CV_MAKETYPE(type, channels));  
+               Mat dst_gold;
+               pyrUp(src,dst_gold);
+               ocl::oclMat dst;
+               ocl::oclMat srcMat(src);
+               ocl::pyrUp(srcMat,dst);
+               Mat cpu_dst;
+               dst.download(cpu_dst);
+               char s[100]={0};
 
-       EXPECT_MAT_NEAR(dst_gold, dst, (src.depth() == CV_32F ? 1e-4f : 1.0),s);        
+               EXPECT_MAT_NEAR(dst_gold, cpu_dst, (src.depth() == CV_32F ? 1e-4f : 1.0),s);    
+       }
        
 }
 
-#if 1
+
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
-    testing::Values(cv::Size(32, 32)),
-    testing::Values(MatType(CV_8UC1),MatType(CV_16UC1),MatType(CV_32FC1),MatType(CV_8UC4),
-       MatType(CV_16UC4),MatType(CV_32FC4))));
-#endif
+                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
+
 
 #endif // HAVE_OPENCL
\ No newline at end of file