moved GpuMat and DevMem2D to core module, some code refactoring

author Vladislav Vinogradov <no@email>

Wed, 9 Nov 2011 13:13:52 +0000 (13:13 +0000)

committer Vladislav Vinogradov <no@email>

Wed, 9 Nov 2011 13:13:52 +0000 (13:13 +0000)
author Vladislav Vinogradov <no@email>
Wed, 9 Nov 2011 13:13:52 +0000 (13:13 +0000)
committer Vladislav Vinogradov <no@email>
Wed, 9 Nov 2011 13:13:52 +0000 (13:13 +0000)
diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp

index b618712..d7d98d0 100644 (file)
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -90,6 +90,10 @@ class Mat;
  class SparseMat;
  typedef Mat MatND;
  
+namespace gpu {
+    class GpuMat;
+}
+
  class CV_EXPORTS MatExpr;
  class CV_EXPORTS MatOp_Base;
  class CV_EXPORTS MatArg;
@@ -1627,6 +1631,10 @@ public:
      template<typename _Tp> explicit Mat(const Point3_<_Tp>& pt, bool copyData=true);
      //! builds matrix from comma initializer
      template<typename _Tp> explicit Mat(const MatCommaInitializer_<_Tp>& commaInitializer);
+
+    //! download data from GpuMat
+    explicit Mat(const gpu::GpuMat& m);
+
      //! destructor - calls release()
      ~Mat();
      //! assignment operators
diff --git a/modules/core/include/opencv2/core/devmem2d.hpp b/modules/core/include/opencv2/core/devmem2d.hpp

new file mode 100644 (file)

index 0000000..6ab70c5
--- /dev/null
+++ b/modules/core/include/opencv2/core/devmem2d.hpp
@@ -0,0 +1,157 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#ifndef __OPENCV_CORE_DevMem2D_HPP__\r
+#define __OPENCV_CORE_DevMem2D_HPP__\r
+\r
+#ifdef __CUDACC__ \r
+    #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ \r
+#else\r
+    #define __CV_GPU_HOST_DEVICE__\r
+#endif\r
+\r
+namespace cv\r
+{    \r
+    namespace gpu\r
+    {\r
+        // Simple lightweight structures that encapsulate information about an image on the device.\r
+        // They are intended to be passed to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile.\r
+\r
+        // Compile-time assertion helper: only the <true> specialization is defined,\r
+        // so StaticAssert<cond>::check() fails to compile when cond is false.\r
+        template <bool expr> struct StaticAssert;\r
+        template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};        \r
+\r
+               // Thin wrapper around a raw element pointer. Stores only the pointer,\r
+               // exposes the element size and converts implicitly back to T* / const T*.\r
+               template<typename T> struct DevPtr\r
+               {\r
+                       typedef T elem_type;\r
+                       typedef int index_type;\r
+\r
+                       // size of one element in bytes\r
+                       enum { elem_size = sizeof(elem_type) };\r
+\r
+                       T* data;\r
+\r
+                       __CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}\r
+                       __CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}\r
+\r
+                       __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }\r
+                       __CV_GPU_HOST_DEVICE__ operator       T*()       { return data; }\r
+                       __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }\r
+               };\r
+               \r
+               // DevPtr plus a size field. The default constructor leaves data null and size 0.\r
+               // NOTE(review): whether size counts elements or bytes is not visible here - confirm with callers.\r
+               template<typename T> struct PtrSz : public DevPtr<T>\r
+        {                     \r
+            __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}\r
+            __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}\r
+\r
+            size_t size;\r
+        };\r
+\r
+               // DevPtr plus a row stride in bytes; provides row and element access for 2D data.\r
+               // No bounds checking is performed.\r
+               template<typename T> struct PtrStep : public DevPtr<T>\r
+        {   \r
+            __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}\r
+                       __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}\r
+\r
+            /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */\r
+            size_t step;            \r
+\r
+            // pointer to the first element of row y; the offset is computed on a char* because step is in bytes\r
+                       __CV_GPU_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)DevPtr<T>::data + y * step); }\r
+            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }\r
+\r
+            // element access: row y, column x\r
+                       __CV_GPU_HOST_DEVICE__       T& operator ()(int y, int x)       { return ptr(y)[x]; }\r
+            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }\r
+        };\r
+\r
+               // PtrStep plus the image dimensions.\r
+               // Note the constructor takes (rows, cols) while the fields are declared as cols, rows.\r
+               template <typename T> struct PtrStepSz : public PtrStep<T>\r
+        {   \r
+            __CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}\r
+            __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_) \r
+                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}\r
+\r
+            int cols;\r
+            int rows;                                                                              \r
+        };\r
+\r
+               // 2D view (rows, cols, data, step) built on PtrStepSz.\r
+               // Its constructors are not marked __CV_GPU_HOST_DEVICE__, unlike those of the base structures.\r
+               template <typename T> struct DevMem2D_ : public PtrStepSz<T>\r
+        {            \r
+            DevMem2D_() {}\r
+                       DevMem2D_(int rows_, int cols_, T* data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}\r
+                            \r
+            // explicit conversion from a view of another element type: the data pointer is cast to T*,\r
+            // rows, cols and step are copied unchanged\r
+            template <typename U>            \r
+                       explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}                                                                \r
+        };\r
+                              \r
+        // Variant of PtrStep whose step is measured in elements instead of bytes.\r
+        // The constructor divides the byte step by the element size, so ptr() below\r
+        // uses plain T* arithmetic and hides the byte-based PtrStep::ptr().\r
+        template<typename T> struct PtrElemStep_ : public PtrStep<T>\r
+        {                   \r
+            PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) \r
+            {\r
+                // sizeof(T) must divide 256\r
+                // NOTE(review): presumably so that the byte step is always a multiple of sizeof(T) - confirm\r
+                StaticAssert<256 % sizeof(T) == 0>::check();\r
+\r
+                PtrStep<T>::step /= PtrStep<T>::elem_size;             \r
+            }\r
+            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }\r
+            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }  \r
+\r
+            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }\r
+            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }                  \r
+        };\r
+\r
+               // PtrStep that can be constructed implicitly from a DevMem2D_ of the same element type;\r
+               // only data and step are kept, the dimensions are dropped.\r
+               template<typename T> struct PtrStep_ : public PtrStep<T>\r
+        {            \r
+            PtrStep_() {}\r
+            PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}                        \r
+        };\r
+\r
+        typedef DevMem2D_<unsigned char> DevMem2Db;\r
+               typedef DevMem2Db DevMem2D;\r
+        typedef DevMem2D_<float> DevMem2Df;\r
+        typedef DevMem2D_<int> DevMem2Di;\r
+\r
+        typedef PtrStep<unsigned char> PtrStepb;\r
+        typedef PtrStep<float> PtrStepf;\r
+        typedef PtrStep<int> PtrStepi;\r
+\r
+        typedef PtrElemStep_<unsigned char> PtrElemStep;\r
+        typedef PtrElemStep_<float> PtrElemStepf;\r
+        typedef PtrElemStep_<int> PtrElemStepi;                \r
+    }    \r
+}\r
+\r
+#endif /* __OPENCV_CORE_DevMem2D_HPP__ */\r
diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp

new file mode 100644 (file)

index 0000000..accfb7c
--- /dev/null
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -0,0 +1,471 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#ifndef __OPENCV_GPUMAT_HPP__\r
+#define __OPENCV_GPUMAT_HPP__\r
+\r
+#include "opencv2/core/core.hpp"\r
+#include "opencv2/core/devmem2d.hpp"\r
+\r
+namespace cv { namespace gpu\r
+{\r
+    //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.\r
+    class CV_EXPORTS GpuMat\r
+    {\r
+    public:\r
+        //! default constructor\r
+        GpuMat();\r
+\r
+        //! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)\r
+        GpuMat(int rows, int cols, int type);\r
+        GpuMat(Size size, int type);\r
+\r
+        //! constructs GpuMatrix and fills it with the specified value s.\r
+        GpuMat(int rows, int cols, int type, Scalar s);\r
+        GpuMat(Size size, int type, Scalar s);\r
+\r
+        //! copy constructor\r
+        GpuMat(const GpuMat& m);\r
+\r
+        //! constructor for GpuMatrix headers pointing to user-allocated data\r
+        GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);\r
+        GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);\r
+\r
+        //! creates a matrix header for a part of the bigger matrix\r
+        GpuMat(const GpuMat& m, Range rowRange, Range colRange);\r
+        GpuMat(const GpuMat& m, Rect roi);\r
+        \r
+        //! builds GpuMat from Mat. Performs a blocking upload to the device.\r
+        explicit GpuMat(const Mat& m);\r
+\r
+        //! destructor - calls release()\r
+        ~GpuMat();\r
+\r
+        //! assignment operators\r
+        GpuMat& operator = (const GpuMat& m);\r
+        \r
+        //! performs a blocking upload of data to GpuMat.\r
+        void upload(const Mat& m);\r
+\r
+        //! downloads data from device to host memory. Blocking calls.\r
+        void download(Mat& m) const;\r
+\r
+        //! returns a new GpuMatrix header for the specified row\r
+        GpuMat row(int y) const;\r
+        //! returns a new GpuMatrix header for the specified column\r
+        GpuMat col(int x) const;\r
+        //! ... for the specified row span\r
+        GpuMat rowRange(int startrow, int endrow) const;\r
+        GpuMat rowRange(Range r) const;\r
+        //! ... for the specified column span\r
+        GpuMat colRange(int startcol, int endcol) const;\r
+        GpuMat colRange(Range r) const;\r
+\r
+        //! returns deep copy of the GpuMatrix, i.e. the data is copied\r
+        GpuMat clone() const;\r
+        //! copies the GpuMatrix content to "m".\r
+        // It calls m.create(this->size(), this->type()).\r
+        void copyTo(GpuMat& m) const;\r
+        //! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements.\r
+        void copyTo(GpuMat& m, const GpuMat& mask) const;\r
+        //! converts GpuMatrix to another datatype with optional scaling. See cvConvertScale.\r
+        void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;\r
+\r
+        void assignTo(GpuMat& m, int type=-1) const;\r
+\r
+        //! sets every GpuMatrix element to s\r
+        GpuMat& operator = (Scalar s);\r
+        //! sets some of the GpuMatrix elements to s, according to the mask\r
+        GpuMat& setTo(Scalar s, const GpuMat& mask = GpuMat());\r
+        //! creates alternative GpuMatrix header for the same data, with different\r
+        // number of channels and/or different number of rows. see cvReshape.\r
+        GpuMat reshape(int cn, int rows = 0) const;\r
+\r
+        //! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type.\r
+        // previous data is unreferenced if needed.\r
+        void create(int rows, int cols, int type);\r
+        void create(Size size, int type);\r
+        //! decreases reference counter;\r
+        // deallocate the data when reference counter reaches 0.\r
+        void release();\r
+\r
+        //! swaps with other smart pointer\r
+        void swap(GpuMat& mat);\r
+\r
+        //! locates GpuMatrix header within a parent GpuMatrix. See below\r
+        void locateROI(Size& wholeSize, Point& ofs) const;\r
+        //! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix.\r
+        GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);\r
+        //! extracts a rectangular sub-GpuMatrix\r
+        // (this is a generalized form of row, rowRange etc.)\r
+        GpuMat operator()(Range rowRange, Range colRange) const;\r
+        GpuMat operator()(Rect roi) const;\r
+\r
+        //! returns true iff the GpuMatrix data is continuous\r
+        // (i.e. when there are no gaps between successive rows).\r
+        // similar to CV_IS_MAT_CONT(cvMat->type)\r
+        bool isContinuous() const;\r
+        //! returns element size in bytes,\r
+        // similar to CV_ELEM_SIZE(cvMat->type)\r
+        size_t elemSize() const;\r
+        //! returns the size of element channel in bytes.\r
+        size_t elemSize1() const;\r
+        //! returns element type, similar to CV_MAT_TYPE(cvMat->type)\r
+        int type() const;\r
+        //! returns element depth, similar to CV_MAT_DEPTH(cvMat->type)\r
+        int depth() const;\r
+        //! returns the number of channels, similar to CV_MAT_CN(cvMat->type)\r
+        int channels() const;\r
+        //! returns step/elemSize1()\r
+        size_t step1() const;\r
+        //! returns GpuMatrix size:\r
+        // width == number of columns, height == number of rows\r
+        Size size() const;\r
+        //! returns true if GpuMatrix data is NULL\r
+        bool empty() const;\r
+\r
+        //! returns pointer to y-th row\r
+        uchar* ptr(int y = 0);\r
+        const uchar* ptr(int y = 0) const;\r
+\r
+        //! template version of the above method\r
+        template<typename _Tp> _Tp* ptr(int y = 0);\r
+        template<typename _Tp> const _Tp* ptr(int y = 0) const;\r
+\r
+        template <typename _Tp> operator DevMem2D_<_Tp>() const;\r
+        template <typename _Tp> operator PtrStep_<_Tp>() const;\r
+\r
+        /*! includes several bit-fields:\r
+        - the magic signature\r
+        - continuity flag\r
+        - depth\r
+        - number of channels\r
+        */\r
+        int flags;\r
+\r
+        //! the number of rows and columns\r
+        int rows, cols;\r
+\r
+        //! a distance between successive rows in bytes; includes the gap if any\r
+        size_t step;\r
+\r
+        //! pointer to the data\r
+        uchar* data;\r
+\r
+        //! pointer to the reference counter;\r
+        // when GpuMatrix points to user-allocated data, the pointer is NULL\r
+        int* refcount;\r
+\r
+        //! helper fields used in locateROI and adjustROI\r
+        uchar* datastart;\r
+        uchar* dataend;\r
+    };\r
+\r
+    //! Creates continuous GPU matrix\r
+    CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);\r
+    CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type);\r
+    CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m);\r
+    CV_EXPORTS GpuMat createContinuous(Size size, int type);\r
+\r
+    //! Ensures that the size of the given matrix is not less than (rows, cols)\r
+    //! and that the matrix type matches the specified one\r
+    CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);\r
+    CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);\r
+\r
+    //! Abstract table of the device operations (copy, convert, setTo, allocate, free).\r
+    //! All operations are pure virtual; an implementation is passed to setGpuFuncTable().\r
+    //! NOTE(review): presumably implemented by the gpu module so that core itself needs no CUDA - confirm.\r
+    class CV_EXPORTS GpuFuncTable\r
+    {\r
+    public:\r
+        virtual ~GpuFuncTable() {}\r
+\r
+        //! host -> device, device -> host and device -> device copies\r
+        virtual void copy(const Mat& src, GpuMat& dst) const = 0;\r
+        virtual void copy(const GpuMat& src, Mat& dst) const = 0;\r
+        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;\r
+\r
+        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;\r
+\r
+        //! conversion without and with scaling parameters (alpha, beta)\r
+        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;\r
+        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;\r
+\r
+        virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;\r
+\r
+        //! allocation returns the pointer through devPtr and the row stride through step\r
+        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;\r
+        virtual void free(void* devPtr) const = 0;\r
+    };\r
+\r
+    CV_EXPORTS void setGpuFuncTable(const GpuFuncTable* funcTbl);\r
+\r
+    ////////////////////////////////////////////////////////////////////////\r
+\r
+    //! Default constructor: an empty header with every field zero (no data, no reference counter).\r
+    inline GpuMat::GpuMat() \r
+        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) \r
+    {\r
+    }\r
+\r
+    //! Starts as an empty header and calls create() only when both dimensions are positive;\r
+    //! otherwise the matrix stays empty.\r
+    inline GpuMat::GpuMat(int rows_, int cols_, int type_) \r
+        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)\r
+    {\r
+        if (rows_ > 0 && cols_ > 0)\r
+            create(rows_, cols_, type_);\r
+    }\r
+\r
+    //! Size-based variant: calls create(height, width, type) only when both dimensions are positive;\r
+    //! otherwise the matrix stays empty.\r
+    inline GpuMat::GpuMat(Size size_, int type_) \r
+        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)\r
+    {\r
+        if (size_.height > 0 && size_.width > 0)\r
+            create(size_.height, size_.width, type_);\r
+    }\r
+\r
+    //! Allocates a rows_ x cols_ matrix and fills it with s_ via setTo().\r
+    //! Nothing is allocated or filled unless both dimensions are positive.\r
+    inline GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_) \r
+        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)\r
+    {\r
+        if (rows_ > 0 && cols_ > 0)\r
+        {\r
+            create(rows_, cols_, type_);\r
+            setTo(s_);\r
+        }\r
+    }\r
+\r
+    //! Size-based variant of the filling constructor: create() followed by setTo(s_).\r
+    //! Nothing is allocated or filled unless both dimensions are positive.\r
+    inline GpuMat::GpuMat(Size size_, int type_, Scalar s_) \r
+        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)\r
+    {\r
+        if (size_.height > 0 && size_.width > 0)\r
+        {\r
+            create(size_.height, size_.width, type_);\r
+            setTo(s_);\r
+        }\r
+    }    \r
+\r
+    //! Destructor: delegates entirely to release().\r
+    inline GpuMat::~GpuMat() \r
+    { \r
+        release(); \r
+    }\r
+\r
+    //! Returns a new matrix filled by copyTo(), i.e. a copy of the data rather than of the header.\r
+    inline GpuMat GpuMat::clone() const\r
+    {\r
+        GpuMat duplicate;\r
+        copyTo(duplicate);\r
+\r
+        return duplicate;\r
+    }\r
+\r
+    //! Assigns this matrix to m. A non-negative type requests a conversion through convertTo();\r
+    //! a negative type (the default) makes m another header for the same data.\r
+    inline void GpuMat::assignTo(GpuMat& m, int type) const\r
+    {\r
+        if (type >= 0)\r
+        {\r
+            convertTo(m, type);\r
+            return;\r
+        }\r
+\r
+        m = *this;\r
+    }\r
+\r
+    //! Row stride expressed in channel elements: the byte step divided by elemSize1().\r
+    inline size_t GpuMat::step1() const \r
+    { \r
+        return step / elemSize1(); \r
+    }\r
+\r
+    //! True when the data pointer is null; rows and cols are not consulted.\r
+    inline bool GpuMat::empty() const \r
+    { \r
+        return data == 0; \r
+    }\r
+\r
+    //! Typed row pointer: the uchar* returned by ptr(y) cast to _Tp*. The element type is not checked.\r
+    template<typename _Tp> inline _Tp* GpuMat::ptr(int y)\r
+    {\r
+        return (_Tp*)ptr(y);\r
+    }\r
+\r
+    //! Const typed row pointer: the const uchar* returned by ptr(y) cast to const _Tp*.\r
+    template<typename _Tp> inline const _Tp* GpuMat::ptr(int y) const\r
+    {\r
+        return (const _Tp*)ptr(y);\r
+    }\r
+\r
+    //! Free-function swap for GpuMat; forwards to the member GpuMat::swap().\r
+    inline void swap(GpuMat& a, GpuMat& b) \r
+    { \r
+        a.swap(b); \r
+    }\r
+\r
+    //! Header for the single row y, spanning all columns (row range [y, y+1)).\r
+    inline GpuMat GpuMat::row(int y) const \r
+    { \r
+        return GpuMat(*this, Range(y, y+1), Range::all()); \r
+    }\r
+\r
+    //! Header for the single column x, spanning all rows (column range [x, x+1)).\r
+    inline GpuMat GpuMat::col(int x) const \r
+    { \r
+        return GpuMat(*this, Range::all(), Range(x, x+1)); \r
+    }\r
+\r
+    //! Header for the rows Range(startrow, endrow), spanning all columns.\r
+    inline GpuMat GpuMat::rowRange(int startrow, int endrow) const \r
+    { \r
+        return GpuMat(*this, Range(startrow, endrow), Range::all()); \r
+    }\r
+\r
+    //! Header for the row span r, spanning all columns.\r
+    inline GpuMat GpuMat::rowRange(Range r) const \r
+    { \r
+        return GpuMat(*this, r, Range::all()); \r
+    }\r
+\r
+    //! Header for the columns Range(startcol, endcol), spanning all rows.\r
+    inline GpuMat GpuMat::colRange(int startcol, int endcol) const \r
+    { \r
+        return GpuMat(*this, Range::all(), Range(startcol, endcol)); \r
+    }\r
+\r
+    //! Header for the column span r, spanning all rows.\r
+    inline GpuMat GpuMat::colRange(Range r) const \r
+    { \r
+        return GpuMat(*this, Range::all(), r); \r
+    }\r
+\r
+    //! Size-based overload: forwards to create(rows, cols, type) with rows = height and cols = width.\r
+    inline void GpuMat::create(Size size_, int type_) \r
+    { \r
+        create(size_.height, size_.width, type_); \r
+    }\r
+\r
+    //! Sub-matrix header for the given row and column ranges (uses the Range-based constructor).\r
+    inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const \r
+    { \r
+        return GpuMat(*this, rowRange, colRange); \r
+    }\r
+\r
+    //! Sub-matrix header for the rectangle roi (uses the Rect-based constructor).\r
+    inline GpuMat GpuMat::operator()(Rect roi) const \r
+    { \r
+        return GpuMat(*this, roi); \r
+    }\r
+\r
+    //! Tests the Mat::CONTINUOUS_FLAG bit of flags; the flag is maintained by the constructors.\r
+    inline bool GpuMat::isContinuous() const \r
+    { \r
+        return (flags & Mat::CONTINUOUS_FLAG) != 0; \r
+    }\r
+\r
+    //! Element size in bytes, decoded from flags with CV_ELEM_SIZE.\r
+    inline size_t GpuMat::elemSize() const \r
+    { \r
+        return CV_ELEM_SIZE(flags); \r
+    }\r
+\r
+    //! Size of one element channel in bytes, decoded from flags with CV_ELEM_SIZE1.\r
+    inline size_t GpuMat::elemSize1() const \r
+    { \r
+        return CV_ELEM_SIZE1(flags); \r
+    }\r
+\r
+    //! Element type, decoded from flags with CV_MAT_TYPE.\r
+    inline int GpuMat::type() const \r
+    { \r
+        return CV_MAT_TYPE(flags); \r
+    }\r
+\r
+    //! Element depth, decoded from flags with CV_MAT_DEPTH.\r
+    inline int GpuMat::depth() const \r
+    { \r
+        return CV_MAT_DEPTH(flags); \r
+    }\r
+\r
+    //! Number of channels, decoded from flags with CV_MAT_CN.\r
+    inline int GpuMat::channels() const \r
+    { \r
+        return CV_MAT_CN(flags); \r
+    }\r
+\r
+    //! Matrix size as Size(width = cols, height = rows).\r
+    inline Size GpuMat::size() const \r
+    { \r
+        return Size(cols, rows); \r
+    }\r
+\r
+    //! Pointer to the start of row y (data + step * y).\r
+    //! The index is checked only by CV_DbgAssert; the unsigned comparison also rejects negative y.\r
+    inline uchar* GpuMat::ptr(int y)\r
+    {\r
+        CV_DbgAssert((unsigned)y < (unsigned)rows);\r
+        return data + step * y;\r
+    }\r
+\r
+    //! Const pointer to the start of row y (data + step * y).\r
+    //! The index is checked only by CV_DbgAssert; the unsigned comparison also rejects negative y.\r
+    inline const uchar* GpuMat::ptr(int y) const\r
+    {\r
+        CV_DbgAssert((unsigned)y < (unsigned)rows);\r
+        return data + step * y;\r
+    }\r
+\r
+    //! Fills the matrix with s by calling setTo(s) with the default (empty) mask; returns *this.\r
+    inline GpuMat& GpuMat::operator = (Scalar s)\r
+    {\r
+        setTo(s);\r
+        return *this;\r
+    }\r
+\r
+    //! Builds a DevMem2D_<T> view of this matrix (rows, cols, data cast to T*, byte step).\r
+    //! T is not checked against the stored element type.\r
+    template <class T> inline GpuMat::operator DevMem2D_<T>() const \r
+    { \r
+        return DevMem2D_<T>(rows, cols, (T*)data, step); \r
+    }\r
+\r
+    //! Builds a PtrStep_<T> by first converting to DevMem2D_<T>; the dimensions are dropped.\r
+    template <class T> inline GpuMat::operator PtrStep_<T>() const \r
+    { \r
+        return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this)); \r
+    }\r
+\r
+    //! Returning overload: creates a fresh matrix through createContinuous(rows, cols, type, m).\r
+    inline GpuMat createContinuous(int rows, int cols, int type)\r
+    {\r
+        GpuMat m;\r
+        createContinuous(rows, cols, type, m);\r
+        return m;\r
+    }\r
+\r
+    //! Size-based overload: forwards with rows = height and cols = width.\r
+    inline void createContinuous(Size size, int type, GpuMat& m)\r
+    {\r
+        createContinuous(size.height, size.width, type, m);\r
+    }\r
+\r
+    //! Size-based returning overload: creates a fresh matrix through createContinuous(size, type, m).\r
+    inline GpuMat createContinuous(Size size, int type)\r
+    {\r
+        GpuMat m;\r
+        createContinuous(size, type, m);\r
+        return m;\r
+    }\r
+\r
+    //! Size-based overload: forwards with rows = height and cols = width.\r
+    inline void ensureSizeIsEnough(Size size, int type, GpuMat& m)\r
+    {\r
+        ensureSizeIsEnough(size.height, size.width, type, m);\r
+    }\r
+\r
+    inline void createContinuous(int rows, int cols, int type, GpuMat& m)\r
+    {\r
+        int area = rows * cols;\r
+        if (!m.isContinuous() || m.type() != type || m.size().area() != area)\r
+            m.create(1, area, type);\r
+        m = m.reshape(0, rows);\r
+    }\r
+\r
+    inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)\r
+    {\r
+        if (m.type() == type && m.rows >= rows && m.cols >= cols)\r
+            m = m(Rect(0, 0, cols, rows));\r
+        else\r
+            m.create(rows, cols, type);\r
+    }\r
+}}\r
+\r
+#endif // __OPENCV_GPUMAT_HPP__\r
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp

new file mode 100644 (file)

index 0000000..2dffee4
--- /dev/null
+++ b/modules/core/src/gpumat.cpp
@@ -0,0 +1,460 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////\r
+//\r
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
+//\r
+//  By downloading, copying, installing or using the software you agree to this license.\r
+//  If you do not agree to this license, do not download, install,\r
+//  copy or use the software.\r
+//\r
+//\r
+//                           License Agreement\r
+//                For Open Source Computer Vision Library\r
+//\r
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
+// Third party copyrights are property of their respective owners.\r
+//\r
+// Redistribution and use in source and binary forms, with or without modification,\r
+// are permitted provided that the following conditions are met:\r
+//\r
+//   * Redistribution's of source code must retain the above copyright notice,\r
+//     this list of conditions and the following disclaimer.\r
+//\r
+//   * Redistribution's in binary form must reproduce the above copyright notice,\r
+//     this list of conditions and the following disclaimer in the documentation\r
+//     and/or other materials provided with the distribution.\r
+//\r
+//   * The name of the copyright holders may not be used to endorse or promote products\r
+//     derived from this software without specific prior written permission.\r
+//\r
+// This software is provided by the copyright holders and contributors "as is" and\r
+// any express or implied warranties, including, but not limited to, the implied\r
+// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
+// In no event shall the Intel Corporation or contributors be liable for any direct,\r
+// indirect, incidental, special, exemplary, or consequential damages\r
+// (including, but not limited to, procurement of substitute goods or services;\r
+// loss of use, data, or profits; or business interruption) however caused\r
+// and on any theory of liability, whether in contract, strict liability,\r
+// or tort (including negligence or otherwise) arising in any way out of\r
+// the use of this software, even if advised of the possibility of such damage.\r
+//\r
+//M*/\r
+\r
+#include "precomp.hpp"\r
+#include "opencv2/core/gpumat.hpp"\r
+\r
+using namespace std;\r
+using namespace cv;\r
+using namespace cv::gpu;\r
+\r
+//! Copy constructor: copies every header field of m (the data itself is not copied) and,\r
+//! when m has a reference counter, increments it with CV_XADD.\r
+cv::gpu::GpuMat::GpuMat(const GpuMat& m) \r
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)\r
+{\r
+    if (refcount)\r
+        CV_XADD(refcount, 1);\r
+}\r
+\r
+//! Builds a header over user-allocated data; refcount is left at 0.\r
+//! step_ is the row stride in bytes; Mat::AUTO_STEP means "rows are tightly packed".\r
+cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) : \r
+    flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(rows_), cols(cols_), \r
+    step(step_), data((uchar*)data_), refcount(0),\r
+    datastart((uchar*)data_), dataend((uchar*)data_)\r
+{\r
+    // smallest possible stride: one row of elements without padding\r
+    size_t minstep = cols * elemSize();\r
+\r
+    if (step == Mat::AUTO_STEP)\r
+    {\r
+        step = minstep;\r
+        flags |= Mat::CONTINUOUS_FLAG;\r
+    }\r
+    else\r
+    {\r
+        // for a single row the supplied stride is ignored\r
+        if (rows == 1) \r
+            step = minstep;\r
+\r
+        CV_DbgAssert(step >= minstep);\r
+\r
+        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;\r
+    }\r
+    // dataend points just past the last element of the last row (that row carries no trailing padding)\r
+    dataend += step * (rows - 1) + minstep;\r
+}\r
+\r
+//! Size-based variant of the user-allocated-data constructor (rows = height, cols = width);\r
+//! refcount is left at 0. step_ is the row stride in bytes; Mat::AUTO_STEP means "rows are tightly packed".\r
+cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : \r
+    flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(size_.height), cols(size_.width),\r
+    step(step_), data((uchar*)data_), refcount(0),\r
+    datastart((uchar*)data_), dataend((uchar*)data_)\r
+{\r
+    // smallest possible stride: one row of elements without padding\r
+    size_t minstep = cols * elemSize();\r
+\r
+    if (step == Mat::AUTO_STEP)\r
+    {\r
+        step = minstep;\r
+        flags |= Mat::CONTINUOUS_FLAG;\r
+    }\r
+    else\r
+    {\r
+        // for a single row the supplied stride is ignored\r
+        if (rows == 1) \r
+            step = minstep;\r
+\r
+        CV_DbgAssert(step >= minstep);\r
+\r
+        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;\r
+    }\r
+    // dataend points just past the last element of the last row (that row carries no trailing padding)\r
+    dataend += step * (rows - 1) + minstep;\r
+}\r
+\r
+//! Creates a header for the sub-matrix of m selected by rowRange and colRange.\r
+//! The data is shared with m: only the header changes and the reference counter is incremented.\r
+//! Range::all() keeps the full extent in that dimension; other ranges are validated with CV_Assert.\r
+cv::gpu::GpuMat::GpuMat(const GpuMat& m, Range rowRange, Range colRange)\r
+{\r
+    flags = m.flags;\r
+    step = m.step; refcount = m.refcount;\r
+    data = m.data; datastart = m.datastart; dataend = m.dataend;\r
+\r
+    if (rowRange == Range::all())\r
+        rows = m.rows;\r
+    else\r
+    {\r
+        CV_Assert(0 <= rowRange.start && rowRange.start <= rowRange.end && rowRange.end <= m.rows);\r
+\r
+        rows = rowRange.size();\r
+        data += step*rowRange.start;\r
+    }\r
+\r
+    if (colRange == Range::all())\r
+        cols = m.cols;\r
+    else\r
+    {\r
+        CV_Assert(0 <= colRange.start && colRange.start <= colRange.end && colRange.end <= m.cols);\r
+\r
+        cols = colRange.size();\r
+        data += colRange.start*elemSize();\r
+        // a narrower column span leaves gaps between successive rows\r
+        flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;\r
+    }\r
+\r
+    // a single row is always continuous\r
+    if (rows == 1)\r
+        flags |= Mat::CONTINUOUS_FLAG;\r
+\r
+    if (refcount)\r
+        CV_XADD(refcount, 1);\r
+\r
+    // normalize an empty selection to 0 x 0\r
+    if (rows <= 0 || cols <= 0)\r
+        rows = cols = 0;\r
+}\r
+\r
+// Builds a header for the rectangular region roi of m.\r
+// The header points into m's data (nothing is copied) and bumps m's\r
+// reference counter when there is one.\r
+// Raises via CV_Assert if roi does not lie inside m.\r
+cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) : \r
+    flags(m.flags), rows(roi.height), cols(roi.width),\r
+    step(m.step), data(m.data), refcount(m.refcount),\r
+    datastart(m.datastart), dataend(m.dataend)\r
+{\r
+    // Validate first so the pointer is never offset by an out-of-range ROI\r
+    CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows);\r
+\r
+    data += roi.y * step + roi.x * elemSize();\r
+\r
+    // Dropping columns leaves gaps between consecutive rows\r
+    if (roi.width < m.cols)\r
+        flags &= ~Mat::CONTINUOUS_FLAG;\r
+\r
+    // A single row is continuous regardless of the pitch; this matches the\r
+    // Range-based constructor, adjustROI() and create()\r
+    if (rows == 1)\r
+        flags |= Mat::CONTINUOUS_FLAG;\r
+\r
+    if (refcount)\r
+        CV_XADD(refcount, 1);\r
+\r
+    // Normalize degenerate selections to an empty 0x0 shape\r
+    if (rows <= 0 || cols <= 0)\r
+        rows = cols = 0;\r
+}\r
+\r
+cv::gpu::GpuMat::GpuMat(const Mat& m) : \r
+    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) \r
+{ \r
+    upload(m); \r
+}\r
+\r
+// Copy-and-swap assignment: the previous contents of *this end up in a\r
+// temporary and are dropped when it goes out of scope.\r
+GpuMat& cv::gpu::GpuMat::operator = (const GpuMat& m)\r
+{\r
+    // Self-assignment needs no work\r
+    if (this == &m)\r
+        return *this;\r
+\r
+    GpuMat copy(m);\r
+    swap(copy);\r
+\r
+    return *this;\r
+}\r
+\r
+// Exchanges every header field of *this with b; reference counters are\r
+// swapped as-is, not incremented or decremented.\r
+void cv::gpu::GpuMat::swap(GpuMat& b)\r
+{\r
+    // type / shape\r
+    std::swap(flags, b.flags);\r
+    std::swap(rows, b.rows);\r
+    std::swap(cols, b.cols);\r
+    std::swap(step, b.step);\r
+\r
+    // ownership\r
+    std::swap(refcount, b.refcount);\r
+\r
+    // memory pointers\r
+    std::swap(data, b.data);\r
+    std::swap(datastart, b.datastart);\r
+    std::swap(dataend, b.dataend);\r
+}\r
+\r
+// Recovers, from data/datastart/dataend/step, where this header sits inside\r
+// the buffer it points into.\r
+//   wholeSize - receives the size derived for the whole buffer\r
+//   ofs       - receives the (x, y) element offset of this header's first\r
+//               element relative to datastart\r
+// Raises via CV_Assert when step is zero (e.g. an empty matrix), because\r
+// step is used as a divisor below.\r
+// NOTE(review): the derived width is the byte span of the last row divided by\r
+// the element size, so for pitched buffers it may exceed the width that was\r
+// originally requested - confirm this is intended.\r
+void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const\r
+{\r
+    // Must hold in release builds too: every division below uses step\r
+    CV_Assert(step > 0);\r
+\r
+    size_t esz = elemSize();\r
+    ptrdiff_t delta1 = data - datastart;\r
+    ptrdiff_t delta2 = dataend - datastart;\r
+\r
+    if (delta1 == 0)\r
+        ofs.x = ofs.y = 0;\r
+    else\r
+    {\r
+        // Whole pitches give the row offset, the remainder the column offset\r
+        ofs.y = static_cast<int>(delta1 / step);\r
+        ofs.x = static_cast<int>((delta1 - step * ofs.y) / esz);\r
+\r
+        CV_DbgAssert(data == datastart + ofs.y * step + ofs.x * esz);\r
+    }\r
+\r
+    size_t minstep = (ofs.x + cols) * esz;\r
+\r
+    // Never report a buffer smaller than what this header already covers\r
+    wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / step + 1), ofs.y + rows);\r
+    wholeSize.width = std::max(static_cast<int>((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols);\r
+}\r
+\r
+// Moves the borders of this header within the buffer located by locateROI().\r
+// Positive dtop/dbottom/dleft/dright push the respective border outwards by\r
+// that many rows/columns; the result is clamped to the located buffer size.\r
+// Returns *this.\r
+GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)\r
+{\r
+    Size parentSize;\r
+    Point origin;\r
+    locateROI(parentSize, origin);\r
+\r
+    const size_t elemBytes = elemSize();\r
+\r
+    // New borders, clamped to the located buffer\r
+    const int top    = std::max(origin.y - dtop, 0);\r
+    const int bottom = std::min(origin.y + rows + dbottom, parentSize.height);\r
+    const int left   = std::max(origin.x - dleft, 0);\r
+    const int right  = std::min(origin.x + cols + dright, parentSize.width);\r
+\r
+    data += (top - origin.y) * step + (left - origin.x) * elemBytes;\r
+    rows = bottom - top;\r
+    cols = right - left;\r
+\r
+    // Continuous when rows are tightly packed or there is only one row\r
+    const bool packed = elemBytes * cols == step || rows == 1;\r
+    if (packed)\r
+        flags |= Mat::CONTINUOUS_FLAG;\r
+    else\r
+        flags &= ~Mat::CONTINUOUS_FLAG;\r
+\r
+    return *this;\r
+}\r
+\r
+// Returns a header copied from *this with a different number of channels\r
+// and/or rows; only the header fields (rows, cols, step, channel bits in\r
+// flags) are modified here.\r
+//   new_cn   - new channel count; 0 keeps the current one\r
+//   new_rows - new row count; 0 keeps the current one, unless the current row\r
+//              width cannot be expressed in new_cn channels, in which case\r
+//              the row count is derived from the total element count\r
+// Raises CV_BadStep if the row count must change and the matrix is not\r
+// continuous, CV_StsOutOfRange / CV_StsBadArg for an unusable row count, and\r
+// CV_BadNumChannels if the row width is not divisible by new_cn.\r
+GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const\r
+{\r
+    GpuMat hdr = *this;\r
+\r
+    int cn = channels();\r
+    if (new_cn == 0)\r
+        new_cn = cn;\r
+\r
+    // Row width measured in single-channel elements\r
+    int total_width = cols * cn;\r
+\r
+    // The current rows cannot hold whole new_cn-channel elements: re-split rows\r
+    if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)\r
+        new_rows = rows * total_width / new_cn;\r
+\r
+    if (new_rows != 0 && new_rows != rows)\r
+    {\r
+        int total_size = total_width * rows;\r
+\r
+        if (!isContinuous())\r
+            CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed");\r
+\r
+        // The unsigned comparison also rejects negative new_rows\r
+        if ((unsigned)new_rows > (unsigned)total_size)\r
+            CV_Error(CV_StsOutOfRange, "Bad new number of rows");\r
+\r
+        total_width = total_size / new_rows;\r
+\r
+        if (total_width * new_rows != total_size)\r
+            CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");\r
+\r
+        hdr.rows = new_rows;\r
+        hdr.step = total_width * elemSize1();\r
+    }\r
+\r
+    int new_width = total_width / new_cn;\r
+\r
+    if (new_width * new_cn != total_width)\r
+        CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");\r
+\r
+    hdr.cols = new_width;\r
+    // Replace the channel-count bits of the type, keep everything else\r
+    hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);\r
+\r
+    return hdr;\r
+}\r
+\r
+// Constructs an empty host matrix and fills it through GpuMat::download().\r
+cv::Mat::Mat(const GpuMat& m) :\r
+    flags(0), dims(0), rows(0), cols(0),\r
+    data(0), refcount(0),\r
+    datastart(0), dataend(0), datalimit(0),\r
+    allocator(0), size(&rows)\r
+{\r
+    m.download(*this);\r
+}\r
+\r
+namespace\r
+{\r
+    // Raises CV_GpuNotSupported; shared by every stub below.\r
+    void throw_nogpu()\r
+    {\r
+        CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support");\r
+    }\r
+\r
+    // Function table used while no real one has been registered: every\r
+    // operation reports missing GPU support, except free(), which does nothing.\r
+    class EmptyFuncTable : public GpuFuncTable\r
+    {\r
+    public:\r
+        // host <-> device and device <-> device copies\r
+        void copy(const Mat&, GpuMat&) const { throw_nogpu(); }\r
+        void copy(const GpuMat&, Mat&) const { throw_nogpu(); }\r
+        void copy(const GpuMat&, GpuMat&) const { throw_nogpu(); }\r
+\r
+        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu(); }\r
+\r
+        // type conversion, without and with scale/shift\r
+        void convert(const GpuMat&, GpuMat&) const { throw_nogpu(); }\r
+        void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu(); }\r
+\r
+        void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu(); }\r
+\r
+        // allocation always fails, so free() has nothing to do\r
+        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu(); }\r
+        void free(void*) const {}\r
+    };\r
+\r
+    // Table installed through setGpuFuncTable(); null until then.\r
+    const GpuFuncTable* g_funcTbl = 0;\r
+\r
+    // Returns the installed table, or the throwing stub table if none is set.\r
+    const GpuFuncTable* gpuFuncTable()\r
+    {\r
+        static EmptyFuncTable empty;\r
+\r
+        if (g_funcTbl)\r
+            return g_funcTbl;\r
+\r
+        return &empty;\r
+    }\r
+}\r
+\r
+// Installs funcTbl as the table returned by gpuFuncTable(). Only the pointer\r
+// is stored; passing null makes gpuFuncTable() fall back to the stub table.\r
+void cv::gpu::setGpuFuncTable(const GpuFuncTable* funcTbl)\r
+{\r
+    g_funcTbl = funcTbl;\r
+}\r
+\r
+// Resizes this matrix to the size and type of m, then copies m into it\r
+// through the active function table. m must not be empty (debug-checked).\r
+void cv::gpu::GpuMat::upload(const Mat& m)\r
+{\r
+    CV_DbgAssert(!m.empty());\r
+\r
+    // Destination storage must match the source before the copy\r
+    create(m.size(), m.type());\r
+\r
+    const GpuFuncTable* const table = gpuFuncTable();\r
+    table->copy(m, *this);\r
+}\r
+\r
+// Resizes m to the size and type of this matrix, then copies this matrix\r
+// into it through the active function table. *this must not be empty\r
+// (debug-checked).\r
+void cv::gpu::GpuMat::download(Mat& m) const\r
+{\r
+    CV_DbgAssert(!empty());\r
+\r
+    // Destination storage must match the source before the copy\r
+    m.create(size(), type());\r
+\r
+    const GpuFuncTable* const table = gpuFuncTable();\r
+    table->copy(*this, m);\r
+}\r
+\r
+// Resizes m to the size and type of this matrix, then copies this matrix\r
+// into it through the active function table. *this must not be empty\r
+// (debug-checked).\r
+void cv::gpu::GpuMat::copyTo(GpuMat& m) const\r
+{\r
+    CV_DbgAssert(!empty());\r
+\r
+    // Destination storage must match the source before the copy\r
+    m.create(size(), type());\r
+\r
+    const GpuFuncTable* const table = gpuFuncTable();\r
+    table->copy(*this, m);\r
+}\r
+\r
+// Masked copy. With an empty mask this is the plain copyTo(); otherwise mat\r
+// is resized to this matrix's size and type and the copy is delegated to the\r
+// function table's copyWithMask().\r
+void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const\r
+{\r
+    // No mask: fall back to the unconditional copy\r
+    if (mask.empty())\r
+    {\r
+        copyTo(mat);\r
+        return;\r
+    }\r
+\r
+    mat.create(size(), type());\r
+\r
+    gpuFuncTable()->copyWithMask(*this, mat, mask);\r
+}\r
+\r
+// Converts this matrix to another depth, optionally applying alpha and beta.\r
+//   dst   - destination; resized to this matrix's size and the target type\r
+//   rtype - desired type; only its depth is used (the channel count is taken\r
+//           from this matrix); a negative value keeps the current type\r
+//   alpha, beta - passed to the scaling convert(); when they are 1 and 0\r
+//           (within machine epsilon) the non-scaling path is used instead\r
+void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const\r
+{\r
+    const double eps = numeric_limits<double>::epsilon();\r
+    const bool identityScale = fabs(alpha - 1) < eps && fabs(beta) < eps;\r
+\r
+    rtype = rtype < 0 ? type() : CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());\r
+\r
+    const int srcDepth = depth();\r
+    const int dstDepth = CV_MAT_DEPTH(rtype);\r
+\r
+    // Same depth and no scaling: a plain copy is enough\r
+    if (srcDepth == dstDepth && identityScale)\r
+    {\r
+        copyTo(dst);\r
+        return;\r
+    }\r
+\r
+    // In-place conversion to another depth: keep the source alive in a second\r
+    // header, because dst.create() below releases and reallocates dst\r
+    GpuMat keepAlive;\r
+    const GpuMat* source = this;\r
+    if (srcDepth != dstDepth && this == &dst)\r
+    {\r
+        keepAlive = *this;\r
+        source = &keepAlive;\r
+    }\r
+\r
+    dst.create(size(), rtype);\r
+\r
+    if (identityScale)\r
+        gpuFuncTable()->convert(*source, dst);\r
+    else\r
+        gpuFuncTable()->convert(*source, dst, alpha, beta);\r
+}\r
+\r
+// Sets matrix elements to s through the active function table.\r
+//   mask - empty, or a CV_8UC1 matrix (anything else raises via CV_Assert)\r
+// *this must not be empty (debug-checked). Returns *this.\r
+GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask)\r
+{\r
+    CV_Assert(mask.empty() || mask.type() == CV_8UC1);\r
+    CV_DbgAssert(!empty());\r
+\r
+    const GpuFuncTable* const table = gpuFuncTable();\r
+    table->setTo(*this, s, mask);\r
+\r
+    return *this;\r
+}\r
+\r
+// Makes this matrix a _rows x _cols matrix of type _type.\r
+// Nothing happens if it already has data of exactly that shape and type;\r
+// otherwise the current data is released and, for a non-empty shape, new\r
+// storage is obtained through the function table's mallocPitch() together\r
+// with a fresh reference counter set to 1.\r
+void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)\r
+{\r
+    _type &= TYPE_MASK;\r
+\r
+    const bool reusable = data && rows == _rows && cols == _cols && type() == _type;\r
+    if (reusable)\r
+        return;\r
+\r
+    if (data)\r
+        release();\r
+\r
+    CV_DbgAssert(_rows >= 0 && _cols >= 0);\r
+\r
+    // An empty shape needs no storage\r
+    if (_rows <= 0 || _cols <= 0)\r
+        return;\r
+\r
+    // flags must be set first: elemSize() is derived from them\r
+    flags = Mat::MAGIC_VAL + _type;\r
+    rows = _rows;\r
+    cols = _cols;\r
+\r
+    const size_t rowBytes = elemSize() * cols;\r
+\r
+    void* devPtr;\r
+    gpuFuncTable()->mallocPitch(&devPtr, &step, rowBytes, rows);\r
+\r
+    // Single row must be continuous\r
+    if (rows == 1)\r
+        step = rowBytes;\r
+\r
+    if (rowBytes == step)\r
+        flags |= Mat::CONTINUOUS_FLAG;\r
+\r
+    // Total byte span of the allocation: one full pitch per row\r
+    const int64 spanWide = static_cast<int64>(step) * rows;\r
+    const size_t span = static_cast<size_t>(spanWide);\r
+\r
+    datastart = data = static_cast<uchar*>(devPtr);\r
+    dataend = data + span;\r
+\r
+    refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));\r
+    *refcount = 1;\r
+}\r
+\r
+// Drops this header's reference to its data and resets the header to empty.\r
+// When the reference counter reaches zero, the counter itself is freed and\r
+// the data is handed to the function table's free(). Headers without a\r
+// counter are only reset.\r
+void cv::gpu::GpuMat::release()\r
+{\r
+    // CV_XADD returns the previous value, so 1 means this was the last owner\r
+    const bool lastOwner = refcount && CV_XADD(refcount, -1) == 1;\r
+    if (lastOwner)\r
+    {\r
+        fastFree(refcount);\r
+\r
+        gpuFuncTable()->free(datastart);\r
+    }\r
+\r
+    refcount = 0;\r
+    data = datastart = dataend = 0;\r
+    rows = cols = 0;\r
+    step = 0;\r
+}\r
diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt

index 74ccc32..fcef9b9 100644 (file)
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -3,7 +3,8 @@ set(name "gpu")
  set(the_target "opencv_${name}")
  project(${the_target})
  
-set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed 
+set(DEPS "opencv_core" "opencv_imgproc" "opencv_calib3d" "opencv_objdetect")
+set(DEPS_HEADER ${DEPS} "opencv_features2d" "opencv_flann")
  set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
  
  include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
@@ -27,6 +28,13 @@ file(GLOB lib_device_hdrs_detail "src/opencv2/gpu/device/detail/*.h*")
  source_group("Device" FILES ${lib_device_hdrs})
  source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
  
+foreach(d ${DEPS_HEADER})
+       if(${d} MATCHES "opencv_")
+               string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
+               include_directories("${d_dir}/include")
+       endif()
+endforeach()
+
  if (HAVE_CUDA)
      file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")     
      file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
@@ -50,7 +58,6 @@ if (HAVE_CUDA)
      if (APPLE)
          set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fno-finite-math-only;")
      endif()
-    
  
      string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
      string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
@@ -60,7 +67,7 @@ if (HAVE_CUDA)
          #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
          #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
          #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408 /wd4251")
  
          string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
          string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
@@ -69,22 +76,19 @@ if (HAVE_CUDA)
          string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
          string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
      endif()
-       
+
      if (BUILD_SHARED_LIBS)
-               set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
-       endif()
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
+    endif()
      
+    if(MSVC)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/wd4251")
+    endif()
+
      CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
      #CUDA_BUILD_CLEAN_TARGET()
  endif()
  
-foreach(d ${DEPS})
-       if(${d} MATCHES "opencv_")
-               string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
-               include_directories("${d_dir}/include")
-       endif()
-endforeach()
-
  add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
  
  # For dynamic link numbering convenions
diff --git a/modules/gpu/include/opencv2/gpu/devmem2d.hpp b/modules/gpu/include/opencv2/gpu/devmem2d.hpp

index e454f00..33af66a 100644 (file)
--- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp
+++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
@@ -40,122 +40,4 @@
  //\r
  //M*/\r
  \r
-#ifndef __OPENCV_GPU_DevMem2D_HPP__\r
-#define __OPENCV_GPU_DevMem2D_HPP__\r
-\r
-\r
-namespace cv\r
-{    \r
-    namespace gpu\r
-    {\r
-        // Simple lightweight structures that encapsulates information about an image on device.\r
-        // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile\r
-\r
-#if defined(__CUDACC__) \r
-    #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ \r
-#else\r
-    #define __CV_GPU_HOST_DEVICE__\r
-#endif\r
-\r
-        template <bool expr> struct StaticAssert;\r
-        template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};        \r
-\r
-               template<typename T> struct DevPtr\r
-               {\r
-                       typedef T elem_type;\r
-                       typedef int index_type;\r
-\r
-                       enum { elem_size = sizeof(elem_type) };\r
-\r
-                       T* data;\r
-\r
-                       __CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}\r
-                       __CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}\r
-\r
-                       __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }\r
-                       __CV_GPU_HOST_DEVICE__ operator       T*()       { return data; }\r
-                       __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }\r
-               };\r
-               \r
-               template<typename T> struct PtrSz : public DevPtr<T>\r
-        {                     \r
-            __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}\r
-            __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}\r
-\r
-            size_t size;\r
-        };\r
-\r
-               template<typename T> struct PtrStep : public DevPtr<T>\r
-        {   \r
-            __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}\r
-                       __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}\r
-\r
-            /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */\r
-            size_t step;            \r
-\r
-                       __CV_GPU_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)DevPtr<T>::data + y * step); }\r
-            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }\r
-\r
-                       __CV_GPU_HOST_DEVICE__       T& operator ()(int y, int x)       { return ptr(y)[x]; }\r
-            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }\r
-        };\r
-\r
-               template <typename T> struct PtrStepSz : public PtrStep<T>\r
-        {   \r
-            __CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}\r
-            __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_) \r
-                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}\r
-\r
-            int cols;\r
-            int rows;                                                                              \r
-        };\r
-\r
-               template <typename T> struct DevMem2D_ : public PtrStepSz<T>\r
-        {            \r
-            DevMem2D_() {}\r
-                       DevMem2D_(int rows_, int cols_, T *data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}\r
-                            \r
-            template <typename U>            \r
-                       explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}                                                                \r
-        };\r
-                              \r
-        template<typename T> struct PtrElemStep_ : public PtrStep<T>\r
-        {                   \r
-            PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) \r
-            {\r
-                StaticAssert<256 % sizeof(T) == 0>::check();\r
-\r
-                PtrStep<T>::step /= PtrStep<T>::elem_size;             \r
-            }\r
-            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }\r
-            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }  \r
-\r
-            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }\r
-            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }                  \r
-        };\r
-\r
-               template<typename T> struct PtrStep_ : public PtrStep<T>\r
-        {            \r
-            PtrStep_() {}\r
-            PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}                        \r
-        };\r
-\r
-#undef __CV_GPU_HOST_DEVICE__\r
-\r
-\r
-        typedef DevMem2D_<unsigned char> DevMem2Db;\r
-               typedef DevMem2Db DevMem2D;\r
-        typedef DevMem2D_<float> DevMem2Df;\r
-        typedef DevMem2D_<int> DevMem2Di;\r
-\r
-        typedef PtrStep<unsigned char> PtrStepb;\r
-        typedef PtrStep<float> PtrStepf;\r
-        typedef PtrStep<int> PtrStepi;\r
-\r
-        typedef PtrElemStep_<unsigned char> PtrElemStep;\r
-        typedef PtrElemStep_<float> PtrElemStepf;\r
-        typedef PtrElemStep_<int> PtrElemStepi;                \r
-    }    \r
-}\r
-\r
-#endif /* __OPENCV_GPU_DevMem2D_HPP__ */\r
+#include "opencv2/core/devmem2d.hpp"\r
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp

index 38f6a95..ffa32fb 100644 (file)
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -43,1520 +43,1530 @@
  #ifndef __OPENCV_GPU_HPP__\r
  #define __OPENCV_GPU_HPP__\r
  \r
+#ifndef SKIP_INCLUDES\r
  #include <vector>\r
-#include "opencv2/core/core.hpp"\r
+#endif\r
+\r
+#include "opencv2/core/gpumat.hpp"\r
  #include "opencv2/imgproc/imgproc.hpp"\r
  #include "opencv2/objdetect/objdetect.hpp"\r
  #include "opencv2/features2d/features2d.hpp"\r
-#include "opencv2/gpu/gpumat.hpp"\r
  \r
-namespace cv\r
-{\r
-    namespace gpu\r
-    {\r
-        //////////////////////////////// Initialization & Info ////////////////////////\r
-\r
-        //! This is the only function that do not throw exceptions if the library is compiled without Cuda.\r
-        CV_EXPORTS int getCudaEnabledDeviceCount();\r
-\r
-        //! Functions below throw cv::Expception if the library is compiled without Cuda.\r
-\r
-        CV_EXPORTS void setDevice(int device);\r
-        CV_EXPORTS int getDevice();\r
-\r
-        //! Explicitly destroys and cleans up all resources associated with the current device in the current process. \r
-        //! Any subsequent API call to this device will reinitialize the device.\r
-        CV_EXPORTS void resetDevice();\r
-\r
-        enum FeatureSet\r
-        {\r
-            FEATURE_SET_COMPUTE_10 = 10,\r
-            FEATURE_SET_COMPUTE_11 = 11,\r
-            FEATURE_SET_COMPUTE_12 = 12,\r
-            FEATURE_SET_COMPUTE_13 = 13,\r
-            FEATURE_SET_COMPUTE_20 = 20,\r
-            FEATURE_SET_COMPUTE_21 = 21,\r
-            GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,\r
-            SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,\r
-            NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13\r
-        };\r
+namespace cv { namespace gpu {\r
  \r
-        // Gives information about what GPU archs this OpenCV GPU module was \r
-        // compiled for\r
-        class CV_EXPORTS TargetArchs\r
-        {\r
-        public:\r
-            static bool builtWith(FeatureSet feature_set);\r
-            static bool has(int major, int minor);\r
-            static bool hasPtx(int major, int minor);\r
-            static bool hasBin(int major, int minor);\r
-            static bool hasEqualOrLessPtx(int major, int minor);\r
-            static bool hasEqualOrGreater(int major, int minor);\r
-            static bool hasEqualOrGreaterPtx(int major, int minor);\r
-            static bool hasEqualOrGreaterBin(int major, int minor);\r
-        private:\r
-            TargetArchs();\r
-        };\r
+//////////////////////////////// Initialization & Info ////////////////////////\r
  \r
-        // Gives information about the given GPU\r
-        class CV_EXPORTS DeviceInfo\r
-        {\r
-        public:\r
-            // Creates DeviceInfo object for the current GPU\r
-            DeviceInfo() : device_id_(getDevice()) { query(); }\r
+//! This is the only function that does not throw exceptions if the library is compiled without Cuda.\r
+CV_EXPORTS int getCudaEnabledDeviceCount();\r
  \r
-            // Creates DeviceInfo object for the given GPU\r
-            DeviceInfo(int device_id) : device_id_(device_id) { query(); }\r
+//! Functions below throw cv::Exception if the library is compiled without Cuda.\r
  \r
-            string name() const { return name_; }\r
-\r
-            // Return compute capability versions\r
-            int majorVersion() const { return majorVersion_; }\r
-            int minorVersion() const { return minorVersion_; }\r
+CV_EXPORTS void setDevice(int device);\r
+CV_EXPORTS int getDevice();\r
  \r
-            int multiProcessorCount() const { return multi_processor_count_; }\r
+//! Explicitly destroys and cleans up all resources associated with the current device in the current process. \r
+//! Any subsequent API call to this device will reinitialize the device.\r
+CV_EXPORTS void resetDevice();\r
  \r
-            size_t freeMemory() const;\r
-            size_t totalMemory() const;\r
+// Feature sets that can be queried through TargetArchs::builtWith() and\r
+// DeviceInfo::supports().\r
+// NOTE(review): the numeric values presumably encode the CUDA compute\r
+// capability as major*10 + minor - confirm against the implementation.\r
+enum FeatureSet\r
+{\r
+    FEATURE_SET_COMPUTE_10 = 10,\r
+    FEATURE_SET_COMPUTE_11 = 11,\r
+    FEATURE_SET_COMPUTE_12 = 12,\r
+    FEATURE_SET_COMPUTE_13 = 13,\r
+    FEATURE_SET_COMPUTE_20 = 20,\r
+    FEATURE_SET_COMPUTE_21 = 21,\r
+    // Named features, each an alias for the feature set listed on its right\r
+    GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,\r
+    SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,\r
+    NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13\r
+};\r
+\r
+// Gives information about which GPU architectures this OpenCV GPU module\r
+// was compiled for. All queries are static member functions; the constructor\r
+// is private, so the class is not meant to be instantiated.\r
+class CV_EXPORTS TargetArchs\r
+{\r
+public:\r
+    // Query by feature set\r
+    static bool builtWith(FeatureSet feature_set);\r
+    // Queries by compute capability version (major, minor).\r
+    // NOTE(review): "Ptx" / "Bin" presumably distinguish PTX code from\r
+    // compiled binaries - confirm against the implementation.\r
+    static bool has(int major, int minor);\r
+    static bool hasPtx(int major, int minor);\r
+    static bool hasBin(int major, int minor);\r
+    static bool hasEqualOrLessPtx(int major, int minor);\r
+    static bool hasEqualOrGreater(int major, int minor);\r
+    static bool hasEqualOrGreaterPtx(int major, int minor);\r
+    static bool hasEqualOrGreaterBin(int major, int minor);\r
+private:\r
+    // Private: prevents instantiation\r
+    TargetArchs();\r
+};\r
+\r
+// Gives information about the given GPU\r
+class CV_EXPORTS DeviceInfo\r
+{\r
+public:\r
+    // Creates DeviceInfo object for the current GPU\r
+    DeviceInfo() : device_id_(getDevice()) { query(); }\r
  \r
-            // Checks whether device supports the given feature\r
-            bool supports(FeatureSet feature_set) const;\r
+    // Creates DeviceInfo object for the given GPU\r
+    DeviceInfo(int device_id) : device_id_(device_id) { query(); }\r
  \r
-            // Checks whether the GPU module can be run on the given device\r
-            bool isCompatible() const;\r
+    std::string name() const { return name_; }\r
  \r
-            int deviceID() const { return device_id_; }\r
+    // Return compute capability versions\r
+    int majorVersion() const { return majorVersion_; }\r
+    int minorVersion() const { return minorVersion_; }\r
  \r
-        private:\r
-            void query();\r
-            void queryMemory(size_t& free_memory, size_t& total_memory) const;\r
+    int multiProcessorCount() const { return multi_processor_count_; }\r
  \r
-            int device_id_;\r
+    size_t freeMemory() const;\r
+    size_t totalMemory() const;\r
  \r
-            string name_;\r
-            int multi_processor_count_;\r
-            int majorVersion_;\r
-            int minorVersion_;\r
-        };\r
+    // Checks whether device supports the given feature\r
+    bool supports(FeatureSet feature_set) const;\r
  \r
-        //////////////////////////////// Error handling ////////////////////////\r
+    // Checks whether the GPU module can be run on the given device\r
+    bool isCompatible() const;\r
  \r
-        CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);\r
+    int deviceID() const { return device_id_; }\r
  \r
-        //////////////////////////////// CudaMem ////////////////////////////////\r
-        // CudaMem is limited cv::Mat with page locked memory allocation.\r
-        // Page locked memory is only needed for async and faster coping to GPU.\r
-        // It is convertable to cv::Mat header without reference counting\r
-        // so you can use it with other opencv functions.\r
+private:\r
+    void query();\r
+    void queryMemory(size_t& free_memory, size_t& total_memory) const;\r
  \r
-        // Page-locks the matrix m memory and maps it for the device(s)\r
-        CV_EXPORTS void registerPageLocked(Mat& m);\r
-        // Unmaps the memory of matrix m, and makes it pageable again.\r
-        CV_EXPORTS void unregisterPageLocked(Mat& m);\r
+    int device_id_;\r
  \r
-        class CV_EXPORTS CudaMem\r
-        {\r
-        public:\r
-            enum  { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };\r
+    std::string name_;\r
+    int multi_processor_count_;\r
+    int majorVersion_;\r
+    int minorVersion_;\r
+};\r
  \r
-            CudaMem();\r
-            CudaMem(const CudaMem& m);\r
+//////////////////////////////// Error handling ////////////////////////\r
  \r
-            CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);\r
-            CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);\r
+CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);\r
  \r
+//////////////////////////////// CudaMem ////////////////////////////////\r
+// CudaMem is a limited cv::Mat with page-locked memory allocation.\r
+// Page-locked memory is only needed for asynchronous and faster copying to the GPU.\r
+// It is convertible to a cv::Mat header without reference counting,\r
+// so you can use it with other OpenCV functions.\r
  \r
-            //! creates from cv::Mat with coping data\r
-            explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);\r
+// Page-locks the matrix m memory and maps it for the device(s)\r
+CV_EXPORTS void registerPageLocked(Mat& m);\r
+// Unmaps the memory of matrix m, and makes it pageable again.\r
+CV_EXPORTS void unregisterPageLocked(Mat& m);\r
  \r
-            ~CudaMem();\r
+class CV_EXPORTS CudaMem\r
+{\r
+public:\r
+    enum  { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };\r
  \r
-            CudaMem& operator = (const CudaMem& m);\r
+    CudaMem();\r
+    CudaMem(const CudaMem& m);\r
  \r
-            //! returns deep copy of the matrix, i.e. the data is copied\r
-            CudaMem clone() const;\r
+    CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);\r
+    CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);\r
  \r
-            //! allocates new matrix data unless the matrix already has specified size and type.\r
-            void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);\r
-            void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);\r
  \r
-            //! decrements reference counter and released memory if needed.\r
-            void release();\r
+    //! creates from cv::Mat with coping data\r
+    explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);\r
  \r
-            //! returns matrix header with disabled reference counting for CudaMem data.\r
-            Mat createMatHeader() const;\r
-            operator Mat() const;\r
+    ~CudaMem();\r
  \r
-            //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.\r
-            GpuMat createGpuMatHeader() const;\r
-            operator GpuMat() const;\r
+    CudaMem& operator = (const CudaMem& m);\r
  \r
-            //returns if host memory can be mapperd to gpu address space;\r
-            static bool canMapHostMemory();\r
+    //! returns deep copy of the matrix, i.e. the data is copied\r
+    CudaMem clone() const;\r
  \r
-            // Please see cv::Mat for descriptions\r
-            bool isContinuous() const;\r
-            size_t elemSize() const;\r
-            size_t elemSize1() const;\r
-            int type() const;\r
-            int depth() const;\r
-            int channels() const;\r
-            size_t step1() const;\r
-            Size size() const;\r
-            bool empty() const;\r
+    //! allocates new matrix data unless the matrix already has specified size and type.\r
+    void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);\r
+    void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);\r
  \r
+    //! decrements reference counter and released memory if needed.\r
+    void release();\r
  \r
-            // Please see cv::Mat for descriptions\r
-            int flags;\r
-            int rows, cols;\r
-            size_t step;\r
+    //! returns matrix header with disabled reference counting for CudaMem data.\r
+    Mat createMatHeader() const;\r
+    operator Mat() const;\r
  \r
-            uchar* data;\r
-            int* refcount;\r
+    //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.\r
+    GpuMat createGpuMatHeader() const;\r
+    operator GpuMat() const;\r
  \r
-            uchar* datastart;\r
-            uchar* dataend;\r
+    //returns if host memory can be mapperd to gpu address space;\r
+    static bool canMapHostMemory();\r
  \r
-            int alloc_type;\r
-        };\r
+    // Please see cv::Mat for descriptions\r
+    bool isContinuous() const;\r
+    size_t elemSize() const;\r
+    size_t elemSize1() const;\r
+    int type() const;\r
+    int depth() const;\r
+    int channels() const;\r
+    size_t step1() const;\r
+    Size size() const;\r
+    bool empty() const;\r
  \r
-        //////////////////////////////// CudaStream ////////////////////////////////\r
-        // Encapculates Cuda Stream. Provides interface for async coping.\r
-        // Passed to each function that supports async kernel execution.\r
-        // Reference counting is enabled\r
  \r
-        class CV_EXPORTS Stream\r
-        {\r
-        public:\r
-            Stream();\r
-            ~Stream();\r
+    // Please see cv::Mat for descriptions\r
+    int flags;\r
+    int rows, cols;\r
+    size_t step;\r
+\r
+    uchar* data;\r
+    int* refcount;\r
+\r
+    uchar* datastart;\r
+    uchar* dataend;\r
+\r
+    int alloc_type;\r
+};\r
+\r
+//////////////////////////////// CudaStream ////////////////////////////////\r
+// Encapsulates a Cuda Stream. Provides an interface for asynchronous copying.\r
+// Passed to each function that supports asynchronous kernel execution.\r
+// Reference counting is enabled.\r
+\r
+class CV_EXPORTS Stream\r
+{\r
+public:\r
+    Stream();\r
+    ~Stream();\r
  \r
-            Stream(const Stream&);\r
-            Stream& operator=(const Stream&);\r
+    Stream(const Stream&);\r
+    Stream& operator=(const Stream&);\r
  \r
-            bool queryIfComplete();\r
-            void waitForCompletion();\r
+    bool queryIfComplete();\r
+    void waitForCompletion();\r
  \r
-            //! downloads asynchronously.\r
-            // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)\r
-            void enqueueDownload(const GpuMat& src, CudaMem& dst);\r
-            void enqueueDownload(const GpuMat& src, Mat& dst);\r
+    //! downloads asynchronously.\r
+    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)\r
+    void enqueueDownload(const GpuMat& src, CudaMem& dst);\r
+    void enqueueDownload(const GpuMat& src, Mat& dst);\r
  \r
-            //! uploads asynchronously.\r
-            // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)\r
-            void enqueueUpload(const CudaMem& src, GpuMat& dst);\r
-            void enqueueUpload(const Mat& src, GpuMat& dst);\r
+    //! uploads asynchronously.\r
+    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)\r
+    void enqueueUpload(const CudaMem& src, GpuMat& dst);\r
+    void enqueueUpload(const Mat& src, GpuMat& dst);\r
  \r
-            void enqueueCopy(const GpuMat& src, GpuMat& dst);\r
+    void enqueueCopy(const GpuMat& src, GpuMat& dst);\r
  \r
-            void enqueueMemSet(GpuMat& src, Scalar val);\r
-            void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);\r
+    void enqueueMemSet(GpuMat& src, Scalar val);\r
+    void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);\r
  \r
-            // converts matrix type, ex from float to uchar depending on type\r
-            void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);\r
+    // converts matrix type, e.g. from float to uchar, depending on type\r
+    void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);\r
  \r
-            static Stream& Null();\r
+    static Stream& Null();\r
  \r
-            operator bool() const;\r
+    operator bool() const;\r
  \r
-        private:\r
-            void create();\r
-            void release();\r
+private:\r
+    void create();\r
+    void release();\r
  \r
-            struct Impl;\r
-            Impl *impl;\r
+    struct Impl;\r
+    Impl *impl;\r
  \r
-            friend struct StreamAccessor;\r
-            \r
-            explicit Stream(Impl* impl);\r
-        };\r
+    friend struct StreamAccessor;\r
+    \r
+    explicit Stream(Impl* impl);\r
+};\r
          \r
  \r
-        //////////////////////////////// Filter Engine ////////////////////////////////\r
-\r
-        /*!\r
-        The Base Class for 1D or Row-wise Filters\r
-\r
-        This is the base class for linear or non-linear filters that process 1D data.\r
-        In particular, such filters are used for the "horizontal" filtering parts in separable filters.\r
-        */\r
-        class CV_EXPORTS BaseRowFilter_GPU\r
-        {\r
-        public:\r
-            BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}\r
-            virtual ~BaseRowFilter_GPU() {}\r
-            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;\r
-            int ksize, anchor;\r
-        };\r
-\r
-        /*!\r
-        The Base Class for Column-wise Filters\r
-\r
-        This is the base class for linear or non-linear filters that process columns of 2D arrays.\r
-        Such filters are used for the "vertical" filtering parts in separable filters.\r
-        */\r
-        class CV_EXPORTS BaseColumnFilter_GPU\r
-        {\r
-        public:\r
-            BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}\r
-            virtual ~BaseColumnFilter_GPU() {}\r
-            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;\r
-            int ksize, anchor;\r
-        };\r
-\r
-        /*!\r
-        The Base Class for Non-Separable 2D Filters.\r
-\r
-        This is the base class for linear or non-linear 2D filters.\r
-        */\r
-        class CV_EXPORTS BaseFilter_GPU\r
-        {\r
-        public:\r
-            BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}\r
-            virtual ~BaseFilter_GPU() {}\r
-            virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;\r
-            Size ksize;\r
-            Point anchor;\r
-        };\r
-\r
-        /*!\r
-        The Base Class for Filter Engine.\r
-\r
-        The class can be used to apply an arbitrary filtering operation to an image.\r
-        It contains all the necessary intermediate buffers.\r
-        */\r
-        class CV_EXPORTS FilterEngine_GPU\r
-        {\r
-        public:\r
-            virtual ~FilterEngine_GPU() {}\r
-\r
-            virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;\r
-        };\r
-\r
-        //! returns the non-separable filter engine with the specified filter\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);\r
-\r
-        //! returns the separable filter engine with the specified filters\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,\r
-            const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,\r
-            const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);\r
-\r
-        //! returns horizontal 1D box filter\r
-        //! supports only CV_8UC1 source type and CV_32FC1 sum type\r
-        CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);\r
-\r
-        //! returns vertical 1D box filter\r
-        //! supports only CV_8UC1 sum type and CV_32FC1 dst type\r
-        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);\r
-\r
-        //! returns 2D box filter\r
-        //! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type\r
-        CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));\r
-\r
-        //! returns box filter engine\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,\r
-            const Point& anchor = Point(-1,-1));\r
-\r
-        //! returns 2D morphological filter\r
-        //! only MORPH_ERODE and MORPH_DILATE are supported\r
-        //! supports CV_8UC1 and CV_8UC4 types\r
-        //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height\r
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,\r
-            Point anchor=Point(-1,-1));\r
-\r
-        //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,\r
-            const Point& anchor = Point(-1,-1), int iterations = 1);\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,\r
-            const Point& anchor = Point(-1,-1), int iterations = 1);\r
-\r
-        //! returns 2D filter with the specified kernel\r
-        //! supports CV_8UC1 and CV_8UC4 types\r
-        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,\r
-            Point anchor = Point(-1, -1));\r
-\r
-        //! returns the non-separable linear filter engine\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,\r
-            const Point& anchor = Point(-1,-1));\r
-\r
-        //! returns the primitive row filter with the specified kernel.\r
-        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.\r
-        //! there are two version of algorithm: NPP and OpenCV.\r
-        //! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,\r
-        //! otherwise calls OpenCV version.\r
-        //! NPP supports only BORDER_CONSTANT border type.\r
-        //! OpenCV version supports only CV_32F as buffer depth and\r
-        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.\r
-        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,\r
-            int anchor = -1, int borderType = BORDER_DEFAULT);\r
-\r
-        //! returns the primitive column filter with the specified kernel.\r
-        //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.\r
-        //! there are two version of algorithm: NPP and OpenCV.\r
-        //! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,\r
-        //! otherwise calls OpenCV version.\r
-        //! NPP supports only BORDER_CONSTANT border type.\r
-        //! OpenCV version supports only CV_32F as buffer depth and\r
-        //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.\r
-        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,\r
-            int anchor = -1, int borderType = BORDER_DEFAULT);\r
-\r
-        //! returns the separable linear filter engine\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,\r
-            const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,\r
-            int columnBorderType = -1);\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,\r
-            const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,\r
-            int columnBorderType = -1);\r
-\r
-        //! returns filter engine for the generalized Sobel operator\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
-\r
-        //! returns the Gaussian filter engine\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
-        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
-\r
-        //! returns maximum filter\r
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));\r
-\r
-        //! returns minimum filter\r
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));\r
-\r
-        //! smooths the image using the normalized box filter\r
-        //! supports CV_8UC1, CV_8UC4 types\r
-        CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());\r
-\r
-        //! a synonym for normalized box filter\r
-        static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); }\r
-\r
-        //! erodes the image (applies the local minimum operator)\r
-        CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);\r
-        CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());\r
-\r
-        //! dilates the image (applies the local maximum operator)\r
-        CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);\r
-        CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());\r
-\r
-        //! applies an advanced morphological operation to the image\r
-        CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);\r
-        CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());\r
-\r
-        //! applies non-separable 2D linear filter to the image\r
-        CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());\r
-\r
-        //! applies separable 2D linear filter to the image\r
-        CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,\r
-            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
-        CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,\r
-            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());\r
-\r
-        //! applies generalized Sobel operator to the image\r
-        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
-        CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());\r
-\r
-        //! applies the vertical or horizontal Scharr operator to the image\r
-        CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
-        CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());\r
-\r
-        //! smooths the image using Gaussian filter.\r
-        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
-        CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,\r
-            int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());\r
-\r
-        //! applies Laplacian operator to the image\r
-        //! supports only ksize = 1 and ksize = 3\r
-        CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null());\r
-\r
-\r
-        ////////////////////////////// Arithmetics ///////////////////////////////////\r
-\r
-        //! implements generalized matrix product algorithm GEMM from BLAS\r
-        CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha, \r
-            const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());\r
-\r
-        //! transposes the matrix\r
-        //! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)\r
-        CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());\r
-\r
-        //! reverses the order of the rows, columns or both in a matrix\r
-        //! supports CV_8UC1, CV_8UC4 types\r
-        CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());\r
-\r
-        //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))\r
-        //! destination array will have the depth type as lut and the same channels number as source\r
-        //! supports CV_8UC1, CV_8UC3 types\r
-        CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());\r
-\r
-        //! makes multi-channel array out of several single-channel arrays\r
-        CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null());\r
-\r
-        //! makes multi-channel array out of several single-channel arrays\r
-        CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null());\r
-\r
-        //! copies each plane of a multi-channel array to a dedicated array\r
-        CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null());\r
-\r
-        //! copies each plane of a multi-channel array to a dedicated array\r
-        CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null());\r
-\r
-        //! computes magnitude of complex (x(i).re, x(i).im) vector\r
-        //! supports only CV_32FC2 type\r
-        CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null());\r
-\r
-        //! computes squared magnitude of complex (x(i).re, x(i).im) vector\r
-        //! supports only CV_32FC2 type\r
-        CV_EXPORTS void magnitudeSqr(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null());\r
-\r
-        //! computes magnitude of each (x(i), y(i)) vector\r
-        //! supports only floating-point source\r
-        CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());\r
-\r
-        //! computes squared magnitude of each (x(i), y(i)) vector\r
-        //! supports only floating-point source\r
-        CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());\r
-\r
-        //! computes angle (angle(i)) of each (x(i), y(i)) vector\r
-        //! supports only floating-point source\r
-        CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());\r
-\r
-        //! converts Cartesian coordinates to polar\r
-        //! supports only floating-point source\r
-        CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());\r
-\r
-        //! converts polar coordinates to Cartesian\r
-        //! supports only floating-point source\r
-        CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());\r
+//////////////////////////////// Filter Engine ////////////////////////////////\r
  \r
+/*!\r
+The Base Class for 1D or Row-wise Filters\r
  \r
-        //////////////////////////// Per-element operations ////////////////////////////////////\r
-\r
-        //! adds one matrix to another (c = a + b)\r
-        CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());\r
-        //! adds scalar to a matrix (c = a + s)\r
-        CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());\r
-\r
-        //! subtracts one matrix from another (c = a - b)\r
-        CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());\r
-        //! subtracts scalar from a matrix (c = a - s)\r
-        CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());\r
-\r
-        //! computes element-wise weighted product of the two arrays (c = scale * a * b)\r
-        CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());\r
-        //! weighted multiplies matrix to a scalar (c = scale * a * s)\r
-        CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());\r
+This is the base class for linear or non-linear filters that process 1D data.\r
+In particular, such filters are used for the "horizontal" filtering parts in separable filters.\r
+*/\r
+class CV_EXPORTS BaseRowFilter_GPU\r
+{\r
+public:\r
+    BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}\r
+    virtual ~BaseRowFilter_GPU() {}\r
+    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;\r
+    int ksize, anchor;\r
+};\r
+\r
+/*!\r
+The Base Class for Column-wise Filters\r
+\r
+This is the base class for linear or non-linear filters that process columns of 2D arrays.\r
+Such filters are used for the "vertical" filtering parts in separable filters.\r
+*/\r
+class CV_EXPORTS BaseColumnFilter_GPU\r
+{\r
+public:\r
+    BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}\r
+    virtual ~BaseColumnFilter_GPU() {}\r
+    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;\r
+    int ksize, anchor;\r
+};\r
+\r
+/*!\r
+The Base Class for Non-Separable 2D Filters.\r
+\r
+This is the base class for linear or non-linear 2D filters.\r
+*/\r
+class CV_EXPORTS BaseFilter_GPU\r
+{\r
+public:\r
+    BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}\r
+    virtual ~BaseFilter_GPU() {}\r
+    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;\r
+    Size ksize;\r
+    Point anchor;\r
+};\r
+\r
+/*!\r
+The Base Class for Filter Engine.\r
+\r
+The class can be used to apply an arbitrary filtering operation to an image.\r
+It contains all the necessary intermediate buffers.\r
+*/\r
+class CV_EXPORTS FilterEngine_GPU\r
+{\r
+public:\r
+    virtual ~FilterEngine_GPU() {}\r
+\r
+    virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;\r
+};\r
+\r
+//! returns the non-separable filter engine with the specified filter\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);\r
+\r
+//! returns the separable filter engine with the specified filters\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,\r
+    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,\r
+    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);\r
+\r
+//! returns horizontal 1D box filter\r
+//! supports only CV_8UC1 source type and CV_32FC1 sum type\r
+CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);\r
+\r
+//! returns vertical 1D box filter\r
+//! supports only CV_8UC1 sum type and CV_32FC1 dst type\r
+CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);\r
+\r
+//! returns 2D box filter\r
+//! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type\r
+CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));\r
+\r
+//! returns box filter engine\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,\r
+    const Point& anchor = Point(-1,-1));\r
+\r
+//! returns 2D morphological filter\r
+//! only MORPH_ERODE and MORPH_DILATE are supported\r
+//! supports CV_8UC1 and CV_8UC4 types\r
+//! kernel must have CV_8UC1 type, one row and cols == ksize.width * ksize.height\r
+CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,\r
+    Point anchor=Point(-1,-1));\r
+\r
+//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,\r
+    const Point& anchor = Point(-1,-1), int iterations = 1);\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,\r
+    const Point& anchor = Point(-1,-1), int iterations = 1);\r
+\r
+//! returns 2D filter with the specified kernel\r
+//! supports CV_8UC1 and CV_8UC4 types\r
+CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,\r
+    Point anchor = Point(-1, -1));\r
+\r
+//! returns the non-separable linear filter engine\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,\r
+    const Point& anchor = Point(-1,-1));\r
+\r
+//! returns the primitive row filter with the specified kernel.\r
+//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.\r
+//! there are two versions of the algorithm: NPP and OpenCV.\r
+//! the NPP version is called when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,\r
+//! otherwise the OpenCV version is called.\r
+//! NPP supports only BORDER_CONSTANT border type.\r
+//! OpenCV version supports only CV_32F as buffer depth and\r
+//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.\r
+CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,\r
+    int anchor = -1, int borderType = BORDER_DEFAULT);\r
+\r
+//! returns the primitive column filter with the specified kernel.\r
+//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.\r
+//! there are two versions of the algorithm: NPP and OpenCV.\r
+//! the NPP version is called when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,\r
+//! otherwise the OpenCV version is called.\r
+//! NPP supports only BORDER_CONSTANT border type.\r
+//! OpenCV version supports only CV_32F as buffer depth and\r
+//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.\r
+CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,\r
+    int anchor = -1, int borderType = BORDER_DEFAULT);\r
+\r
+//! returns the separable linear filter engine\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,\r
+    const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,\r
+    int columnBorderType = -1);\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,\r
+    const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,\r
+    int columnBorderType = -1);\r
+\r
+//! returns filter engine for the generalized Sobel operator\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,\r
+                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,\r
+                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
+\r
+//! returns the Gaussian filter engine\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,\r
+                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
+CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,\r
+                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
+\r
+//! returns maximum filter\r
+CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));\r
+\r
+//! returns minimum filter\r
+CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));\r
+\r
+//! smooths the image using the normalized box filter\r
+//! supports CV_8UC1, CV_8UC4 types\r
+CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());\r
+\r
+//! a synonym for normalized box filter\r
+static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) \r
+{ \r
+    boxFilter(src, dst, -1, ksize, anchor, stream); \r
+}\r
  \r
-        //! computes element-wise weighted quotient of the two arrays (c = a / b)\r
-        CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());\r
-        //! computes element-wise weighted quotient of matrix and scalar (c = a / s)\r
-        CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());\r
-        //! computes element-wise weighted reciprocal of an array (dst = scale/src2)\r
-        CV_EXPORTS void divide(double scale, const GpuMat& src2, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null());\r
+//! erodes the image (applies the local minimum operator)\r
+CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);\r
+CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, \r
+                      Point anchor = Point(-1, -1), int iterations = 1, \r
+                      Stream& stream = Stream::Null());\r
  \r
-        //! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)\r
-        CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, \r
-            int dtype = -1, Stream& stream = Stream::Null());\r
+//! dilates the image (applies the local maximum operator)\r
+CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);\r
+CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, \r
+                       Point anchor = Point(-1, -1), int iterations = 1, \r
+                       Stream& stream = Stream::Null());\r
  \r
-        //! adds scaled array to another one (dst = alpha*src1 + src2)\r
-        static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())\r
-        {\r
-            addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);\r
-        }\r
+//! applies an advanced morphological operation to the image\r
+CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);\r
+CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, \r
+                             Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());\r
  \r
-        //! computes element-wise absolute difference of two arrays (c = abs(a - b))\r
-        CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());\r
-        //! computes element-wise absolute difference of array and scalar (c = abs(a - s))\r
-        CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());\r
+//! applies non-separable 2D linear filter to the image\r
+CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());\r
  \r
-        //! computes exponent of each matrix element (b = e**a)\r
-        //! supports only CV_32FC1 type\r
-        CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());\r
-        \r
-        //! computes power of each matrix element:\r
-        //    (dst(i,j) = pow(     src(i,j) , power), if src.type() is integer\r
-        //    (dst(i,j) = pow(fabs(src(i,j)), power), otherwise\r
-        //! supports all, except depth == CV_64F\r
-        CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());\r
+//! applies separable 2D linear filter to the image\r
+CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,\r
+                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
+CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,\r
+                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, \r
+                            Stream& stream = Stream::Null());\r
+\r
+//! applies generalized Sobel operator to the image\r
+CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,\r
+                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
+CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,\r
+                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());\r
+\r
+//! applies the vertical or horizontal Scharr operator to the image\r
+CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,\r
+                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
+CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,\r
+                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());\r
+\r
+//! smooths the image using Gaussian filter.\r
+CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,\r
+                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);\r
+CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,\r
+                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());\r
+\r
+//! applies Laplacian operator to the image\r
+//! supports only ksize = 1 and ksize = 3\r
+CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null());\r
+\r
+\r
+////////////////////////////// Arithmetics ///////////////////////////////////\r
+\r
+//! implements generalized matrix product algorithm GEMM from BLAS\r
+CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha, \r
+    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());\r
+\r
+//! transposes the matrix\r
+//! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)\r
+CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());\r
+\r
+//! reverses the order of the rows, columns or both in a matrix\r
+//! supports CV_8UC1, CV_8UC4 types\r
+CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());\r
+\r
+//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))\r
+//! destination array will have the same depth as lut and the same number of channels as source\r
+//! supports CV_8UC1, CV_8UC3 types\r
+CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());\r
+\r
+//! makes multi-channel array out of several single-channel arrays\r
+CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null());\r
+\r
+//! makes multi-channel array out of several single-channel arrays\r
+CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null());\r
+\r
+//! copies each plane of a multi-channel array to a dedicated array\r
+CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null());\r
+\r
+//! copies each plane of a multi-channel array to a dedicated array\r
+CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null());\r
+\r
+//! computes magnitude of complex (x(i).re, x(i).im) vector\r
+//! supports only CV_32FC2 type\r
+CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null());\r
+\r
+//! computes squared magnitude of complex (x(i).re, x(i).im) vector\r
+//! supports only CV_32FC2 type\r
+CV_EXPORTS void magnitudeSqr(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null());\r
+\r
+//! computes magnitude of each (x(i), y(i)) vector\r
+//! supports only floating-point source\r
+CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());\r
+\r
+//! computes squared magnitude of each (x(i), y(i)) vector\r
+//! supports only floating-point source\r
+CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());\r
+\r
+//! computes angle (angle(i)) of each (x(i), y(i)) vector\r
+//! supports only floating-point source\r
+CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());\r
+\r
+//! converts Cartesian coordinates to polar\r
+//! supports only floating-point source\r
+CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());\r
+\r
+//! converts polar coordinates to Cartesian\r
+//! supports only floating-point source\r
+CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());\r
  \r
-        //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))\r
-        //! supports only CV_32FC1 type\r
-        CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());\r
  \r
-        //! compares elements of two arrays (c = a <cmpop> b)\r
-        //! supports CV_8UC4, CV_32FC1 types\r
-        CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());\r
+//////////////////////////// Per-element operations ////////////////////////////////////\r
  \r
-        //! performs per-elements bit-wise inversion\r
-        CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());\r
+//! adds one matrix to another (c = a + b)\r
+CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());\r
+//! adds scalar to a matrix (c = a + s)\r
+CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());\r
  \r
-        //! calculates per-element bit-wise disjunction of two arrays\r
-        CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());\r
+//! subtracts one matrix from another (c = a - b)\r
+CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());\r
+//! subtracts scalar from a matrix (c = a - s)\r
+CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());\r
  \r
-        //! calculates per-element bit-wise conjunction of two arrays\r
-        CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());\r
+//! computes element-wise weighted product of the two arrays (c = scale * a * b)\r
+CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());\r
+//! computes weighted product of matrix and scalar (c = scale * a * s)\r
+CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());\r
  \r
-        //! calculates per-element bit-wise "exclusive or" operation\r
-        CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());\r
+//! computes element-wise weighted quotient of the two arrays (c = a / b)\r
+CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());\r
+//! computes element-wise weighted quotient of matrix and scalar (c = a / s)\r
+CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());\r
+//! computes element-wise weighted reciprocal of an array (dst = scale/src2)\r
+CV_EXPORTS void divide(double scale, const GpuMat& src2, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null());\r
  \r
-        //! computes per-element minimum of two arrays (dst = min(src1, src2))\r
-        CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());\r
+//! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)\r
+CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, \r
+                            int dtype = -1, Stream& stream = Stream::Null());\r
  \r
-        //! computes per-element minimum of array and scalar (dst = min(src1, src2))\r
-        CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());\r
+//! adds scaled array to another one (dst = alpha*src1 + src2); forwards to addWeighted with beta = 1.0, gamma = 0.0 and dtype = -1\r
+static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())\r
+{\r
+    addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);\r
+}\r
  \r
-        //! computes per-element maximum of two arrays (dst = max(src1, src2))\r
-        CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());\r
+//! computes element-wise absolute difference of two arrays (c = abs(a - b))\r
+CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());\r
+//! computes element-wise absolute difference of array and scalar (c = abs(a - s))\r
+CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());\r
  \r
-        //! computes per-element maximum of array and scalar (dst = max(src1, src2))\r
-        CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());\r
+//! computes exponent of each matrix element (b = e**a)\r
+//! supports only CV_32FC1 type\r
+CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());\r
  \r
+//! computes power of each matrix element:\r
+//!    dst(i,j) = pow(     src(i,j) , power), if src.type() is integer\r
+//!    dst(i,j) = pow(fabs(src(i,j)), power), otherwise\r
+//! supports all, except depth == CV_64F\r
+CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());\r
  \r
-        ////////////////////////////// Image processing //////////////////////////////\r
+//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))\r
+//! supports only CV_32FC1 type\r
+CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());\r
  \r
-        //! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]\r
-        //! supports only CV_32FC1 map type\r
-        CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,\r
-            int interpolation, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar(), \r
-            Stream& stream = Stream::Null());\r
+//! compares elements of two arrays (c = a <cmpop> b)\r
+//! supports CV_8UC4, CV_32FC1 types\r
+CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());\r
  \r
-        //! Does mean shift filtering on GPU.\r
-        CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,\r
-            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null());\r
+//! performs per-element bit-wise inversion\r
+CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());\r
  \r
-        //! Does mean shift procedure on GPU.\r
-        CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr,\r
-            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null());\r
+//! calculates per-element bit-wise disjunction of two arrays\r
+CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());\r
  \r
-        //! Does mean shift segmentation with elimination of small regions.\r
-        CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize,\r
-            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));\r
+//! calculates per-element bit-wise conjunction of two arrays\r
+CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());\r
  \r
-        //! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.\r
-        //! Supported types of input disparity: CV_8U, CV_16S.\r
-        //! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).\r
-        CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null());\r
-\r
-        //! Reprojects disparity image to 3D space.\r
-        //! Supports CV_8U and CV_16S types of input disparity.\r
-        //! The output is a 4-channel floating-point (CV_32FC4) matrix.\r
-        //! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.\r
-        //! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.\r
-        CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream = Stream::Null());\r
-\r
-        //! converts image from one color space to another\r
-        CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());\r
-\r
-        //! applies fixed threshold to the image\r
-        CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());\r
-\r
-        //! resizes the image\r
-        //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
-        CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());\r
-\r
-        //! warps the image using affine transformation\r
-        //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
-        CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null());\r
-\r
-        //! warps the image using perspective transformation\r
-        //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
-        CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null());\r
-\r
-        //! builds plane warping maps\r
-        CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, float scale,\r
-                                           GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());\r
-\r
-        //! builds cylindrical warping maps\r
-        CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,\r
-                                                 GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());\r
-\r
-        //! builds spherical warping maps\r
-        CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,\r
-                                               GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());\r
-\r
-        //! rotate 8bit single or four channel image\r
-        //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
-        //! supports CV_8UC1, CV_8UC4 types\r
-        CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, \r
-            int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());\r
-\r
-        //! copies 2D array to a larger destination array and pads borders with user-specifiable constant\r
-        CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, \r
-            const Scalar& value = Scalar(), Stream& stream = Stream::Null());\r
-\r
-        //! computes the integral image\r
-        //! sum will have CV_32S type, but will contain unsigned int values\r
-        //! supports only CV_8UC1 source type\r
-        CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());\r
-\r
-        //! buffered version\r
-        CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());\r
-\r
-        //! computes the integral image and integral for the squared image\r
-        //! sum will have CV_32S type, sqsum - CV32F type\r
-        //! supports only CV_8UC1 source type\r
-        CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null());\r
-\r
-        //! computes squared integral image\r
-        //! result matrix will have 64F type, but will contain 64U values\r
-        //! supports source images of 8UC1 type only\r
-        CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());\r
-\r
-        //! computes vertical sum, supports only CV_32FC1 images\r
-        CV_EXPORTS void columnSum(const GpuMat& src, GpuMat& sum);\r
-\r
-        //! computes the standard deviation of integral images\r
-        //! supports only CV_32SC1 source type and CV_32FC1 sqr type\r
-        //! output will have CV_32FC1 type\r
-        CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());\r
-\r
-        //! computes Harris cornerness criteria at each image pixel\r
-        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, \r
-            int borderType = BORDER_REFLECT101);\r
-        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, \r
-            int borderType = BORDER_REFLECT101);\r
-        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, \r
-            int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null());\r
-\r
-        //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria\r
-        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);\r
-        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101);\r
-        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, \r
-            int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());\r
-\r
-        //! performs per-element multiplication of two full (not packed) Fourier spectrums\r
-        //! supports 32FC2 matrixes only (interleaved format)\r
-        CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());\r
-\r
-        //! performs per-element multiplication of two full (not packed) Fourier spectrums\r
-        //! supports 32FC2 matrixes only (interleaved format)\r
-        CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());\r
-\r
-        //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.\r
-        //! Param dft_size is the size of DFT transform.\r
-        //! \r
-        //! If the source matrix is not continous, then additional copy will be done,\r
-        //! so to avoid copying ensure the source matrix is continous one. If you want to use\r
-        //! preallocated output ensure it is continuous too, otherwise it will be reallocated.\r
-        //!\r
-        //! Being implemented via CUFFT real-to-complex transform result contains only non-redundant values\r
-        //! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.\r
-        //!\r
-        //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.\r
-        CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());\r
-\r
-        //! computes convolution (or cross-correlation) of two images using discrete Fourier transform\r
-        //! supports source images of 32FC1 type only\r
-        //! result matrix will have 32FC1 type\r
-        struct CV_EXPORTS ConvolveBuf;\r
-        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);\r
-        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());\r
-\r
-        struct CV_EXPORTS ConvolveBuf\r
-        {\r
-            ConvolveBuf() {}\r
-            ConvolveBuf(Size image_size, Size templ_size) \r
-                { create(image_size, templ_size); }\r
-            void create(Size image_size, Size templ_size);\r
-            void create(Size image_size, Size templ_size, Size block_size);\r
-\r
-        private:\r
-            static Size estimateBlockSize(Size result_size, Size templ_size);\r
-            friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream);\r
-\r
-            Size result_size;\r
-            Size block_size;\r
-            Size dft_size;\r
-            int spect_len;\r
-\r
-            GpuMat image_spect, templ_spect, result_spect;\r
-            GpuMat image_block, templ_block, result_data;\r
-        };\r
-\r
-        //! computes the proximity map for the raster template and the image where the template is searched for\r
-        CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null());\r
-\r
-        //! smoothes the source image and downsamples it\r
-        CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());\r
-\r
-        //! upsamples the source image and then smoothes it\r
-        CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());\r
-\r
-        //! performs linear blending of two images\r
-        //! to avoid accuracy errors sum of weigths shouldn't be very close to zero\r
-        CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, \r
-            GpuMat& result, Stream& stream = Stream::Null());\r
+//! calculates per-element bit-wise "exclusive or" operation\r
+CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());\r
  \r
-        \r
-        struct CV_EXPORTS CannyBuf;\r
-        \r
-        CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);\r
-        CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);\r
-        CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);\r
-        CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);\r
+//! computes per-element minimum of two arrays (dst = min(src1, src2))\r
+CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());\r
  \r
-        struct CV_EXPORTS CannyBuf\r
-        {\r
-            CannyBuf() {}\r
-            explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}\r
-            CannyBuf(const GpuMat& dx_, const GpuMat& dy_);\r
+//! computes per-element minimum of array and scalar (dst = min(src1, src2))\r
+CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());\r
  \r
-            void create(const Size& image_size, int apperture_size = 3);\r
-            \r
-            void release();\r
+//! computes per-element maximum of two arrays (dst = max(src1, src2))\r
+CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());\r
  \r
-            GpuMat dx, dy;\r
-            GpuMat dx_buf, dy_buf;\r
-            GpuMat edgeBuf;\r
-            GpuMat trackBuf1, trackBuf2;\r
-            Ptr<FilterEngine_GPU> filterDX, filterDY;\r
-        };\r
+//! computes per-element maximum of array and scalar (dst = max(src1, src2))\r
+CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());\r
+\r
+\r
+////////////////////////////// Image processing //////////////////////////////\r
+\r
+//! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]\r
+//! supports only CV_32FC1 map type\r
+CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,\r
+                      int interpolation, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar(), \r
+                      Stream& stream = Stream::Null());\r
+\r
+//! Does mean shift filtering on GPU.\r
+CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,\r
+                                   TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), \r
+                                   Stream& stream = Stream::Null());\r
+\r
+//! Does mean shift procedure on GPU.\r
+CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr,\r
+                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), \r
+                              Stream& stream = Stream::Null());\r
+\r
+//! Does mean shift segmentation with elimination of small regions.\r
+CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize,\r
+                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));\r
+\r
+//! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.\r
+//! Supported types of input disparity: CV_8U, CV_16S.\r
+//! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).\r
+CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null());\r
+\r
+//! Reprojects disparity image to 3D space.\r
+//! Supports CV_8U and CV_16S types of input disparity.\r
+//! The output is a 4-channel floating-point (CV_32FC4) matrix.\r
+//! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.\r
+//! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.\r
+CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream = Stream::Null());\r
+\r
+//! converts image from one color space to another\r
+CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());\r
+\r
+//! applies fixed threshold to the image\r
+CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());\r
+\r
+//! resizes the image\r
+//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
+CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());\r
+\r
+//! warps the image using affine transformation\r
+//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
+CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null());\r
+\r
+//! warps the image using perspective transformation\r
+//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
+CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null());\r
+\r
+//! builds plane warping maps\r
+CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, float scale,\r
+                                   GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());\r
+\r
+//! builds cylindrical warping maps\r
+CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,\r
+                                         GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());\r
+\r
+//! builds spherical warping maps\r
+CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,\r
+                                       GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());\r
+\r
+//! rotate 8bit single or four channel image\r
+//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC\r
+//! supports CV_8UC1, CV_8UC4 types\r
+CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, \r
+                       int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());\r
+\r
+//! copies 2D array to a larger destination array and pads borders with user-specifiable constant\r
+CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, \r
+                               const Scalar& value = Scalar(), Stream& stream = Stream::Null());\r
+\r
+//! computes the integral image\r
+//! sum will have CV_32S type, but will contain unsigned int values\r
+//! supports only CV_8UC1 source type\r
+CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());\r
+\r
+//! buffered version\r
+CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());\r
+\r
+//! computes the integral image and integral for the squared image\r
+//! sum will have CV_32S type, sqsum - CV_32F type\r
+//! supports only CV_8UC1 source type\r
+CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null());\r
+\r
+//! computes squared integral image\r
+//! result matrix will have 64F type, but will contain 64U values\r
+//! supports source images of 8UC1 type only\r
+CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());\r
+\r
+//! computes vertical sum, supports only CV_32FC1 images\r
+CV_EXPORTS void columnSum(const GpuMat& src, GpuMat& sum);\r
+\r
+//! computes the standard deviation of integral images\r
+//! supports only CV_32SC1 source type and CV_32FC1 sqr type\r
+//! output will have CV_32FC1 type\r
+CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());\r
+\r
+//! computes Harris cornerness criteria at each image pixel\r
+CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);\r
+CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);\r
+CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, \r
+                             int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null());\r
+\r
+//! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria\r
+CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);\r
+CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101);\r
+CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, \r
+    int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());\r
+\r
+//! performs per-element multiplication of two full (not packed) Fourier spectrums\r
+//! supports 32FC2 matrices only (interleaved format)\r
+CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());\r
+\r
+//! performs per-element multiplication of two full (not packed) Fourier spectrums and scales the result\r
+//! supports 32FC2 matrices only (interleaved format)\r
+CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());\r
+\r
+//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.\r
+//! Param dft_size is the size of DFT transform.\r
+//! \r
+//! If the source matrix is not continuous, then an additional copy will be done,\r
+//! so to avoid copying ensure the source matrix is continuous. If you want to use\r
+//! preallocated output ensure it is continuous too, otherwise it will be reallocated.\r
+//!\r
+//! Because it is implemented via CUFFT, the result of a real-to-complex transform contains only non-redundant values\r
+//! in CUFFT's format. The result cannot be retrieved as a full complex matrix for this kind of transform.\r
+//!\r
+//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.\r
+CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());\r
+\r
+//! computes convolution (or cross-correlation) of two images using discrete Fourier transform\r
+//! supports source images of 32FC1 type only\r
+//! result matrix will have 32FC1 type\r
+struct CV_EXPORTS ConvolveBuf;\r
+CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);\r
+CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());\r
+\r
+struct CV_EXPORTS ConvolveBuf\r
+{\r
+    ConvolveBuf() {}\r
+    ConvolveBuf(Size image_size, Size templ_size) \r
+        { create(image_size, templ_size); }\r
+    void create(Size image_size, Size templ_size);\r
+    void create(Size image_size, Size templ_size, Size block_size);\r
+\r
+private:\r
+    static Size estimateBlockSize(Size result_size, Size templ_size);\r
+    friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream);\r
+\r
+    Size result_size;\r
+    Size block_size;\r
+    Size dft_size;\r
+    int spect_len;\r
+\r
+    GpuMat image_spect, templ_spect, result_spect;\r
+    GpuMat image_block, templ_block, result_data;\r
+};\r
+\r
+//! computes the proximity map for the raster template and the image where the template is searched for\r
+CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null());\r
+\r
+//! smoothes the source image and downsamples it\r
+CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());\r
+\r
+//! upsamples the source image and then smoothes it\r
+CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());\r
+\r
+//! performs linear blending of two images\r
+//! to avoid accuracy errors the sum of weights shouldn't be very close to zero\r
+CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, \r
+                            GpuMat& result, Stream& stream = Stream::Null());\r
  \r
-        ////////////////////////////// Matrix reductions //////////////////////////////\r
-\r
-        //! computes mean value and standard deviation of all or selected array elements\r
-        //! supports only CV_8UC1 type\r
-        CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);\r
-\r
-        //! computes norm of array\r
-        //! supports NORM_INF, NORM_L1, NORM_L2\r
-        //! supports all matrices except 64F\r
-        CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);\r
-\r
-        //! computes norm of array\r
-        //! supports NORM_INF, NORM_L1, NORM_L2\r
-        //! supports all matrices except 64F\r
-        CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);\r
-\r
-        //! computes norm of the difference between two arrays\r
-        //! supports NORM_INF, NORM_L1, NORM_L2\r
-        //! supports only CV_8UC1 type\r
-        CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);\r
-\r
-        //! computes sum of array elements\r
-        //! supports only single channel images\r
-        CV_EXPORTS Scalar sum(const GpuMat& src);\r
-\r
-        //! computes sum of array elements\r
-        //! supports only single channel images\r
-        CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);\r
-\r
-        //! computes sum of array elements absolute values\r
-        //! supports only single channel images\r
-        CV_EXPORTS Scalar absSum(const GpuMat& src);\r
-\r
-        //! computes sum of array elements absolute values\r
-        //! supports only single channel images\r
-        CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);\r
-\r
-        //! computes squared sum of array elements\r
-        //! supports only single channel images\r
-        CV_EXPORTS Scalar sqrSum(const GpuMat& src);\r
-\r
-        //! computes squared sum of array elements\r
-        //! supports only single channel images\r
-        CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);\r
-\r
-        //! finds global minimum and maximum array elements and returns their values\r
-        CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());\r
-\r
-        //! finds global minimum and maximum array elements and returns their values\r
-        CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);\r
-\r
-        //! finds global minimum and maximum array elements and returns their values with locations\r
-        CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,\r
-                                  const GpuMat& mask=GpuMat());\r
-\r
-        //! finds global minimum and maximum array elements and returns their values with locations\r
-        CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,\r
-                                  const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);\r
-\r
-        //! counts non-zero array elements\r
-        CV_EXPORTS int countNonZero(const GpuMat& src);\r
-\r
-        //! counts non-zero array elements\r
-        CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);\r
-\r
-        //! reduces a matrix to a vector\r
-        CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());\r
-\r
-\r
-        ///////////////////////////// Calibration 3D //////////////////////////////////\r
-\r
-        CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,\r
-                                        GpuMat& dst, Stream& stream = Stream::Null());\r
-\r
-        CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,\r
-                                      const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, \r
-                                      Stream& stream = Stream::Null());\r
-\r
-        CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,\r
-                                       const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false,\r
-                                       int num_iters=100, float max_dist=8.0, int min_inlier_count=100, \r
-                                       vector<int>* inliers=NULL);\r
-\r
-        //////////////////////////////// Image Labeling ////////////////////////////////\r
-\r
-        //!performs labeling via graph cuts\r
-        CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& stream = Stream::Null());\r
-\r
-        ////////////////////////////////// Histograms //////////////////////////////////\r
-\r
-        //! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.\r
-        CV_EXPORTS void evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel);\r
-        //! Calculates histogram with evenly distributed bins for signle channel source.\r
-        //! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types.\r
-        //! Output hist will have one row and histSize cols and CV_32SC1 type.\r
-        CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());\r
-        CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());\r
-        //! Calculates histogram with evenly distributed bins for four-channel source.\r
-        //! All channels of source are processed separately.\r
-        //! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types.\r
-        //! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type.\r
-        CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());\r
-        CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());\r
-        //! Calculates histogram with bins determined by levels array.\r
-        //! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.\r
-        //! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types.\r
-        //! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type.\r
-        CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null());\r
-        CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null());\r
-        //! Calculates histogram with bins determined by levels array.\r
-        //! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.\r
-        //! All channels of source are processed separately.\r
-        //! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.\r
-        //! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type.\r
-        CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());\r
-        CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream = Stream::Null());\r
-        \r
-        //! Calculates histogram for 8u one channel image\r
-        //! Output hist will have one row, 256 cols and CV32SC1 type.\r
-        CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());\r
-        CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());\r
          \r
-        //! normalizes the grayscale image brightness and contrast by normalizing its histogram\r
-        CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());\r
-        CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());\r
-        CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());\r
-\r
-        //////////////////////////////// StereoBM_GPU ////////////////////////////////\r
-\r
-        class CV_EXPORTS StereoBM_GPU\r
-        {\r
-        public:\r
-            enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };\r
-\r
-            enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };\r
-\r
-            //! the default constructor\r
-            StereoBM_GPU();\r
-            //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.\r
-            StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);\r
-\r
-            //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair\r
-            //! Output disparity has CV_8U type.\r
-            void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());\r
-\r
-            //! Some heuristics that tries to estmate\r
-            // if current GPU will be faster than CPU in this algorithm.\r
-            // It queries current active device.\r
-            static bool checkIfGpuCallReasonable();\r
-\r
-            int preset;\r
-            int ndisp;\r
-            int winSize;\r
-\r
-            // If avergeTexThreshold  == 0 => post procesing is disabled\r
-            // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image\r
-            // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold\r
-            // i.e. input left image is low textured.\r
-            float avergeTexThreshold;\r
-        private:\r
-            GpuMat minSSD, leBuf, riBuf;\r
-        };\r
-\r
-        ////////////////////////// StereoBeliefPropagation ///////////////////////////\r
-        // "Efficient Belief Propagation for Early Vision"\r
-        // P.Felzenszwalb\r
-\r
-        class CV_EXPORTS StereoBeliefPropagation\r
-        {\r
-        public:\r
-            enum { DEFAULT_NDISP  = 64 };\r
-            enum { DEFAULT_ITERS  = 5  };\r
-            enum { DEFAULT_LEVELS = 5  };\r
-\r
-            static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);\r
-\r
-            //! the default constructor\r
-            explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,\r
-                int iters  = DEFAULT_ITERS,\r
-                int levels = DEFAULT_LEVELS,\r
-                int msg_type = CV_32F);\r
-\r
-            //! the full constructor taking the number of disparities, number of BP iterations on each level,\r
-            //! number of levels, truncation of data cost, data weight,\r
-            //! truncation of discontinuity cost and discontinuity single jump\r
-            //! DataTerm = data_weight * min(fabs(I2-I1), max_data_term)\r
-            //! DiscTerm = min(disc_single_jump * fabs(f1-f2), max_disc_term)\r
-            //! please see paper for more details\r
-            StereoBeliefPropagation(int ndisp, int iters, int levels,\r
-                float max_data_term, float data_weight,\r
-                float max_disc_term, float disc_single_jump,\r
-                int msg_type = CV_32F);\r
-\r
-            //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,\r
-            //! if disparity is empty output type will be CV_16S else output type will be disparity.type().\r
-            void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());\r
-\r
-\r
-            //! version for user specified data term\r
-            void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());\r
-\r
-            int ndisp;\r
-\r
-            int iters;\r
-            int levels;\r
-\r
-            float max_data_term;\r
-            float data_weight;\r
-            float max_disc_term;\r
-            float disc_single_jump;\r
-\r
-            int msg_type;\r
-        private:\r
-            GpuMat u, d, l, r, u2, d2, l2, r2;\r
-            std::vector<GpuMat> datas;\r
-            GpuMat out;\r
-        };\r
-\r
-        /////////////////////////// StereoConstantSpaceBP ///////////////////////////\r
-        // "A Constant-Space Belief Propagation Algorithm for Stereo Matching"\r
-        // Qingxiong Yang, Liang Wang, Narendra Ahuja\r
-        // http://vision.ai.uiuc.edu/~qyang6/\r
-\r
-        class CV_EXPORTS StereoConstantSpaceBP\r
-        {\r
-        public:\r
-            enum { DEFAULT_NDISP    = 128 };\r
-            enum { DEFAULT_ITERS    = 8   };\r
-            enum { DEFAULT_LEVELS   = 4   };\r
-            enum { DEFAULT_NR_PLANE = 4   };\r
-\r
-            static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);\r
-\r
-            //! the default constructor\r
-            explicit StereoConstantSpaceBP(int ndisp    = DEFAULT_NDISP,\r
-                int iters    = DEFAULT_ITERS,\r
-                int levels   = DEFAULT_LEVELS,\r
-                int nr_plane = DEFAULT_NR_PLANE,\r
-                int msg_type = CV_32F);\r
-\r
-            //! the full constructor taking the number of disparities, number of BP iterations on each level,\r
-            //! number of levels, number of active disparity on the first level, truncation of data cost, data weight,\r
-            //! truncation of discontinuity cost, discontinuity single jump and minimum disparity threshold\r
-            StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,\r
-                float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,\r
-                int min_disp_th = 0,\r
-                int msg_type = CV_32F);\r
-\r
-            //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,\r
-            //! if disparity is empty output type will be CV_16S else output type will be disparity.type().\r
-            void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());\r
-\r
-            int ndisp;\r
-\r
-            int iters;\r
-            int levels;\r
-\r
-            int nr_plane;\r
-\r
-            float max_data_term;\r
-            float data_weight;\r
-            float max_disc_term;\r
-            float disc_single_jump;\r
-\r
-            int min_disp_th;\r
-\r
-            int msg_type;\r
-\r
-            bool use_local_init_data_cost;\r
-        private:\r
-            GpuMat u[2], d[2], l[2], r[2];\r
-            GpuMat disp_selected_pyr[2];\r
-\r
-            GpuMat data_cost;\r
-            GpuMat data_cost_selected;\r
-\r
-            GpuMat temp;\r
-\r
-            GpuMat out;\r
-        };\r
-\r
-        /////////////////////////// DisparityBilateralFilter ///////////////////////////\r
-        // Disparity map refinement using joint bilateral filtering given a single color image.\r
-        // Qingxiong Yang, Liang Wang, Narendra Ahuja\r
-        // http://vision.ai.uiuc.edu/~qyang6/\r
-\r
-        class CV_EXPORTS DisparityBilateralFilter\r
-        {\r
-        public:\r
-            enum { DEFAULT_NDISP  = 64 };\r
-            enum { DEFAULT_RADIUS = 3 };\r
-            enum { DEFAULT_ITERS  = 1 };\r
+struct CV_EXPORTS CannyBuf;\r
  \r
-            //! the default constructor\r
-            explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);\r
+CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);\r
+CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);\r
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);\r
+CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);\r
  \r
-            //! the full constructor taking the number of disparities, filter radius,\r
-            //! number of iterations, truncation of data continuity, truncation of disparity continuity\r
-            //! and filter range sigma\r
-            DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range);\r
+struct CV_EXPORTS CannyBuf\r
+{\r
+    CannyBuf() {}\r
+    explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}\r
+    CannyBuf(const GpuMat& dx_, const GpuMat& dy_);\r
  \r
-            //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.\r
-            //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.\r
-            void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null());\r
+    void create(const Size& image_size, int apperture_size = 3);\r
+    \r
+    void release();\r
  \r
-        private:\r
-            int ndisp;\r
-            int radius;\r
-            int iters;\r
+    GpuMat dx, dy;\r
+    GpuMat dx_buf, dy_buf;\r
+    GpuMat edgeBuf;\r
+    GpuMat trackBuf1, trackBuf2;\r
+    Ptr<FilterEngine_GPU> filterDX, filterDY;\r
+};\r
  \r
-            float edge_threshold;\r
-            float max_disc_threshold;\r
-            float sigma_range;\r
+////////////////////////////// Matrix reductions //////////////////////////////\r
  \r
-            GpuMat table_color;\r
-            GpuMat table_space;\r
-        };\r
+//! computes mean value and standard deviation of all or selected array elements\r
+//! supports only CV_8UC1 type\r
+CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);\r
+\r
+//! computes norm of array\r
+//! supports NORM_INF, NORM_L1, NORM_L2\r
+//! supports all matrices except 64F\r
+CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);\r
+\r
+//! computes norm of array\r
+//! supports NORM_INF, NORM_L1, NORM_L2\r
+//! supports all matrices except 64F\r
+CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);\r
+\r
+//! computes norm of the difference between two arrays\r
+//! supports NORM_INF, NORM_L1, NORM_L2\r
+//! supports only CV_8UC1 type\r
+CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);\r
+\r
+//! computes sum of array elements\r
+//! supports only single channel images\r
+CV_EXPORTS Scalar sum(const GpuMat& src);\r
+\r
+//! computes sum of array elements\r
+//! supports only single channel images\r
+CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);\r
+\r
+//! computes sum of array elements absolute values\r
+//! supports only single channel images\r
+CV_EXPORTS Scalar absSum(const GpuMat& src);\r
+\r
+//! computes sum of array elements absolute values\r
+//! supports only single channel images\r
+CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);\r
+\r
+//! computes squared sum of array elements\r
+//! supports only single channel images\r
+CV_EXPORTS Scalar sqrSum(const GpuMat& src);\r
+\r
+//! computes squared sum of array elements\r
+//! supports only single channel images\r
+CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);\r
+\r
+//! finds global minimum and maximum array elements and returns their values\r
+CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());\r
+\r
+//! finds global minimum and maximum array elements and returns their values\r
+CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);\r
+\r
+//! finds global minimum and maximum array elements and returns their values with locations\r
+CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,\r
+                          const GpuMat& mask=GpuMat());\r
+\r
+//! finds global minimum and maximum array elements and returns their values with locations\r
+CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,\r
+                          const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);\r
+\r
+//! counts non-zero array elements\r
+CV_EXPORTS int countNonZero(const GpuMat& src);\r
+\r
+//! counts non-zero array elements\r
+CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);\r
+\r
+//! reduces a matrix to a vector\r
+CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());\r
+\r
+\r
+///////////////////////////// Calibration 3D //////////////////////////////////\r
+\r
+CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,\r
+                                GpuMat& dst, Stream& stream = Stream::Null());\r
+\r
+CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,\r
+                              const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, \r
+                              Stream& stream = Stream::Null());\r
+\r
+CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,\r
+                               const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false,\r
+                               int num_iters=100, float max_dist=8.0, int min_inlier_count=100, \r
+                               std::vector<int>* inliers=NULL);\r
+\r
+//////////////////////////////// Image Labeling ////////////////////////////////\r
+\r
+//!performs labeling via graph cuts\r
+CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, \r
+                         GpuMat& buf, Stream& stream = Stream::Null());\r
+\r
+////////////////////////////////// Histograms //////////////////////////////////\r
+\r
+//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.\r
+CV_EXPORTS void evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel);\r
+//! Calculates histogram with evenly distributed bins for single channel source.\r
+//! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types.\r
+//! Output hist will have one row and histSize cols and CV_32SC1 type.\r
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());\r
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());\r
+//! Calculates histogram with evenly distributed bins for four-channel source.\r
+//! All channels of source are processed separately.\r
+//! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types.\r
+//! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type.\r
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());\r
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());\r
+//! Calculates histogram with bins determined by levels array.\r
+//! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.\r
+//! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types.\r
+//! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type.\r
+CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null());\r
+CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null());\r
+//! Calculates histogram with bins determined by levels array.\r
+//! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.\r
+//! All channels of source are processed separately.\r
+//! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.\r
+//! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type.\r
+CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());\r
+CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream = Stream::Null());\r
+\r
+//! Calculates histogram for an 8u single-channel image.\r
+//! Output hist will have one row, 256 cols and CV_32SC1 type.\r
+CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());\r
+CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());\r
+\r
+//! normalizes the grayscale image brightness and contrast by normalizing its histogram\r
+CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());\r
+CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());\r
+CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());\r
+\r
+//////////////////////////////// StereoBM_GPU ////////////////////////////////\r
+\r
+class CV_EXPORTS StereoBM_GPU\r
+{\r
+public:\r
+    enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };\r
  \r
+    enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };\r
  \r
-        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////\r
+    //! the default constructor\r
+    StereoBM_GPU();\r
+    //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be a multiple of 8.\r
+    StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);\r
  \r
-        struct CV_EXPORTS HOGDescriptor\r
-        {\r
-            enum { DEFAULT_WIN_SIGMA = -1 };\r
-            enum { DEFAULT_NLEVELS = 64 };\r
-            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };\r
+    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair\r
+    //! Output disparity has CV_8U type.\r
+    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());\r
  \r
-            HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),\r
-                          Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),\r
-                          int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,\r
-                          double threshold_L2hys=0.2, bool gamma_correction=true,\r
-                          int nlevels=DEFAULT_NLEVELS);\r
+    //! A heuristic that tries to estimate\r
+    // whether the current GPU will be faster than the CPU for this algorithm.\r
+    // It queries the currently active device.\r
+    static bool checkIfGpuCallReasonable();\r
  \r
-            size_t getDescriptorSize() const;\r
-            size_t getBlockHistogramSize() const;\r
+    int preset;\r
+    int ndisp;\r
+    int winSize;\r
  \r
-            void setSVMDetector(const vector<float>& detector);\r
+    // If avergeTexThreshold == 0 then post-processing is disabled.\r
+    // If avergeTexThreshold != 0 then disparity is set to 0 in each point (x,y) where for the left image\r
+    // SumOfHorizontalGradientsInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold,\r
+    // i.e. the input left image is low textured.\r
+    float avergeTexThreshold;\r
  \r
-            static vector<float> getDefaultPeopleDetector();\r
-            static vector<float> getPeopleDetector48x96();\r
-            static vector<float> getPeopleDetector64x128();\r
+private:\r
+    GpuMat minSSD, leBuf, riBuf;\r
+};\r
  \r
-            void detect(const GpuMat& img, vector<Point>& found_locations, \r
-                        double hit_threshold=0, Size win_stride=Size(), \r
-                        Size padding=Size());\r
+////////////////////////// StereoBeliefPropagation ///////////////////////////\r
+// "Efficient Belief Propagation for Early Vision"\r
+// P.Felzenszwalb\r
  \r
-            void detectMultiScale(const GpuMat& img, vector<Rect>& found_locations,\r
-                                  double hit_threshold=0, Size win_stride=Size(), \r
-                                  Size padding=Size(), double scale0=1.05, \r
-                                  int group_threshold=2);\r
+class CV_EXPORTS StereoBeliefPropagation\r
+{\r
+public:\r
+    enum { DEFAULT_NDISP  = 64 };\r
+    enum { DEFAULT_ITERS  = 5  };\r
+    enum { DEFAULT_LEVELS = 5  };\r
+\r
+    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);\r
+\r
+    //! the default constructor\r
+    explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,\r
+                                     int iters  = DEFAULT_ITERS,\r
+                                     int levels = DEFAULT_LEVELS,\r
+                                     int msg_type = CV_32F);\r
+\r
+    //! the full constructor taking the number of disparities, number of BP iterations on each level,\r
+    //! number of levels, truncation of data cost, data weight,\r
+    //! truncation of discontinuity cost and discontinuity single jump\r
+    //! DataTerm = data_weight * min(fabs(I2-I1), max_data_term)\r
+    //! DiscTerm = min(disc_single_jump * fabs(f1-f2), max_disc_term)\r
+    //! please see paper for more details\r
+    StereoBeliefPropagation(int ndisp, int iters, int levels,\r
+        float max_data_term, float data_weight,\r
+        float max_disc_term, float disc_single_jump,\r
+        int msg_type = CV_32F);\r
+\r
+    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,\r
+    //! if disparity is empty output type will be CV_16S else output type will be disparity.type().\r
+    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());\r
+\r
+\r
+    //! version for user specified data term\r
+    void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());\r
+\r
+    int ndisp;\r
+\r
+    int iters;\r
+    int levels;\r
+\r
+    float max_data_term;\r
+    float data_weight;\r
+    float max_disc_term;\r
+    float disc_single_jump;\r
+\r
+    int msg_type;\r
+private:\r
+    GpuMat u, d, l, r, u2, d2, l2, r2;\r
+    std::vector<GpuMat> datas;\r
+    GpuMat out;\r
+};\r
+\r
+/////////////////////////// StereoConstantSpaceBP ///////////////////////////\r
+// "A Constant-Space Belief Propagation Algorithm for Stereo Matching"\r
+// Qingxiong Yang, Liang Wang, Narendra Ahuja\r
+// http://vision.ai.uiuc.edu/~qyang6/\r
+\r
+class CV_EXPORTS StereoConstantSpaceBP\r
+{\r
+public:\r
+    enum { DEFAULT_NDISP    = 128 };\r
+    enum { DEFAULT_ITERS    = 8   };\r
+    enum { DEFAULT_LEVELS   = 4   };\r
+    enum { DEFAULT_NR_PLANE = 4   };\r
  \r
-            void getDescriptors(const GpuMat& img, Size win_stride, \r
-                                GpuMat& descriptors,\r
-                                int descr_format=DESCR_FORMAT_COL_BY_COL);\r
+    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);\r
  \r
-            Size win_size;\r
-            Size block_size;\r
-            Size block_stride;\r
-            Size cell_size;\r
-            int nbins;\r
-            double win_sigma;\r
-            double threshold_L2hys;\r
-            bool gamma_correction;\r
-            int nlevels;\r
+    //! the default constructor\r
+    explicit StereoConstantSpaceBP(int ndisp    = DEFAULT_NDISP,\r
+                                   int iters    = DEFAULT_ITERS,\r
+                                   int levels   = DEFAULT_LEVELS,\r
+                                   int nr_plane = DEFAULT_NR_PLANE,\r
+                                   int msg_type = CV_32F);\r
  \r
-        protected:\r
-            void computeBlockHistograms(const GpuMat& img);\r
-            void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);\r
+    //! the full constructor taking the number of disparities, number of BP iterations on each level,\r
+    //! number of levels, number of active disparities on the first level, truncation of data cost, data weight,\r
+    //! truncation of discontinuity cost, discontinuity single jump and minimum disparity threshold\r
+    StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,\r
+        float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,\r
+        int min_disp_th = 0,\r
+        int msg_type = CV_32F);\r
  \r
-            double getWinSigma() const;\r
-            bool checkDetectorSize() const;\r
+    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,\r
+    //! if disparity is empty output type will be CV_16S else output type will be disparity.type().\r
+    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());\r
  \r
-            static int numPartsWithin(int size, int part_size, int stride);\r
-            static Size numPartsWithin(Size size, Size part_size, Size stride);\r
+    int ndisp;\r
  \r
-            // Coefficients of the separating plane\r
-            float free_coef;\r
-            GpuMat detector;\r
-\r
-            // Results of the last classification step\r
-            GpuMat labels, labels_buf;\r
-            Mat labels_host;\r
-\r
-            // Results of the last histogram evaluation step\r
-            GpuMat block_hists, block_hists_buf;\r
-\r
-            // Gradients conputation results\r
-            GpuMat grad, qangle, grad_buf, qangle_buf;\r
-\r
-                       // returns subbuffer with required size, reallocates buffer if nessesary.\r
-                       static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);\r
-                       static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);\r
-\r
-                       std::vector<GpuMat> image_scales;\r
-        };\r
-\r
-\r
-        ////////////////////////////////// BruteForceMatcher //////////////////////////////////\r
-\r
-        class CV_EXPORTS BruteForceMatcher_GPU_base\r
-        {\r
-        public:\r
-            enum DistType {L1Dist = 0, L2Dist, HammingDist};\r
-\r
-            explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist);\r
-\r
-            // Add descriptors to train descriptor collection\r
-            void add(const std::vector<GpuMat>& descCollection);\r
-\r
-            // Get train descriptors collection\r
-            const std::vector<GpuMat>& getTrainDescriptors() const;\r
-\r
-            // Clear train descriptors collection\r
-            void clear();\r
-\r
-            // Return true if there are not train descriptors in collection\r
-            bool empty() const;\r
-\r
-            // Return true if the matcher supports mask in match methods\r
-            bool isMaskSupported() const;\r
-\r
-            // Find one best match for each query descriptor\r
-            void matchSingle(const GpuMat& query, const GpuMat& train, \r
-                GpuMat& trainIdx, GpuMat& distance, \r
-                const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());\r
-\r
-            // Download trainIdx and distance and convert it to CPU vector with DMatch\r
-            static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);\r
-            // Convert trainIdx and distance to vector with DMatch\r
-            static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);\r
-\r
-            // Find one best match for each query descriptor\r
-            void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());\r
-\r
-            // Make gpu collection of trains and masks in suitable format for matchCollection function\r
-            void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());\r
-\r
-            // Find one best match from train collection for each query descriptor\r
-            void matchCollection(const GpuMat& query, const GpuMat& trainCollection, \r
-                GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,\r
-                const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());\r
-\r
-            // Download trainIdx, imgIdx and distance and convert it to vector with DMatch\r
-            static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);\r
-            // Convert trainIdx, imgIdx and distance to vector with DMatch\r
-            static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);\r
-\r
-            // Find one best match from train collection for each query descriptor.\r
-            void match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());\r
-\r
-            // Find k best matches for each query descriptor (in increasing order of distances)\r
-            void knnMatchSingle(const GpuMat& query, const GpuMat& train,\r
-                GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,\r
-                const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());\r
-\r
-            // Download trainIdx and distance and convert it to vector with DMatch\r
-            // compactResult is used when mask is not empty. If compactResult is false matches\r
-            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
-            // matches vector will not contain matches for fully masked out query descriptors.\r
-            static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,\r
-                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
-            // Convert trainIdx and distance to vector with DMatch\r
-            static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,\r
-                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
-\r
-            // Find k best matches for each query descriptor (in increasing order of distances).\r
-            // compactResult is used when mask is not empty. If compactResult is false matches\r
-            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
-            // matches vector will not contain matches for fully masked out query descriptors.\r
-            void knnMatch(const GpuMat& query, const GpuMat& train,\r
-                std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),\r
-                bool compactResult = false);\r
-\r
-            // Find k best matches from train collection for each query descriptor (in increasing order of distances)\r
-            void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,\r
-                GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,\r
-                const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());\r
-\r
-            // Download trainIdx and distance and convert it to vector with DMatch\r
-            // compactResult is used when mask is not empty. If compactResult is false matches\r
-            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
-            // matches vector will not contain matches for fully masked out query descriptors.\r
-            static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,\r
-                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
-            // Convert trainIdx and distance to vector with DMatch\r
-            static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,\r
-                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
-\r
-            // Find k best matches  for each query descriptor (in increasing order of distances).\r
-            // compactResult is used when mask is not empty. If compactResult is false matches\r
-            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
-            // matches vector will not contain matches for fully masked out query descriptors.\r
-            void knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,\r
-                const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);\r
-\r
-            // Find best matches for each query descriptor which have distance less than maxDistance.\r
-            // nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.\r
-            // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,\r
-            // because it didn't have enough memory.\r
-            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),\r
-            // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches\r
-            // Matches doesn't sorted.\r
-            void radiusMatchSingle(const GpuMat& query, const GpuMat& train,\r
-                GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,\r
-                const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());\r
-\r
-            // Download trainIdx, nMatches and distance and convert it to vector with DMatch.\r
-            // matches will be sorted in increasing order of distances.\r
-            // compactResult is used when mask is not empty. If compactResult is false matches\r
-            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
-            // matches vector will not contain matches for fully masked out query descriptors.\r
-            static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,\r
-                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
-            // Convert trainIdx, nMatches and distance to vector with DMatch.\r
-            static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,\r
-                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
-\r
-            // Find best matches for each query descriptor which have distance less than maxDistance\r
-            // in increasing order of distances).\r
-            void radiusMatch(const GpuMat& query, const GpuMat& train,\r
-                std::vector< std::vector<DMatch> >& matches, float maxDistance,\r
-                const GpuMat& mask = GpuMat(), bool compactResult = false);\r
-\r
-            // Find best matches for each query descriptor which have distance less than maxDistance.\r
-            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),\r
-            // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches\r
-            // Matches doesn't sorted.\r
-            void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,\r
-                const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());\r
-\r
-            // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.\r
-            // matches will be sorted in increasing order of distances.\r
-            // compactResult is used when mask is not empty. If compactResult is false matches\r
-            // vector will have the same size as queryDescriptors rows. If compactResult is true\r
-            // matches vector will not contain matches for fully masked out query descriptors.\r
-            static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,\r
-                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
-            // Convert trainIdx, nMatches and distance to vector with DMatch.\r
-            static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,\r
-                std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
-\r
-            // Find best matches from train collection for each query descriptor which have distance less than\r
-            // maxDistance (in increasing order of distances).\r
-            void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,\r
-                const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);\r
-\r
-            DistType distType;\r
-\r
-        private:\r
-            std::vector<GpuMat> trainDescCollection;\r
-        };\r
-\r
-        template <class Distance>\r
-        class CV_EXPORTS BruteForceMatcher_GPU;\r
-\r
-        template <typename T>\r
-        class CV_EXPORTS BruteForceMatcher_GPU< L1<T> > : public BruteForceMatcher_GPU_base\r
-        {\r
-        public:\r
-            explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L1Dist) {}\r
-            explicit BruteForceMatcher_GPU(L1<T> /*d*/) : BruteForceMatcher_GPU_base(L1Dist) {}\r
-        };\r
-        template <typename T>\r
-        class CV_EXPORTS BruteForceMatcher_GPU< L2<T> > : public BruteForceMatcher_GPU_base\r
-        {\r
-        public:\r
-            explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {}\r
-            explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}\r
-        };\r
-        template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BruteForceMatcher_GPU_base\r
-        {\r
-        public:\r
-            explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {}\r
-            explicit BruteForceMatcher_GPU(Hamming /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {}\r
-        };\r
-\r
-        ////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////\r
-        // The cascade classifier class for object detection.\r
-        class CV_EXPORTS CascadeClassifier_GPU\r
-        {\r
-        public:\r
-            CascadeClassifier_GPU();\r
-            CascadeClassifier_GPU(const string& filename);\r
-            ~CascadeClassifier_GPU();\r
-\r
-            bool empty() const;\r
-            bool load(const string& filename);\r
-            void release();\r
-\r
-            /* returns number of detected objects */\r
-            int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());\r
-\r
-            bool findLargestObject;\r
-            bool visualizeInPlace;\r
-\r
-            Size getClassifierSize() const;\r
-        private:\r
-\r
-            struct CascadeClassifierImpl;\r
-            CascadeClassifierImpl* impl;\r
-        };\r
-\r
-        ////////////////////////////////// SURF //////////////////////////////////////////\r
-\r
-        class CV_EXPORTS SURF_GPU : public CvSURFParams\r
-        {\r
-        public:\r
-            enum KeypointLayout \r
-            {\r
-                SF_X = 0,\r
-                SF_Y,\r
-                SF_LAPLACIAN,\r
-                SF_SIZE,\r
-                SF_DIR,\r
-                SF_HESSIAN,\r
-                SF_FEATURE_STRIDE\r
-            };\r
-\r
-            //! the default constructor\r
-            SURF_GPU();\r
-            //! the full constructor taking all the necessary parameters\r
-            explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,\r
-                 int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);\r
-\r
-            //! returns the descriptor size in float's (64 or 128)\r
-            int descriptorSize() const;\r
-\r
-            //! upload host keypoints to device memory\r
-            void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);\r
-            //! download keypoints from device to host memory\r
-            void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);\r
-\r
-            //! download descriptors from device to host memory\r
-            void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);\r
-            \r
-            //! finds the keypoints using fast hessian detector used in SURF\r
-            //! supports CV_8UC1 images\r
-            //! keypoints will have nFeature cols and 6 rows\r
-            //! keypoints.ptr<float>(SF_X)[i] will contain x coordinate of i'th feature\r
-            //! keypoints.ptr<float>(SF_Y)[i] will contain y coordinate of i'th feature\r
-            //! keypoints.ptr<float>(SF_LAPLACIAN)[i] will contain laplacian sign of i'th feature\r
-            //! keypoints.ptr<float>(SF_SIZE)[i] will contain size of i'th feature\r
-            //! keypoints.ptr<float>(SF_DIR)[i] will contain orientation of i'th feature\r
-            //! keypoints.ptr<float>(SF_HESSIAN)[i] will contain response of i'th feature\r
-            void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);\r
-            //! finds the keypoints and computes their descriptors. \r
-            //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction\r
-            void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, \r
-                bool useProvidedKeypoints = false);\r
-\r
-            void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);\r
-            void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, \r
-                bool useProvidedKeypoints = false);\r
-\r
-            void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, \r
-                bool useProvidedKeypoints = false);\r
-\r
-            void releaseMemory();\r
-\r
-            //! max keypoints = min(keypointsRatio * img.size().area(), 65535)\r
-            float keypointsRatio;\r
-\r
-            GpuMat sum, mask1, maskSum, intBuffer;\r
-\r
-            GpuMat det, trace;\r
-\r
-            GpuMat maxPosBuffer;\r
-        };\r
-\r
-        ////////////////////////////////// Optical Flow //////////////////////////////////////////\r
-\r
-        class CV_EXPORTS BroxOpticalFlow\r
-        {\r
-        public:\r
-            BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :\r
-                alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_), \r
-                inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)\r
-            {\r
-            }\r
-\r
-            //! Compute optical flow\r
-            //! frame0 - source frame (supports only CV_32FC1 type)\r
-            //! frame1 - frame to track (with the same size and type as frame0)\r
-            //! u      - flow horizontal component (along x axis)\r
-            //! v      - flow vertical component (along y axis)\r
-            void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());\r
-\r
-            //! flow smoothness\r
-               float alpha;\r
-\r
-               //! gradient constancy importance\r
-               float gamma;\r
-\r
-               //! pyramid scale factor\r
-               float scale_factor;\r
-\r
-               //! number of lagged non-linearity iterations (inner loop)\r
-               int inner_iterations;\r
-\r
-               //! number of warping iterations (number of pyramid levels)\r
-               int outer_iterations;\r
-\r
-               //! number of linear system solver iterations\r
-               int solver_iterations;\r
-\r
-            GpuMat buf;\r
-        };\r
-\r
-        //! Interpolate frames (images) using provided optical flow (displacement field).\r
-        //! frame0   - frame 0 (32-bit floating point images, single channel)\r
-        //! frame1   - frame 1 (the same type and size)\r
-        //! fu       - forward horizontal displacement\r
-        //! fv       - forward vertical displacement\r
-        //! bu       - backward horizontal displacement\r
-        //! bv       - backward vertical displacement\r
-        //! pos      - new frame position\r
-        //! newFrame - new frame\r
-        //! buf      - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat;\r
-        //!            occlusion masks            0, occlusion masks            1,\r
-        //!            interpolated forward flow  0, interpolated forward flow  1,\r
-        //!            interpolated backward flow 0, interpolated backward flow 1\r
-        //!            \r
-        CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, \r
-            const GpuMat& fu, const GpuMat& fv,\r
-            const GpuMat& bu, const GpuMat& bv, \r
-            float pos, GpuMat& newFrame, GpuMat& buf,\r
-            Stream& stream = Stream::Null());\r
+    int iters;\r
+    int levels;\r
  \r
-    }\r
+    int nr_plane;\r
  \r
-    //! Speckle filtering - filters small connected components on diparity image.\r
-    //! It sets pixel (x,y) to newVal if it coresponds to small CC with size < maxSpeckleSize.\r
-    //! Threshold for border between CC is diffThreshold;\r
-    CV_EXPORTS void filterSpeckles( Mat& img, uchar newVal, int maxSpeckleSize, uchar diffThreshold, Mat& buf);\r
+    float max_data_term;\r
+    float data_weight;\r
+    float max_disc_term;\r
+    float disc_single_jump;\r
  \r
-}\r
-#include "opencv2/gpu/matrix_operations.hpp"\r
+    int min_disp_th;\r
+\r
+    int msg_type;\r
+\r
+    bool use_local_init_data_cost;\r
+private:\r
+    GpuMat u[2], d[2], l[2], r[2];\r
+    GpuMat disp_selected_pyr[2];\r
+\r
+    GpuMat data_cost;\r
+    GpuMat data_cost_selected;\r
+\r
+    GpuMat temp;\r
+\r
+    GpuMat out;\r
+};\r
+\r
+/////////////////////////// DisparityBilateralFilter ///////////////////////////\r
+// Disparity map refinement using joint bilateral filtering given a single color image.\r
+// Qingxiong Yang, Liang Wang, Narendra Ahuja\r
+// http://vision.ai.uiuc.edu/~qyang6/\r
+\r
+class CV_EXPORTS DisparityBilateralFilter\r
+{\r
+public:\r
+    enum { DEFAULT_NDISP  = 64 };\r
+    enum { DEFAULT_RADIUS = 3 };\r
+    enum { DEFAULT_ITERS  = 1 };\r
+\r
+    //! the default constructor\r
+    explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);\r
+\r
+    //! the full constructor taking the number of disparities, filter radius,\r
+    //! number of iterations, truncation of data continuity, truncation of disparity continuity\r
+    //! and filter range sigma\r
+    DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range);\r
+\r
+    //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.\r
+    //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.\r
+    void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null());\r
+\r
+private:\r
+    int ndisp;\r
+    int radius;\r
+    int iters;\r
+\r
+    float edge_threshold;\r
+    float max_disc_threshold;\r
+    float sigma_range;\r
+\r
+    GpuMat table_color;\r
+    GpuMat table_space;\r
+};\r
+\r
+\r
+//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////\r
+\r
+struct CV_EXPORTS HOGDescriptor\r
+{\r
+    enum { DEFAULT_WIN_SIGMA = -1 };\r
+    enum { DEFAULT_NLEVELS = 64 };\r
+    enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };\r
+\r
+    HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),\r
+                  Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),\r
+                  int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,\r
+                  double threshold_L2hys=0.2, bool gamma_correction=true,\r
+                  int nlevels=DEFAULT_NLEVELS);\r
+\r
+    size_t getDescriptorSize() const;\r
+    size_t getBlockHistogramSize() const;\r
+\r
+    void setSVMDetector(const vector<float>& detector);\r
+\r
+    static vector<float> getDefaultPeopleDetector();\r
+    static vector<float> getPeopleDetector48x96();\r
+    static vector<float> getPeopleDetector64x128();\r
+\r
+    void detect(const GpuMat& img, vector<Point>& found_locations, \r
+                double hit_threshold=0, Size win_stride=Size(), \r
+                Size padding=Size());\r
+\r
+    void detectMultiScale(const GpuMat& img, vector<Rect>& found_locations,\r
+                          double hit_threshold=0, Size win_stride=Size(), \r
+                          Size padding=Size(), double scale0=1.05, \r
+                          int group_threshold=2);\r
+\r
+    void getDescriptors(const GpuMat& img, Size win_stride, \r
+                        GpuMat& descriptors,\r
+                        int descr_format=DESCR_FORMAT_COL_BY_COL);\r
+\r
+    Size win_size;\r
+    Size block_size;\r
+    Size block_stride;\r
+    Size cell_size;\r
+    int nbins;\r
+    double win_sigma;\r
+    double threshold_L2hys;\r
+    bool gamma_correction;\r
+    int nlevels;\r
+\r
+protected:\r
+    void computeBlockHistograms(const GpuMat& img);\r
+    void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);\r
+\r
+    double getWinSigma() const;\r
+    bool checkDetectorSize() const;\r
+\r
+    static int numPartsWithin(int size, int part_size, int stride);\r
+    static Size numPartsWithin(Size size, Size part_size, Size stride);\r
+\r
+    // Coefficients of the separating plane\r
+    float free_coef;\r
+    GpuMat detector;\r
+\r
+    // Results of the last classification step\r
+    GpuMat labels, labels_buf;\r
+    Mat labels_host;\r
+\r
+    // Results of the last histogram evaluation step\r
+    GpuMat block_hists, block_hists_buf;\r
+\r
+    // Gradients computation results\r
+    GpuMat grad, qangle, grad_buf, qangle_buf;\r
+\r
+       // returns subbuffer with required size, reallocates buffer if necessary.\r
+       static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);\r
+       static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);\r
+\r
+       std::vector<GpuMat> image_scales;\r
+};\r
+\r
+\r
+////////////////////////////////// BruteForceMatcher //////////////////////////////////\r
+\r
+class CV_EXPORTS BruteForceMatcher_GPU_base\r
+{\r
+public:\r
+    enum DistType {L1Dist = 0, L2Dist, HammingDist};\r
+\r
+    explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist);\r
+\r
+    // Add descriptors to train descriptor collection\r
+    void add(const std::vector<GpuMat>& descCollection);\r
+\r
+    // Get train descriptors collection\r
+    const std::vector<GpuMat>& getTrainDescriptors() const;\r
+\r
+    // Clear train descriptors collection\r
+    void clear();\r
+\r
+    // Return true if there are no train descriptors in the collection\r
+    bool empty() const;\r
+\r
+    // Return true if the matcher supports mask in match methods\r
+    bool isMaskSupported() const;\r
+\r
+    // Find one best match for each query descriptor\r
+    void matchSingle(const GpuMat& query, const GpuMat& train, \r
+        GpuMat& trainIdx, GpuMat& distance, \r
+        const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());\r
+\r
+    // Download trainIdx and distance and convert it to CPU vector with DMatch\r
+    static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);\r
+    // Convert trainIdx and distance to vector with DMatch\r
+    static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);\r
+\r
+    // Find one best match for each query descriptor\r
+    void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());\r
+\r
+    // Make gpu collection of trains and masks in suitable format for matchCollection function\r
+    void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());\r
+\r
+    // Find one best match from train collection for each query descriptor\r
+    void matchCollection(const GpuMat& query, const GpuMat& trainCollection, \r
+        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,\r
+        const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());\r
+\r
+    // Download trainIdx, imgIdx and distance and convert it to vector with DMatch\r
+    static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);\r
+    // Convert trainIdx, imgIdx and distance to vector with DMatch\r
+    static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);\r
+\r
+    // Find one best match from train collection for each query descriptor.\r
+    void match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());\r
+\r
+    // Find k best matches for each query descriptor (in increasing order of distances)\r
+    void knnMatchSingle(const GpuMat& query, const GpuMat& train,\r
+        GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,\r
+        const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());\r
+\r
+    // Download trainIdx and distance and convert it to vector with DMatch\r
+    // compactResult is used when mask is not empty. If compactResult is false matches\r
+    // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+    // matches vector will not contain matches for fully masked out query descriptors.\r
+    static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,\r
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+    // Convert trainIdx and distance to vector with DMatch\r
+    static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,\r
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+\r
+    // Find k best matches for each query descriptor (in increasing order of distances).\r
+    // compactResult is used when mask is not empty. If compactResult is false matches\r
+    // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+    // matches vector will not contain matches for fully masked out query descriptors.\r
+    void knnMatch(const GpuMat& query, const GpuMat& train,\r
+        std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),\r
+        bool compactResult = false);\r
+\r
+    // Find k best matches from train collection for each query descriptor (in increasing order of distances)\r
+    void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,\r
+        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,\r
+        const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());\r
+\r
+    // Download trainIdx, imgIdx and distance and convert it to vector with DMatch\r
+    // compactResult is used when mask is not empty. If compactResult is false matches\r
+    // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+    // matches vector will not contain matches for fully masked out query descriptors.\r
+    static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,\r
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+    // Convert trainIdx, imgIdx and distance to vector with DMatch\r
+    static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,\r
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+\r
+    // Find k best matches  for each query descriptor (in increasing order of distances).\r
+    // compactResult is used when mask is not empty. If compactResult is false matches\r
+    // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+    // matches vector will not contain matches for fully masked out query descriptors.\r
+    void knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,\r
+        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);\r
+\r
+    // Find best matches for each query descriptor which have distance less than maxDistance.\r
+    // nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.\r
+    // Note that nMatches can be greater than trainIdx.cols - it means that the matcher didn't find all matches,\r
+    // because it didn't have enough memory.\r
+    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),\r
+    // otherwise the user can pass preallocated trainIdx and distance with size nQuery x nMaxMatches.\r
+    // Matches are not sorted.\r
+    void radiusMatchSingle(const GpuMat& query, const GpuMat& train,\r
+        GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,\r
+        const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());\r
+\r
+    // Download trainIdx, nMatches and distance and convert it to vector with DMatch.\r
+    // matches will be sorted in increasing order of distances.\r
+    // compactResult is used when mask is not empty. If compactResult is false matches\r
+    // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+    // matches vector will not contain matches for fully masked out query descriptors.\r
+    static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,\r
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+    // Convert trainIdx, nMatches and distance to vector with DMatch.\r
+    static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,\r
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+\r
+    // Find best matches for each query descriptor which have distance less than maxDistance\r
+    // (in increasing order of distances).\r
+    void radiusMatch(const GpuMat& query, const GpuMat& train,\r
+        std::vector< std::vector<DMatch> >& matches, float maxDistance,\r
+        const GpuMat& mask = GpuMat(), bool compactResult = false);\r
+\r
+    // Find best matches for each query descriptor which have distance less than maxDistance.\r
+    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),\r
+    // otherwise the user can pass preallocated trainIdx and distance with size nQuery x nMaxMatches.\r
+    // Matches are not sorted.\r
+    void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,\r
+        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());\r
+\r
+    // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.\r
+    // matches will be sorted in increasing order of distances.\r
+    // compactResult is used when mask is not empty. If compactResult is false matches\r
+    // vector will have the same size as queryDescriptors rows. If compactResult is true\r
+    // matches vector will not contain matches for fully masked out query descriptors.\r
+    static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,\r
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+    // Convert trainIdx, imgIdx, nMatches and distance to vector with DMatch.\r
+    static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,\r
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);\r
+\r
+    // Find best matches from train collection for each query descriptor which have distance less than\r
+    // maxDistance (in increasing order of distances).\r
+    void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,\r
+        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);\r
+\r
+    DistType distType;\r
+\r
+private:\r
+    std::vector<GpuMat> trainDescCollection;\r
+};\r
+\r
+template <class Distance>\r
+class CV_EXPORTS BruteForceMatcher_GPU;\r
+\r
+template <typename T>\r
+class CV_EXPORTS BruteForceMatcher_GPU< L1<T> > : public BruteForceMatcher_GPU_base\r
+{\r
+public:\r
+    explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L1Dist) {}\r
+    explicit BruteForceMatcher_GPU(L1<T> /*d*/) : BruteForceMatcher_GPU_base(L1Dist) {}\r
+};\r
+template <typename T>\r
+class CV_EXPORTS BruteForceMatcher_GPU< L2<T> > : public BruteForceMatcher_GPU_base\r
+{\r
+public:\r
+    explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {}\r
+    explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}\r
+};\r
+template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BruteForceMatcher_GPU_base\r
+{\r
+public:\r
+    explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {}\r
+    explicit BruteForceMatcher_GPU(Hamming /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {}\r
+};\r
+\r
+////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////\r
+// The cascade classifier class for object detection.\r
+class CV_EXPORTS CascadeClassifier_GPU\r
+{\r
+public:\r
+    CascadeClassifier_GPU();\r
+    CascadeClassifier_GPU(const std::string& filename);\r
+    ~CascadeClassifier_GPU();\r
+\r
+    bool empty() const;\r
+    bool load(const std::string& filename);\r
+    void release();\r
+\r
+    /* returns number of detected objects */\r
+    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());\r
+\r
+    bool findLargestObject;\r
+    bool visualizeInPlace;\r
+\r
+    Size getClassifierSize() const;\r
+private:\r
+\r
+    struct CascadeClassifierImpl;\r
+    CascadeClassifierImpl* impl;\r
+};\r
+\r
+////////////////////////////////// SURF //////////////////////////////////////////\r
+\r
+class CV_EXPORTS SURF_GPU : public CvSURFParams\r
+{\r
+public:\r
+    enum KeypointLayout \r
+    {\r
+        SF_X = 0,\r
+        SF_Y,\r
+        SF_LAPLACIAN,\r
+        SF_SIZE,\r
+        SF_DIR,\r
+        SF_HESSIAN,\r
+        SF_FEATURE_STRIDE\r
+    };\r
+\r
+    //! the default constructor\r
+    SURF_GPU();\r
+    //! the full constructor taking all the necessary parameters\r
+    explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,\r
+         int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);\r
+\r
+    //! returns the descriptor size in floats (64 or 128)\r
+    int descriptorSize() const;\r
+\r
+    //! upload host keypoints to device memory\r
+    void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);\r
+    //! download keypoints from device to host memory\r
+    void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);\r
+\r
+    //! download descriptors from device to host memory\r
+    void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);\r
+    \r
+    //! finds the keypoints using fast hessian detector used in SURF\r
+    //! supports CV_8UC1 images\r
+    //! keypoints will have nFeature cols and 6 rows\r
+    //! keypoints.ptr<float>(SF_X)[i] will contain x coordinate of i'th feature\r
+    //! keypoints.ptr<float>(SF_Y)[i] will contain y coordinate of i'th feature\r
+    //! keypoints.ptr<float>(SF_LAPLACIAN)[i] will contain laplacian sign of i'th feature\r
+    //! keypoints.ptr<float>(SF_SIZE)[i] will contain size of i'th feature\r
+    //! keypoints.ptr<float>(SF_DIR)[i] will contain orientation of i'th feature\r
+    //! keypoints.ptr<float>(SF_HESSIAN)[i] will contain response of i'th feature\r
+    void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);\r
+    //! finds the keypoints and computes their descriptors. \r
+    //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction\r
+    void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, \r
+        bool useProvidedKeypoints = false);\r
+\r
+    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);\r
+    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, \r
+        bool useProvidedKeypoints = false);\r
+\r
+    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, \r
+        bool useProvidedKeypoints = false);\r
+\r
+    void releaseMemory();\r
+\r
+    //! max keypoints = min(keypointsRatio * img.size().area(), 65535)\r
+    float keypointsRatio;\r
+\r
+    GpuMat sum, mask1, maskSum, intBuffer;\r
+\r
+    GpuMat det, trace;\r
+\r
+    GpuMat maxPosBuffer;\r
+};\r
+\r
+////////////////////////////////// Optical Flow //////////////////////////////////////////\r
+\r
+class CV_EXPORTS BroxOpticalFlow\r
+{\r
+public:\r
+    BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :\r
+        alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_), \r
+        inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)\r
+    {\r
+    }\r
+\r
+    //! Compute optical flow\r
+    //! frame0 - source frame (supports only CV_32FC1 type)\r
+    //! frame1 - frame to track (with the same size and type as frame0)\r
+    //! u      - flow horizontal component (along x axis)\r
+    //! v      - flow vertical component (along y axis)\r
+    void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());\r
+\r
+    //! flow smoothness\r
+    float alpha;\r
+\r
+    //! gradient constancy importance\r
+    float gamma;\r
+\r
+    //! pyramid scale factor\r
+    float scale_factor;\r
+\r
+    //! number of lagged non-linearity iterations (inner loop)\r
+    int inner_iterations;\r
+\r
+    //! number of warping iterations (number of pyramid levels)\r
+    int outer_iterations;\r
+\r
+    //! number of linear system solver iterations\r
+    int solver_iterations;\r
+\r
+    GpuMat buf;\r
+};\r
+\r
+//! Interpolate frames (images) using provided optical flow (displacement field).\r
+//! frame0   - frame 0 (32-bit floating point images, single channel)\r
+//! frame1   - frame 1 (the same type and size)\r
+//! fu       - forward horizontal displacement\r
+//! fv       - forward vertical displacement\r
+//! bu       - backward horizontal displacement\r
+//! bv       - backward vertical displacement\r
+//! pos      - new frame position\r
+//! newFrame - new frame\r
+//! buf      - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat;\r
+//!            occlusion masks            0, occlusion masks            1,\r
+//!            interpolated forward flow  0, interpolated forward flow  1,\r
+//!            interpolated backward flow 0, interpolated backward flow 1\r
+//!            \r
+CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, \r
+                                  const GpuMat& fu, const GpuMat& fv,\r
+                                  const GpuMat& bu, const GpuMat& bv, \r
+                                  float pos, GpuMat& newFrame, GpuMat& buf,\r
+                                  Stream& stream = Stream::Null());\r
+\r
+} // namespace gpu\r
+\r
+//! Speckle filtering - filters small connected components on disparity image.\r
+//! It sets pixel (x,y) to newVal if it corresponds to small CC with size < maxSpeckleSize.\r
+//! Threshold for border between CC is diffThreshold.\r
+CV_EXPORTS void filterSpeckles(Mat& img, uchar newVal, int maxSpeckleSize, uchar diffThreshold, Mat& buf);\r
+\r
+} // namespace cv\r
  \r
  #endif /* __OPENCV_GPU_HPP__ */\r
diff --git a/modules/gpu/include/opencv2/gpu/gpumat.hpp b/modules/gpu/include/opencv2/gpu/gpumat.hpp

index e36a94a..3baff61 100644 (file)
--- a/modules/gpu/include/opencv2/gpu/gpumat.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpumat.hpp
@@ -40,427 +40,4 @@
  //\r
  //M*/\r
  \r
-#ifndef __OPENCV_GPUMAT_HPP__\r
-#define __OPENCV_GPUMAT_HPP__\r
-\r
-#include "opencv2/core/core.hpp"\r
-#include "opencv2/gpu/devmem2d.hpp"\r
-\r
-namespace cv { namespace gpu\r
-{\r
-    //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.\r
-    class CV_EXPORTS GpuMat\r
-    {\r
-    public:\r
-        //! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.\r
-        // Contains just image size, data ptr and step.\r
-        template <class T> operator DevMem2D_<T>() const;\r
-        template <class T> operator PtrStep_<T>() const;\r
-               template <class T> operator PtrStep<T>() const;\r
-\r
-\r
-\r
-\r
-\r
-        //! builds GpuMat from Mat. Perfom blocking upload to device.\r
-        explicit GpuMat(const Mat& m);\r
-\r
-        //! pefroms blocking upload data to GpuMat.\r
-        void upload(const Mat& m);\r
-\r
-        //! downloads data from device to host memory. Blocking calls.\r
-        void download(Mat& m) const;\r
-        operator Mat() const\r
-        {\r
-            Mat m;\r
-            download(m);\r
-            return m;\r
-        }\r
-\r
-\r
-\r
-\r
-\r
-\r
-        //! default constructor\r
-        GpuMat();\r
-\r
-        //! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)\r
-        GpuMat(int rows, int cols, int type);\r
-        GpuMat(Size size, int type);\r
-\r
-        //! constucts GpuMatrix and fills it with the specified value _s.\r
-        GpuMat(int rows, int cols, int type, const Scalar& s);\r
-        GpuMat(Size size, int type, const Scalar& s);\r
-\r
-        //! copy constructor\r
-        GpuMat(const GpuMat& m);\r
-\r
-        //! constructor for GpuMatrix headers pointing to user-allocated data\r
-        GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);\r
-        GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);\r
-\r
-        //! creates a matrix header for a part of the bigger matrix\r
-        GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange);\r
-        GpuMat(const GpuMat& m, const Rect& roi);\r
-\r
-        //! destructor - calls release()\r
-        ~GpuMat();\r
-\r
-        //! assignment operators\r
-        GpuMat& operator = (const GpuMat& m);\r
-\r
-        //! returns a new GpuMatrix header for the specified row\r
-        GpuMat row(int y) const;\r
-        //! returns a new GpuMatrix header for the specified column\r
-        GpuMat col(int x) const;\r
-        //! ... for the specified row span\r
-        GpuMat rowRange(int startrow, int endrow) const;\r
-        GpuMat rowRange(const Range& r) const;\r
-        //! ... for the specified column span\r
-        GpuMat colRange(int startcol, int endcol) const;\r
-        GpuMat colRange(const Range& r) const;\r
-\r
-        //! returns deep copy of the GpuMatrix, i.e. the data is copied\r
-        GpuMat clone() const;\r
-        //! copies the GpuMatrix content to "m".\r
-        // It calls m.create(this->size(), this->type()).\r
-        void copyTo(GpuMat& m) const;\r
-        //! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements.\r
-        void copyTo(GpuMat& m, const GpuMat& mask) const;\r
-        //! converts GpuMatrix to another datatype with optional scalng. See cvConvertScale.\r
-        void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;\r
-\r
-        void assignTo(GpuMat& m, int type=-1) const;\r
-\r
-        //! sets every GpuMatrix element to s\r
-        GpuMat& operator = (const Scalar& s);\r
-        //! sets some of the GpuMatrix elements to s, according to the mask\r
-        GpuMat& setTo(const Scalar& s, const GpuMat& mask = GpuMat());\r
-        //! creates alternative GpuMatrix header for the same data, with different\r
-        // number of channels and/or different number of rows. see cvReshape.\r
-        GpuMat reshape(int cn, int rows = 0) const;\r
-\r
-        //! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type.\r
-        // previous data is unreferenced if needed.\r
-        void create(int rows, int cols, int type);\r
-        void create(Size size, int type);\r
-        //! decreases reference counter;\r
-        // deallocate the data when reference counter reaches 0.\r
-        void release();\r
-\r
-        //! swaps with other smart pointer\r
-        void swap(GpuMat& mat);\r
-\r
-        //! locates GpuMatrix header within a parent GpuMatrix. See below\r
-        void locateROI(Size& wholeSize, Point& ofs) const;\r
-        //! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix.\r
-        GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);\r
-        //! extracts a rectangular sub-GpuMatrix\r
-        // (this is a generalized form of row, rowRange etc.)\r
-        GpuMat operator()(Range rowRange, Range colRange) const;\r
-        GpuMat operator()(const Rect& roi) const;\r
-\r
-        //! returns true iff the GpuMatrix data is continuous\r
-        // (i.e. when there are no gaps between successive rows).\r
-        // similar to CV_IS_GpuMat_CONT(cvGpuMat->type)\r
-        bool isContinuous() const;\r
-        //! returns element size in bytes,\r
-        // similar to CV_ELEM_SIZE(cvMat->type)\r
-        size_t elemSize() const;\r
-        //! returns the size of element channel in bytes.\r
-        size_t elemSize1() const;\r
-        //! returns element type, similar to CV_MAT_TYPE(cvMat->type)\r
-        int type() const;\r
-        //! returns element type, similar to CV_MAT_DEPTH(cvMat->type)\r
-        int depth() const;\r
-        //! returns element type, similar to CV_MAT_CN(cvMat->type)\r
-        int channels() const;\r
-        //! returns step/elemSize1()\r
-        size_t step1() const;\r
-        //! returns GpuMatrix size:\r
-        // width == number of columns, height == number of rows\r
-        Size size() const;\r
-        //! returns true if GpuMatrix data is NULL\r
-        bool empty() const;\r
-\r
-        //! returns pointer to y-th row\r
-        uchar* ptr(int y = 0);\r
-        const uchar* ptr(int y = 0) const;\r
-\r
-        //! template version of the above method\r
-        template<typename _Tp> _Tp* ptr(int y = 0);\r
-        template<typename _Tp> const _Tp* ptr(int y = 0) const;\r
-\r
-        /*! includes several bit-fields:\r
-        - the magic signature\r
-        - continuity flag\r
-        - depth\r
-        - number of channels\r
-        */\r
-        int flags;\r
-\r
-        //! the number of rows and columns\r
-        int rows, cols;\r
-\r
-        //! a distance between successive rows in bytes; includes the gap if any\r
-        size_t step;\r
-\r
-        //! pointer to the data\r
-        uchar* data;\r
-\r
-        //! pointer to the reference counter;\r
-        // when GpuMatrix points to user-allocated data, the pointer is NULL\r
-        int* refcount;\r
-\r
-        //! helper fields used in locateROI and adjustROI\r
-        uchar* datastart;\r
-        uchar* dataend;\r
-    };\r
-\r
-    //! Creates continuous GPU matrix\r
-    CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);\r
-    CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type);\r
-    CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m);\r
-    CV_EXPORTS GpuMat createContinuous(Size size, int type);\r
-\r
-    //! Ensures that size of the given matrix is not less than (rows, cols) size\r
-    //! and matrix type is match specified one too\r
-    CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);\r
-    CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);\r
-\r
-    ////////////////////////////////////////////////////////////////////////\r
-\r
-    template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2D_<T>(rows, cols, (T*)data, step); }\r
-    template <class T> inline GpuMat::operator PtrStep_<T>() const { return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this)); }\r
-       template <class T> inline GpuMat::operator PtrStep<T>() const { return PtrStep<T>((T*)data, step); }    \r
-\r
-\r
-\r
-\r
-\r
-\r
-    inline GpuMat::GpuMat() \r
-        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) \r
-    {\r
-    }\r
-\r
-    inline GpuMat::GpuMat(int rows_, int cols_, int type_) \r
-        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)\r
-    {\r
-        if (rows_ > 0 && cols_ > 0)\r
-            create(rows_, cols_, type_);\r
-    }\r
-\r
-    inline GpuMat::GpuMat(Size size_, int type_) \r
-        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)\r
-    {\r
-        if (size_.height > 0 && size_.width > 0)\r
-            create(size_.height, size_.width, type_);\r
-    }\r
-\r
-    inline GpuMat::GpuMat(int rows_, int cols_, int type_, const Scalar& s_) \r
-        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)\r
-    {\r
-        if (rows_ > 0 && cols_ > 0)\r
-        {\r
-            create(rows_, cols_, type_);\r
-            setTo(s_);\r
-        }\r
-    }\r
-\r
-    inline GpuMat::GpuMat(Size size_, int type_, const Scalar& s_) \r
-        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)\r
-    {\r
-        if (size_.height > 0 && size_.width > 0)\r
-        {\r
-            create(size_.height, size_.width, type_);\r
-            setTo(s_);\r
-        }\r
-    }\r
-\r
-    inline GpuMat::~GpuMat() \r
-    { \r
-        release(); \r
-    }\r
-\r
-    inline GpuMat GpuMat::clone() const\r
-    {\r
-        GpuMat m;\r
-        copyTo(m);\r
-        return m;\r
-    }\r
-\r
-    inline void GpuMat::assignTo(GpuMat& m, int type) const\r
-    {\r
-        if (type < 0)\r
-            m = *this;\r
-        else\r
-            convertTo(m, type);\r
-    }\r
-\r
-    inline size_t GpuMat::step1() const \r
-    { \r
-        return step / elemSize1(); \r
-    }\r
-\r
-    inline bool GpuMat::empty() const \r
-    { \r
-        return data == 0; \r
-    }\r
-\r
-    template<typename _Tp> inline _Tp* GpuMat::ptr(int y)\r
-    {\r
-        return (_Tp*)ptr(y);\r
-    }\r
-\r
-    template<typename _Tp> inline const _Tp* GpuMat::ptr(int y) const\r
-    {\r
-        return (const _Tp*)ptr(y);\r
-    }\r
-\r
-    inline void swap(GpuMat& a, GpuMat& b) \r
-    { \r
-        a.swap(b); \r
-    }\r
-\r
-    inline GpuMat GpuMat::row(int y) const \r
-    { \r
-        return GpuMat(*this, Range(y, y+1), Range::all()); \r
-    }\r
-\r
-    inline GpuMat GpuMat::col(int x) const \r
-    { \r
-        return GpuMat(*this, Range::all(), Range(x, x+1)); \r
-    }\r
-\r
-    inline GpuMat GpuMat::rowRange(int startrow, int endrow) const \r
-    { \r
-        return GpuMat(*this, Range(startrow, endrow), Range::all()); \r
-    }\r
-\r
-    inline GpuMat GpuMat::rowRange(const Range& r) const \r
-    { \r
-        return GpuMat(*this, r, Range::all()); \r
-    }\r
-\r
-    inline GpuMat GpuMat::colRange(int startcol, int endcol) const \r
-    { \r
-        return GpuMat(*this, Range::all(), Range(startcol, endcol)); \r
-    }\r
-\r
-    inline GpuMat GpuMat::colRange(const Range& r) const \r
-    { \r
-        return GpuMat(*this, Range::all(), r); \r
-    }\r
-\r
-    inline void GpuMat::create(Size size_, int type_) \r
-    { \r
-        create(size_.height, size_.width, type_); \r
-    }\r
-\r
-    inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const \r
-    { \r
-        return GpuMat(*this, rowRange, colRange); \r
-    }\r
-\r
-    inline GpuMat GpuMat::operator()(const Rect& roi) const \r
-    { \r
-        return GpuMat(*this, roi); \r
-    }\r
-\r
-    inline bool GpuMat::isContinuous() const \r
-    { \r
-        return (flags & Mat::CONTINUOUS_FLAG) != 0; \r
-    }\r
-\r
-    inline size_t GpuMat::elemSize() const \r
-    { \r
-        return CV_ELEM_SIZE(flags); \r
-    }\r
-\r
-    inline size_t GpuMat::elemSize1() const \r
-    { \r
-        return CV_ELEM_SIZE1(flags); \r
-    }\r
-\r
-    inline int GpuMat::type() const \r
-    { \r
-        return CV_MAT_TYPE(flags); \r
-    }\r
-\r
-    inline int GpuMat::depth() const \r
-    { \r
-        return CV_MAT_DEPTH(flags); \r
-    }\r
-\r
-    inline int GpuMat::channels() const \r
-    { \r
-        return CV_MAT_CN(flags); \r
-    }\r
-\r
-    inline Size GpuMat::size() const \r
-    { \r
-        return Size(cols, rows); \r
-    }\r
-\r
-    inline unsigned char* GpuMat::ptr(int y)\r
-    {\r
-        CV_DbgAssert((unsigned)y < (unsigned)rows);\r
-        return data + step * y;\r
-    }\r
-\r
-    inline const unsigned char* GpuMat::ptr(int y) const\r
-    {\r
-        CV_DbgAssert((unsigned)y < (unsigned)rows);\r
-        return data + step * y;\r
-    }\r
-\r
-    inline GpuMat& GpuMat::operator = (const Scalar& s)\r
-    {\r
-        setTo(s);\r
-        return *this;\r
-    }\r
-\r
-    inline GpuMat createContinuous(int rows, int cols, int type)\r
-    {\r
-        GpuMat m;\r
-        createContinuous(rows, cols, type, m);\r
-        return m;\r
-    }\r
-\r
-    inline void createContinuous(Size size, int type, GpuMat& m)\r
-    {\r
-        createContinuous(size.height, size.width, type, m);\r
-    }\r
-\r
-    inline GpuMat createContinuous(Size size, int type)\r
-    {\r
-        GpuMat m;\r
-        createContinuous(size, type, m);\r
-        return m;\r
-    }\r
-\r
-    inline void ensureSizeIsEnough(Size size, int type, GpuMat& m)\r
-    {\r
-        ensureSizeIsEnough(size.height, size.width, type, m);\r
-    }\r
-\r
-    inline void createContinuous(int rows, int cols, int type, GpuMat& m)\r
-    {\r
-        int area = rows * cols;\r
-        if (!m.isContinuous() || m.type() != type || m.size().area() != area)\r
-            m.create(1, area, type);\r
-        m = m.reshape(0, rows);\r
-    }\r
-\r
-    inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)\r
-    {\r
-        if (m.type() == type && m.rows >= rows && m.cols >= cols)\r
-            m = m(Rect(0, 0, cols, rows));\r
-        else\r
-            m.create(rows, cols, type);\r
-    }\r
-}}\r
-\r
-#endif // __OPENCV_GPUMAT_HPP__\r
+#include "opencv2/core/gpumat.hpp"\r
diff --git a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp

deleted file mode 100644 (file)

index 5a6b1bb..0000000
--- a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////\r
-//\r
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
-//\r
-//  By downloading, copying, installing or using the software you agree to this license.\r
-//  If you do not agree to this license, do not download, install,\r
-//  copy or use the software.\r
-//\r
-//\r
-//                           License Agreement\r
-//                For Open Source Computer Vision Library\r
-//\r
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
-// Third party copyrights are property of their respective owners.\r
-//\r
-// Redistribution and use in source and binary forms, with or without modification,\r
-// are permitted provided that the following conditions are met:\r
-//\r
-//   * Redistribution's of source code must retain the above copyright notice,\r
-//     this list of conditions and the following disclaimer.\r
-//\r
-//   * Redistribution's in binary form must reproduce the above copyright notice,\r
-//     this list of conditions and the following disclaimer in the documentation\r
-//     and/or other GpuMaterials provided with the distribution.\r
-//\r
-//   * The name of the copyright holders may not be used to endorse or promote products\r
-//     derived from this software without specific prior written permission.\r
-//\r
-// This software is provided by the copyright holders and contributors "as is" and\r
-// any express or implied warranties, including, but not limited to, the implied\r
-// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
-// In no event shall the Intel Corporation or contributors be liable for any direct,\r
-// indirect, incidental, special, exemplary, or consequential damages\r
-// (including, but not limited to, procurement of substitute goods or services;\r
-// loss of use, data, or profits; or business interruption) however caused\r
-// and on any theory of liability, whether in contract, strict liability,\r
-// or tort (including negligence or otherwise) arising in any way out of\r
-// the use of this software, even if advised of the possibility of such damage.\r
-//\r
-//M*/\r
-\r
-#ifndef __OPENCV_GPU_MATRIX_OPERATIONS_HPP__\r
-#define __OPENCV_GPU_MATRIX_OPERATIONS_HPP__\r
-\r
-namespace cv\r
-{\r
-\r
-namespace gpu\r
-{\r
-///////////////////////////////////////////////////////////////////////\r
-//////////////////////////////// CudaMem ////////////////////////////////\r
-///////////////////////////////////////////////////////////////////////\r
-\r
-inline CudaMem::CudaMem()  : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) {}\r
-inline CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)\r
-{\r
-    if( _rows > 0 && _cols > 0 )\r
-        create( _rows, _cols, _type, _alloc_type);\r
-}\r
-\r
-inline CudaMem::CudaMem(Size _size, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)\r
-{\r
-    if( _size.height > 0 && _size.width > 0 )\r
-        create( _size.height, _size.width, _type, _alloc_type);\r
-}\r
-\r
-inline CudaMem::CudaMem(const CudaMem& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)\r
-{\r
-    if( refcount )\r
-        CV_XADD(refcount, 1);\r
-}\r
-\r
-inline CudaMem::CudaMem(const Mat& m, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)\r
-{\r
-    if( m.rows > 0 && m.cols > 0 )\r
-        create( m.size(), m.type(), _alloc_type);\r
-\r
-    Mat tmp = createMatHeader();\r
-    m.copyTo(tmp);\r
-}\r
-\r
-inline CudaMem::~CudaMem()\r
-{\r
-    release();\r
-\r
-}\r
-\r
-inline CudaMem& CudaMem::operator = (const CudaMem& m)\r
-{\r
-    if( this != &m )\r
-    {\r
-        if( m.refcount )\r
-            CV_XADD(m.refcount, 1);\r
-        release();\r
-        flags = m.flags;\r
-        rows = m.rows; cols = m.cols;\r
-        step = m.step; data = m.data;\r
-        datastart = m.datastart;\r
-        dataend = m.dataend;\r
-        refcount = m.refcount;\r
-        alloc_type = m.alloc_type;\r
-    }\r
-    return *this;\r
-}\r
-\r
-inline CudaMem CudaMem::clone() const\r
-{\r
-    CudaMem m(size(), type(), alloc_type);\r
-    Mat to = m;\r
-    Mat from = *this;\r
-    from.copyTo(to);\r
-    return m;\r
-}\r
-\r
-inline void CudaMem::create(Size _size, int _type, int _alloc_type) { create(_size.height, _size.width, _type, _alloc_type); }\r
-\r
-\r
-//CCP void CudaMem::create(int _rows, int _cols, int _type, int _alloc_type);\r
-//CPP void CudaMem::release();\r
-\r
-inline Mat CudaMem::createMatHeader() const { return Mat(size(), type(), data, step); }\r
-inline CudaMem::operator Mat() const { return createMatHeader(); }\r
-\r
-inline CudaMem::operator GpuMat() const { return createGpuMatHeader(); }\r
-//CPP GpuMat CudaMem::createGpuMatHeader() const;\r
-\r
-inline bool CudaMem::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; }\r
-inline size_t CudaMem::elemSize() const { return CV_ELEM_SIZE(flags); }\r
-inline size_t CudaMem::elemSize1() const { return CV_ELEM_SIZE1(flags); }\r
-inline int CudaMem::type() const { return CV_MAT_TYPE(flags); }\r
-inline int CudaMem::depth() const { return CV_MAT_DEPTH(flags); }\r
-inline int CudaMem::channels() const { return CV_MAT_CN(flags); }\r
-inline size_t CudaMem::step1() const { return step/elemSize1(); }\r
-inline Size CudaMem::size() const { return Size(cols, rows); }\r
-inline bool CudaMem::empty() const { return data == 0; }\r
-\r
-} /* end of namespace gpu */\r
-\r
-} /* end of namespace cv */\r
-\r
-#endif /* __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ */\r
diff --git a/modules/gpu/perf/perf_arithm.cpp b/modules/gpu/perf/perf_arithm.cpp

index 8e34023..d740388 100644 (file)
--- a/modules/gpu/perf/perf_arithm.cpp
+++ b/modules/gpu/perf/perf_arithm.cpp
@@ -24,7 +24,7 @@ PERF_TEST_P(DevInfo_Size_MatType, transpose, testing::Combine(testing::ValuesIn(
          transpose(src, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -55,7 +55,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, flip, testing::Combine(testing::Value
          flip(src, dst, flipCode);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_Size_MatType, LUT, testing::Combine(testing::ValuesIn(device
          LUT(src, lut, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -115,8 +115,8 @@ PERF_TEST_P(DevInfo_Size, cartToPolar, testing::Combine(testing::ValuesIn(device
          cartToPolar(x, y, magnitude, angle);\r
      }\r
  \r
-    Mat magnitude_host = magnitude;\r
-    Mat angle_host = angle;\r
+    Mat magnitude_host(magnitude);\r
+    Mat angle_host(angle);\r
  \r
      SANITY_CHECK(magnitude_host);\r
      SANITY_CHECK(angle_host);\r
@@ -147,8 +147,8 @@ PERF_TEST_P(DevInfo_Size, polarToCart, testing::Combine(testing::ValuesIn(device
          polarToCart(magnitude, angle, x, y);\r
      }\r
  \r
-    Mat x_host = x;\r
-    Mat y_host = angle;\r
+    Mat x_host(x);\r
+    Mat y_host(y);\r
  \r
      SANITY_CHECK(x_host);\r
      SANITY_CHECK(y_host);\r
@@ -180,7 +180,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addMat, testing::Combine(testing::ValuesIn(dev
          add(a, b, c);\r
      }\r
  \r
-    Mat c_host = c;\r
+    Mat c_host(c);\r
  \r
      SANITY_CHECK(c_host);\r
  }\r
@@ -210,7 +210,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addScalar, testing::Combine(testing::ValuesIn(
          add(a, b, c);\r
      }\r
  \r
-    Mat c_host = c;\r
+    Mat c_host(c);\r
  \r
      SANITY_CHECK(c_host);\r
  }\r
@@ -241,7 +241,7 @@ PERF_TEST_P(DevInfo_Size_MatType, subtractMat, testing::Combine(testing::ValuesI
          subtract(a, b, c);\r
      }\r
  \r
-    Mat c_host = c;\r
+    Mat c_host(c);\r
  \r
      SANITY_CHECK(c_host);\r
  }\r
@@ -270,7 +270,7 @@ PERF_TEST_P(DevInfo_Size, multiplyMat, testing::Combine(testing::ValuesIn(device
          multiply(a, b, c);\r
      }\r
  \r
-    Mat c_host = c;\r
+    Mat c_host(c);\r
  \r
      SANITY_CHECK(c_host);\r
  }\r
@@ -300,7 +300,7 @@ PERF_TEST_P(DevInfo_Size_MatType, multiplyScalar, testing::Combine(testing::Valu
          multiply(a, b, c);\r
      }\r
  \r
-    Mat c_host = c;\r
+    Mat c_host(c);\r
  \r
      SANITY_CHECK(c_host);\r
  }\r
@@ -327,7 +327,7 @@ PERF_TEST_P(DevInfo_Size, exp, testing::Combine(testing::ValuesIn(devices()),
          exp(a, b);\r
      }\r
  \r
-    Mat b_host = b;\r
+    Mat b_host(b);\r
  \r
      SANITY_CHECK(b_host);\r
  }\r
@@ -356,7 +356,7 @@ PERF_TEST_P(DevInfo_Size_MatType, pow, testing::Combine(testing::ValuesIn(device
          pow(src, 2.0, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -389,7 +389,7 @@ PERF_TEST_P(DevInfo_Size_MatType_CmpOp, compare, testing::Combine(testing::Value
          compare(src1, src2, dst, cmpop);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -418,7 +418,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_not, testing::Combine(testing::ValuesI
          bitwise_not(src, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -449,7 +449,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_and, testing::Combine(testing::ValuesI
          bitwise_and(src1, src2, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -480,7 +480,7 @@ PERF_TEST_P(DevInfo_Size_MatType, min, testing::Combine(testing::ValuesIn(device
          min(src1, src2, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -712,7 +712,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addWeighted, testing::Combine(testing::ValuesI
          addWeighted(src1, 0.5, src2, 0.5, 0.0, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -743,7 +743,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::Val
          reduce(src, dst, dim, CV_REDUCE_MIN);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -774,7 +774,7 @@ PERF_TEST_P(DevInfo_Size, gemm, testing::Combine(testing::ValuesIn(devices()),
          gemm(src1, src2, 1.0, src3, 1.0, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
diff --git a/modules/gpu/perf/perf_calib3d.cpp b/modules/gpu/perf/perf_calib3d.cpp

index e84f87b..4ac922e 100644 (file)
--- a/modules/gpu/perf/perf_calib3d.cpp
+++ b/modules/gpu/perf/perf_calib3d.cpp
@@ -20,7 +20,7 @@ PERF_TEST_P(DevInfo, transformPoints, testing::ValuesIn(devices()))
          transformPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -45,7 +45,7 @@ PERF_TEST_P(DevInfo, projectPoints, testing::ValuesIn(devices()))
          projectPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), Mat::ones(3, 3, CV_32FC1), Mat(), dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp

index 55125f3..da813bf 100644 (file)
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@@ -28,7 +28,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing
          filter->apply(src, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
          filter->apply(src, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -96,7 +96,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
          filter->apply(src, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -130,7 +130,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, separableLinearFilter, testing::Com
          filter->apply(src, dst, Rect(0, 0, src.cols, src.rows));\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp

index 3f72641..7b8ffe4 100644 (file)
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -36,7 +36,7 @@ PERF_TEST_P(DevInfo_Size_MatType_Interpolation_BorderMode, remap, testing::Combi
          remap(src, dst, xmap, ymap, interpolation, borderMode);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo, meanShiftFiltering, testing::ValuesIn(devices()))
          meanShiftFiltering(src, dst, 50, 50);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -91,8 +91,8 @@ PERF_TEST_P(DevInfo, meanShiftProc, testing::ValuesIn(devices()))
          meanShiftProc(src, dstr, dstsp, 50, 50);\r
      }\r
  \r
-    Mat dstr_host = dstr;\r
-    Mat dstsp_host = dstsp;\r
+    Mat dstr_host(dstr);\r
+    Mat dstsp_host(dstsp);\r
  \r
      SANITY_CHECK(dstr_host);\r
      SANITY_CHECK(dstsp_host);\r
diff --git a/modules/gpu/perf/perf_matop.cpp b/modules/gpu/perf/perf_matop.cpp

index d1505f5..ba66a0c 100644 (file)
--- a/modules/gpu/perf/perf_matop.cpp
+++ b/modules/gpu/perf/perf_matop.cpp
@@ -25,7 +25,7 @@ PERF_TEST_P(DevInfo_Size_MatType, merge, testing::Combine(testing::ValuesIn(devi
          merge(src, dst);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -82,7 +82,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setTo, testing::Combine(testing::ValuesIn(devi
          src.setTo(val);\r
      }\r
  \r
-    Mat src_host = src;\r
+    Mat src_host(src);\r
  \r
      SANITY_CHECK(src_host);\r
  }\r
@@ -115,7 +115,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setToMasked, testing::Combine(testing::ValuesI
          src.setTo(val, mask);\r
      }\r
  \r
-    src_host = src;\r
+    src.download(src_host);\r
  \r
      SANITY_CHECK(src_host);\r
  }\r
@@ -148,7 +148,7 @@ PERF_TEST_P(DevInfo_Size_MatType, copyToMasked, testing::Combine(testing::Values
          src.copyTo(dst, mask);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
@@ -182,7 +182,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MatType, convertTo, testing::Combine(testing::V
          src.convertTo(dst, type2, a, b);\r
      }\r
  \r
-    Mat dst_host = dst;\r
+    Mat dst_host(dst);\r
  \r
      SANITY_CHECK(dst_host);\r
  }\r
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp

index fabb3df..a47d222 100644 (file)
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -425,16 +425,22 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
  ////////////////////////////////////////////////////////////////////////\r
  // Polar <-> Cart\r
  \r
-namespace cv { namespace gpu { namespace mathfunc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace mathfunc \r
  {\r
-    void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream);\r
-    void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);\r
-}}}\r
+    void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);\r
+    void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
      inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)\r
      {\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;\r
+\r
          CV_DbgAssert(x.size() == y.size() && x.type() == y.type());\r
          CV_Assert(x.depth() == CV_32F);\r
  \r
@@ -448,11 +454,13 @@ namespace
          GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();\r
          GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat();\r
  \r
-        mathfunc::cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);\r
+        cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);\r
      }\r
  \r
      inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)\r
      {\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;\r
+\r
          CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());\r
          CV_Assert(mag.depth() == CV_32F);\r
  \r
@@ -464,34 +472,33 @@ namespace
          GpuMat x1cn = x.reshape(1);\r
          GpuMat y1cn = y.reshape(1);\r
  \r
-        mathfunc::polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);\r
+        polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);\r
      }\r
  }\r
  \r
  void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)\r
  {\r
-    ::cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));\r
+    cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));\r
  }\r
  \r
  void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)\r
  {\r
-    ::cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));\r
+    cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));\r
  }\r
  \r
  void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)\r
  {\r
-    ::cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));\r
+    cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));\r
  }\r
  \r
  void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)\r
  {\r
-    ::cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));\r
+    cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));\r
  }\r
  \r
  void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)\r
  {\r
-    ::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));\r
+    polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));\r
  }\r
  \r
-\r
  #endif /* !defined (HAVE_CUDA) */\r
diff --git a/modules/gpu/src/bilateral_filter.cpp b/modules/gpu/src/bilateral_filter.cpp

index bc2bec2..12c159a 100644 (file)
--- a/modules/gpu/src/bilateral_filter.cpp
+++ b/modules/gpu/src/bilateral_filter.cpp
@@ -55,13 +55,19 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&,
  \r
  #else /* !defined (HAVE_CUDA) */\r
  \r
-namespace cv { namespace gpu { namespace bf \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace bilateral_filter\r
  {\r
-    void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc);\r
+    void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);\r
+\r
+    void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);\r
+    void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
-    void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream);\r
-    void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream);\r
-}}}\r
+using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;\r
  \r
  namespace\r
  {\r
@@ -105,7 +111,7 @@ namespace
          short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5));\r
          short max_disc = short(ndisp * max_disc_threshold + 0.5);\r
  \r
-        bf::load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);\r
+        load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);\r
  \r
          if (&dst != &disp)\r
          {\r
@@ -115,7 +121,7 @@ namespace
                  disp.copyTo(dst);\r
          }\r
  \r
-        bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));\r
+        bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));\r
      }\r
  \r
      typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, \r
diff --git a/modules/gpu/src/blend.cpp b/modules/gpu/src/blend.cpp

index 54345f6..4c4afc5 100644 (file)
--- a/modules/gpu/src/blend.cpp
+++ b/modules/gpu/src/blend.cpp
@@ -52,15 +52,19 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu
  \r
  #else\r
  \r
-namespace cv { namespace gpu \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace blend\r
  {\r
      template <typename T>\r
-    void blendLinearCaller(int rows, int cols, int cn, const PtrStep<T>& img1, const PtrStep<T>& img2, \r
-                           const PtrStepf& weights1, const PtrStepf& weights2, PtrStep<T> result, cudaStream_t stream);\r
+    void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);\r
+\r
+    void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
-    void blendLinearCaller8UC4(int rows, int cols, const PtrStepb& img1, const PtrStepb& img2, \r
-                               const PtrStepf& weights1, const PtrStepf& weights2, PtrStepb result, cudaStream_t stream);\r
-}}\r
+using namespace OPENCV_DEVICE_NAMESPACE_ blend;\r
  \r
  void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, \r
                            GpuMat& result, Stream& stream)\r
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp

index 4f9bd5c..1d93146 100644 (file)
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -82,7 +82,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec
  \r
  #else /* !defined (HAVE_CUDA) */\r
  \r
-namespace cv { namespace gpu { namespace bf_match\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace bf_match\r
  {\r
      template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
          const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
@@ -103,9 +105,9 @@ namespace cv { namespace gpu { namespace bf_match
      template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
          const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,\r
          int cc, cudaStream_t stream);\r
-}}}\r
+}\r
  \r
-namespace cv { namespace gpu { namespace bf_knnmatch\r
+namespace bf_knnmatch\r
  {\r
      template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
          const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
@@ -126,9 +128,9 @@ namespace cv { namespace gpu { namespace bf_knnmatch
      template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
          const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
          int cc, cudaStream_t stream);\r
-}}}\r
+}\r
  \r
-namespace cv { namespace gpu { namespace bf_radius_match \r
+namespace bf_radius_match \r
  {\r
      template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
          const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
@@ -151,15 +153,17 @@ namespace cv { namespace gpu { namespace bf_radius_match
      template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
          const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
          int cc, cudaStream_t stream);\r
-}}}\r
-\r
-cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)\r
-{\r
  }\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
  ////////////////////////////////////////////////////////////////////\r
  // Train collection\r
  \r
+cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)\r
+{\r
+}\r
+\r
  void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection)\r
  {\r
      trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());\r
@@ -195,7 +199,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
      if (query.empty() || train.empty())\r
          return;\r
  \r
-    using namespace cv::gpu::bf_match;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;\r
  \r
      typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
                               const DevMem2Di& trainIdx, const DevMem2Df& distance,\r
@@ -242,8 +246,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
      if (trainIdx.empty() || distance.empty())\r
          return;\r
  \r
-    Mat trainIdxCPU = trainIdx;\r
-    Mat distanceCPU = distance;\r
+    Mat trainIdxCPU(trainIdx);\r
+    Mat distanceCPU(distance);\r
  \r
      matchConvert(trainIdxCPU, distanceCPU, matches);\r
  }\r
@@ -337,7 +341,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
      if (query.empty() || trainCollection.empty())\r
          return;\r
  \r
-    using namespace cv::gpu::bf_match;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;\r
  \r
      typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
                               const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
@@ -384,9 +388,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
      if (trainIdx.empty() || imgIdx.empty() || distance.empty())\r
          return;\r
  \r
-    Mat trainIdxCPU = trainIdx;\r
-    Mat imgIdxCPU = imgIdx;\r
-    Mat distanceCPU = distance;\r
+    Mat trainIdxCPU(trainIdx);\r
+    Mat imgIdxCPU(imgIdx);\r
+    Mat distanceCPU(distance);\r
  \r
      matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);\r
  }\r
@@ -448,7 +452,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
      if (query.empty() || train.empty())\r
          return;\r
  \r
-    using namespace cv::gpu::bf_knnmatch;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;\r
  \r
      typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
                               const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
@@ -511,8 +515,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
      if (trainIdx.empty() || distance.empty())\r
          return;\r
  \r
-    Mat trainIdxCPU = trainIdx;\r
-    Mat distanceCPU = distance;\r
+    Mat trainIdxCPU(trainIdx);\r
+    Mat distanceCPU(distance);\r
  \r
      knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);\r
  }\r
@@ -577,7 +581,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
      if (query.empty() || trainCollection.empty())\r
          return;\r
  \r
-    using namespace cv::gpu::bf_knnmatch;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;\r
  \r
      typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
                               const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
@@ -630,9 +634,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainI
      if (trainIdx.empty() || imgIdx.empty() || distance.empty())\r
          return;\r
  \r
-    Mat trainIdxCPU = trainIdx;\r
-    Mat imgIdxCPU = imgIdx;\r
-    Mat distanceCPU = distance;\r
+    Mat trainIdxCPU(trainIdx);\r
+    Mat imgIdxCPU(imgIdx);\r
+    Mat distanceCPU(distance);\r
  \r
      knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);\r
  }\r
@@ -758,7 +762,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
      if (query.empty() || train.empty())\r
          return;\r
  \r
-    using namespace cv::gpu::bf_radius_match;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;\r
  \r
      typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
                               const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
@@ -819,9 +823,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
      if (trainIdx.empty() || distance.empty() || nMatches.empty())\r
          return;\r
  \r
-    Mat trainIdxCPU = trainIdx;\r
-    Mat distanceCPU = distance;\r
-    Mat nMatchesCPU = nMatches;\r
+    Mat trainIdxCPU(trainIdx);\r
+    Mat distanceCPU(distance);\r
+    Mat nMatchesCPU(nMatches);\r
  \r
      radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);\r
  }\r
@@ -889,7 +893,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
      if (query.empty() || empty())\r
          return;\r
  \r
-    using namespace cv::gpu::bf_radius_match;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;\r
  \r
      typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
                               const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
@@ -953,10 +957,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
      if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())\r
          return;\r
  \r
-    Mat trainIdxCPU = trainIdx;\r
-    Mat imgIdxCPU = imgIdx;\r
-    Mat distanceCPU = distance;\r
-    Mat nMatchesCPU = nMatches;\r
+    Mat trainIdxCPU(trainIdx);\r
+    Mat imgIdxCPU(imgIdx);\r
+    Mat distanceCPU(distance);\r
+    Mat nMatchesCPU(nMatches);\r
  \r
      radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);\r
  }\r
diff --git a/modules/gpu/src/calib3d.cpp b/modules/gpu/src/calib3d.cpp

index 301ea81..8e6e838 100644 (file)
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@ -42,6 +42,10 @@
  \r
  #include "precomp.hpp"\r
  \r
+using namespace cv;\r
+using namespace cv::gpu;\r
+using namespace std;\r
+\r
  #if !defined(HAVE_CUDA)\r
  \r
  void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }\r
@@ -52,13 +56,31 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat
  \r
  #else\r
  \r
-using namespace cv;\r
-using namespace cv::gpu;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace transform_points \r
+namespace transform_points \r
  {\r
      void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+namespace project_points \r
+{\r
+    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);\r
+}\r
+\r
+namespace solve_pnp_ransac\r
+{\r
+    int maxNumIters();\r
+\r
+    void computeHypothesisScores(\r
+            const int num_hypotheses, const int num_points, const float* rot_matrices,\r
+            const float3* transl_vectors, const float3* object, const float2* image,\r
+            const float dist_threshold, int* hypothesis_scores);\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
+using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
  namespace\r
  {\r
@@ -79,15 +101,9 @@ namespace
  \r
  void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)\r
  {\r
-    ::transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));\r
+    transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));\r
  }\r
  \r
-namespace cv { namespace gpu { namespace project_points \r
-{\r
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);\r
-}}}\r
-\r
-\r
  namespace\r
  {\r
      void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)\r
@@ -109,20 +125,9 @@ namespace
  \r
  void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)\r
  {\r
-    ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));\r
+    projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));\r
  }\r
  \r
-\r
-namespace cv { namespace gpu { namespace solve_pnp_ransac\r
-{\r
-    int maxNumIters();\r
-\r
-    void computeHypothesisScores(\r
-            const int num_hypotheses, const int num_points, const float* rot_matrices,\r
-            const float3* transl_vectors, const float3* object, const float2* image,\r
-            const float dist_threshold, int* hypothesis_scores);\r
-}}}\r
-\r
  namespace\r
  {\r
      // Selects subset_size random different points from [0, num_points - 1] range\r
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp

index 0af5fa2..a6b7da9 100644 (file)
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -46,7 +46,6 @@ using namespace cv;
  using namespace cv::gpu;\r
  using namespace std;\r
  \r
-\r
  #if !defined (HAVE_CUDA)\r
  \r
  cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU()  { throw_nogpu(); }\r
diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp

index 69b0030..c4f8b60 100644 (file)
--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
@@ -51,155 +51,158 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(
  \r
  #else /* !defined (HAVE_CUDA) */\r
  \r
-namespace cv { namespace gpu {  namespace device  \r
-{\r
-    #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \\r
-        void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \\r
-        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \\r
-        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \\r
-        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)\r
-\r
-    #define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \\r
-        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \\r
-        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \\r
-        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \\r
-        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)\r
-\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)\r
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)\r
-\r
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE\r
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL\r
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F\r
-}}}\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \\r
+    void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \\r
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \\r
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \\r
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)\r
+\r
+#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \\r
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \\r
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \\r
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \\r
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)\r
+\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)\r
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)\r
+\r
+#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE\r
+#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL\r
+#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
+using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
  namespace\r
  {\r
diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu

index f53af9e..c8b1171 100644 (file)
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -45,1115 +45,1117 @@
  #include "opencv2/gpu/device/vec_distance.hpp"\r
  #include "opencv2/gpu/device/datamov_utils.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace bf_knnmatch\r
-{\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Reduction\r
+namespace bf_knnmatch {\r
  \r
-    template <int BLOCK_SIZE> \r
-    __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
-                                  int& bestTrainIdx1, int& bestTrainIdx2, \r
-                                  float* s_distance, int* s_trainIdx)\r
-    {\r
-        float myBestDistance1 = numeric_limits<float>::max(); \r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Reduction\r
+\r
+template <int BLOCK_SIZE> \r
+__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
+                              int& bestTrainIdx1, int& bestTrainIdx2, \r
+                              float* s_distance, int* s_trainIdx)\r
+{\r
+    float myBestDistance1 = numeric_limits<float>::max(); \r
+    float myBestDistance2 = numeric_limits<float>::max();\r
+    int myBestTrainIdx1 = -1;\r
+    int myBestTrainIdx2 = -1;\r
  \r
-        s_distance += threadIdx.y * BLOCK_SIZE;\r
-        s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
+    s_distance += threadIdx.y * BLOCK_SIZE;\r
+    s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
  \r
-        s_distance[threadIdx.x] = bestDistance1;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
+    s_distance[threadIdx.x] = bestDistance1;\r
+    s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        if (threadIdx.x == 0)\r
+    if (threadIdx.x == 0)\r
+    {\r
+        #pragma unroll\r
+        for (int i = 0; i < BLOCK_SIZE; ++i)\r
          {\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_SIZE; ++i)\r
+            float val = s_distance[i];\r
+\r
+            if (val < myBestDistance1)\r
+            {\r
+                myBestDistance2 = myBestDistance1;\r
+                myBestTrainIdx2 = myBestTrainIdx1;\r
+\r
+                myBestDistance1 = val;\r
+                myBestTrainIdx1 = s_trainIdx[i];\r
+            }\r
+            else if (val < myBestDistance2)\r
              {\r
-                float val = s_distance[i];\r
-\r
-                if (val < myBestDistance1)\r
-                {\r
-                    myBestDistance2 = myBestDistance1;\r
-                    myBestTrainIdx2 = myBestTrainIdx1;\r
-\r
-                    myBestDistance1 = val;\r
-                    myBestTrainIdx1 = s_trainIdx[i];\r
-                }\r
-                else if (val < myBestDistance2)\r
-                {\r
-                    myBestDistance2 = val;\r
-                    myBestTrainIdx2 = s_trainIdx[i];\r
-                }\r
+                myBestDistance2 = val;\r
+                myBestTrainIdx2 = s_trainIdx[i];\r
              }\r
          }\r
+    }\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        s_distance[threadIdx.x] = bestDistance2;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
+    s_distance[threadIdx.x] = bestDistance2;\r
+    s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        if (threadIdx.x == 0)\r
+    if (threadIdx.x == 0)\r
+    {\r
+        #pragma unroll\r
+        for (int i = 0; i < BLOCK_SIZE; ++i)\r
          {\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_SIZE; ++i)\r
-            {\r
-                float val = s_distance[i];\r
+            float val = s_distance[i];\r
  \r
-                if (val < myBestDistance2)\r
-                {\r
-                    myBestDistance2 = val;\r
-                    myBestTrainIdx2 = s_trainIdx[i];\r
-                }\r
+            if (val < myBestDistance2)\r
+            {\r
+                myBestDistance2 = val;\r
+                myBestTrainIdx2 = s_trainIdx[i];\r
              }\r
          }\r
+    }\r
  \r
-        bestDistance1 = myBestDistance1;\r
-        bestDistance2 = myBestDistance2;\r
+    bestDistance1 = myBestDistance1;\r
+    bestDistance2 = myBestDistance2;\r
  \r
-        bestTrainIdx1 = myBestTrainIdx1;\r
-        bestTrainIdx2 = myBestTrainIdx2;\r
-    }\r
+    bestTrainIdx1 = myBestTrainIdx1;\r
+    bestTrainIdx2 = myBestTrainIdx2;\r
+}\r
  \r
-    template <int BLOCK_SIZE> \r
-    __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
-                                   int& bestTrainIdx1, int& bestTrainIdx2, \r
-                                   int& bestImgIdx1, int& bestImgIdx2, \r
-                                   float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
-    {\r
-        float myBestDistance1 = numeric_limits<float>::max(); \r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-        int myBestImgIdx1 = -1;\r
-        int myBestImgIdx2 = -1;\r
+template <int BLOCK_SIZE> \r
+__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
+                               int& bestTrainIdx1, int& bestTrainIdx2, \r
+                               int& bestImgIdx1, int& bestImgIdx2, \r
+                               float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
+{\r
+    float myBestDistance1 = numeric_limits<float>::max(); \r
+    float myBestDistance2 = numeric_limits<float>::max();\r
+    int myBestTrainIdx1 = -1;\r
+    int myBestTrainIdx2 = -1;\r
+    int myBestImgIdx1 = -1;\r
+    int myBestImgIdx2 = -1;\r
  \r
-        s_distance += threadIdx.y * BLOCK_SIZE;\r
-        s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
-        s_imgIdx   += threadIdx.y * BLOCK_SIZE;\r
+    s_distance += threadIdx.y * BLOCK_SIZE;\r
+    s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
+    s_imgIdx   += threadIdx.y * BLOCK_SIZE;\r
  \r
-        s_distance[threadIdx.x] = bestDistance1;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
-        s_imgIdx[threadIdx.x]   = bestImgIdx1;\r
+    s_distance[threadIdx.x] = bestDistance1;\r
+    s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
+    s_imgIdx[threadIdx.x]   = bestImgIdx1;\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        if (threadIdx.x == 0)\r
+    if (threadIdx.x == 0)\r
+    {\r
+        #pragma unroll\r
+        for (int i = 0; i < BLOCK_SIZE; ++i)\r
          {\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_SIZE; ++i)\r
+            float val = s_distance[i];\r
+\r
+            if (val < myBestDistance1)\r
+            {\r
+                myBestDistance2 = myBestDistance1;\r
+                myBestTrainIdx2 = myBestTrainIdx1;\r
+                myBestImgIdx2   = myBestImgIdx1;\r
+\r
+                myBestDistance1 = val;\r
+                myBestTrainIdx1 = s_trainIdx[i];\r
+                myBestImgIdx1   = s_imgIdx[i];\r
+            }\r
+            else if (val < myBestDistance2)\r
              {\r
-                float val = s_distance[i];\r
-\r
-                if (val < myBestDistance1)\r
-                {\r
-                    myBestDistance2 = myBestDistance1;\r
-                    myBestTrainIdx2 = myBestTrainIdx1;\r
-                    myBestImgIdx2   = myBestImgIdx1;\r
-\r
-                    myBestDistance1 = val;\r
-                    myBestTrainIdx1 = s_trainIdx[i];\r
-                    myBestImgIdx1   = s_imgIdx[i];\r
-                }\r
-                else if (val < myBestDistance2)\r
-                {\r
-                    myBestDistance2 = val;\r
-                    myBestTrainIdx2 = s_trainIdx[i];\r
-                    myBestImgIdx2   = s_imgIdx[i];\r
-                }\r
+                myBestDistance2 = val;\r
+                myBestTrainIdx2 = s_trainIdx[i];\r
+                myBestImgIdx2   = s_imgIdx[i];\r
              }\r
          }\r
+    }\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        s_distance[threadIdx.x] = bestDistance2;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
-        s_imgIdx[threadIdx.x]   = bestImgIdx2;\r
+    s_distance[threadIdx.x] = bestDistance2;\r
+    s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
+    s_imgIdx[threadIdx.x]   = bestImgIdx2;\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        if (threadIdx.x == 0)\r
+    if (threadIdx.x == 0)\r
+    {\r
+        #pragma unroll\r
+        for (int i = 0; i < BLOCK_SIZE; ++i)\r
          {\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_SIZE; ++i)\r
+            float val = s_distance[i];\r
+\r
+            if (val < myBestDistance2)\r
              {\r
-                float val = s_distance[i];\r
-\r
-                if (val < myBestDistance2)\r
-                {\r
-                    myBestDistance2 = val;\r
-                    myBestTrainIdx2 = s_trainIdx[i];\r
-                    myBestImgIdx2   = s_imgIdx[i];\r
-                }\r
+                myBestDistance2 = val;\r
+                myBestTrainIdx2 = s_trainIdx[i];\r
+                myBestImgIdx2   = s_imgIdx[i];\r
              }\r
          }\r
+    }\r
  \r
-        bestDistance1 = myBestDistance1;\r
-        bestDistance2 = myBestDistance2;\r
+    bestDistance1 = myBestDistance1;\r
+    bestDistance2 = myBestDistance2;\r
  \r
-        bestTrainIdx1 = myBestTrainIdx1;\r
-        bestTrainIdx2 = myBestTrainIdx2;\r
+    bestTrainIdx1 = myBestTrainIdx1;\r
+    bestTrainIdx2 = myBestTrainIdx2;\r
  \r
-        bestImgIdx1 = myBestImgIdx1;\r
-        bestImgIdx2 = myBestImgIdx2;\r
-    }\r
+    bestImgIdx1 = myBestImgIdx1;\r
+    bestImgIdx2 = myBestImgIdx2;\r
+}\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match Unrolled Cached\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match Unrolled Cached\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
-    __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
+__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
+{\r
+    #pragma unroll\r
+    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
      {\r
+        const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+        s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;\r
+    }\r
+}\r
+\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+                                   typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+                                   float& bestDistance1, float& bestDistance2, \r
+                                   int& bestTrainIdx1, int& bestTrainIdx2, \r
+                                   int& bestImgIdx1, int& bestImgIdx2)\r
+{\r
+    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
+    {\r
+        Dist dist;\r
+\r
          #pragma unroll\r
          for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
          {\r
              const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-            s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;\r
-        }\r
-    }\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                                       typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                                       float& bestDistance1, float& bestDistance2, \r
-                                       int& bestTrainIdx1, int& bestTrainIdx2, \r
-                                       int& bestImgIdx1, int& bestImgIdx2)\r
-    {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
-        {\r
-            Dist dist;\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
  \r
-            #pragma unroll\r
-            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+            if (loadX < train.cols)\r
              {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+                T val;\r
  \r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-\r
-                if (loadX < train.cols)\r
-                {\r
-                    T val;\r
+                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+            }\r
  \r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
+            __syncthreads();\r
  \r
-                __syncthreads();\r
+            #pragma unroll\r
+            for (int j = 0; j < BLOCK_SIZE; ++j)\r
+                dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
  \r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+            __syncthreads();\r
+        }\r
  \r
-                __syncthreads();\r
-            }\r
+        typename Dist::result_type distVal = dist;\r
  \r
-            typename Dist::result_type distVal = dist;\r
+        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
  \r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+        {\r
+            if (distVal < bestDistance1)\r
+            {\r
+                bestImgIdx2   = bestImgIdx1;\r
+                bestDistance2 = bestDistance1;\r
+                bestTrainIdx2 = bestTrainIdx1;\r
  \r
-            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+                bestImgIdx1   = imgIdx;\r
+                bestDistance1 = distVal;\r
+                bestTrainIdx1 = trainIdx;\r
+            }\r
+            else if (distVal < bestDistance2)\r
              {\r
-                if (distVal < bestDistance1)\r
-                {\r
-                    bestImgIdx2   = bestImgIdx1;\r
-                    bestDistance2 = bestDistance1;\r
-                    bestTrainIdx2 = bestTrainIdx1;\r
-\r
-                    bestImgIdx1   = imgIdx;\r
-                    bestDistance1 = distVal;\r
-                    bestTrainIdx1 = trainIdx;\r
-                }\r
-                else if (distVal < bestDistance2)\r
-                {\r
-                    bestImgIdx2   = imgIdx;\r
-                    bestDistance2 = distVal;\r
-                    bestTrainIdx2 = trainIdx;\r
-                }\r
+                bestImgIdx2   = imgIdx;\r
+                bestDistance2 = distVal;\r
+                bestTrainIdx2 = trainIdx;\r
              }\r
          }\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
  \r
-        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
+    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
  \r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
+    float myBestDistance1 = numeric_limits<float>::max();\r
+    float myBestDistance2 = numeric_limits<float>::max();\r
+    int myBestTrainIdx1 = -1;\r
+    int myBestTrainIdx2 = -1;\r
  \r
-        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
+    loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                             const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
-                             cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+                         const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
+                         cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
  \r
-        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
+    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
  \r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-        int myBestImgIdx1 = -1;\r
-        int myBestImgIdx2 = -1;\r
+    float myBestDistance1 = numeric_limits<float>::max();\r
+    float myBestDistance2 = numeric_limits<float>::max();\r
+    int myBestTrainIdx1 = -1;\r
+    int myBestTrainIdx2 = -1;\r
+    int myBestImgIdx1 = -1;\r
+    int myBestImgIdx2 = -1;\r
  \r
-        Mask m = mask;\r
+    Mask m = mask;\r
  \r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
-        }\r
+    for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+    {\r
+        const DevMem2D_<T> train = trains[imgIdx];\r
+        m.next();\r
+        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
+    }\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
+        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                             const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
-                             cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+                         const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
+                         cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match Unrolled\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match Unrolled\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                                 typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                                 float& bestDistance1, float& bestDistance2, \r
-                                 int& bestTrainIdx1, int& bestTrainIdx2, \r
-                                 int& bestImgIdx1, int& bestImgIdx2)\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+                             typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+                             float& bestDistance1, float& bestDistance2, \r
+                             int& bestTrainIdx1, int& bestTrainIdx2, \r
+                             int& bestImgIdx1, int& bestImgIdx2)\r
+{\r
+    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
      {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
+        Dist dist;\r
+\r
+        #pragma unroll\r
+        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
          {\r
-            Dist dist;\r
+            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
  \r
-            #pragma unroll\r
-            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
-            {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
  \r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+            if (loadX < query.cols)\r
+            {\r
+                T val;\r
  \r
-                if (loadX < query.cols)\r
-                {\r
-                    T val;\r
+                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
  \r
-                    ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val);\r
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+            }\r
  \r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
+            __syncthreads();\r
  \r
-                __syncthreads();\r
+            #pragma unroll\r
+            for (int j = 0; j < BLOCK_SIZE; ++j)\r
+                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
  \r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+            __syncthreads();\r
+        }\r
  \r
-                __syncthreads();\r
-            }\r
+        typename Dist::result_type distVal = dist;\r
  \r
-            typename Dist::result_type distVal = dist;\r
+        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
  \r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+        {\r
+            if (distVal < bestDistance1)\r
+            {\r
+                bestImgIdx2   = bestImgIdx1;\r
+                bestDistance2 = bestDistance1;\r
+                bestTrainIdx2 = bestTrainIdx1;\r
  \r
-            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+                bestImgIdx1   = imgIdx;\r
+                bestDistance1 = distVal;\r
+                bestTrainIdx1 = trainIdx;\r
+            }\r
+            else if (distVal < bestDistance2)\r
              {\r
-                if (distVal < bestDistance1)\r
-                {\r
-                    bestImgIdx2   = bestImgIdx1;\r
-                    bestDistance2 = bestDistance1;\r
-                    bestTrainIdx2 = bestTrainIdx1;\r
-\r
-                    bestImgIdx1   = imgIdx;\r
-                    bestDistance1 = distVal;\r
-                    bestTrainIdx1 = trainIdx;\r
-                }\r
-                else if (distVal < bestDistance2)\r
-                {\r
-                    bestImgIdx2   = imgIdx;\r
-                    bestDistance2 = distVal;\r
-                    bestTrainIdx2 = trainIdx;\r
-                }\r
+                bestImgIdx2   = imgIdx;\r
+                bestDistance2 = distVal;\r
+                bestTrainIdx2 = trainIdx;\r
              }\r
          }\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
+    float myBestDistance1 = numeric_limits<float>::max();\r
+    float myBestDistance2 = numeric_limits<float>::max();\r
+    int myBestTrainIdx1 = -1;\r
+    int myBestTrainIdx2 = -1;\r
  \r
-        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
+    loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                       const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
-                       cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+                   const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
+                   cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-        int myBestImgIdx1 = -1;\r
-        int myBestImgIdx2 = -1;\r
+    float myBestDistance1 = numeric_limits<float>::max();\r
+    float myBestDistance2 = numeric_limits<float>::max();\r
+    int myBestTrainIdx1 = -1;\r
+    int myBestTrainIdx2 = -1;\r
+    int myBestImgIdx1 = -1;\r
+    int myBestImgIdx2 = -1;\r
  \r
-        Mask m = mask;\r
+    Mask m = mask;\r
  \r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
-        }\r
+    for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+    {\r
+        const DevMem2D_<T> train = trains[imgIdx];\r
+        m.next();\r
+        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
+    }\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
+        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                       const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
-                       cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+                   const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
+                   cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                         typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                         float& bestDistance1, float& bestDistance2, \r
-                         int& bestTrainIdx1, int& bestTrainIdx2, \r
-                         int& bestImgIdx1, int& bestImgIdx2)\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+                     typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+                     float& bestDistance1, float& bestDistance2, \r
+                     int& bestTrainIdx1, int& bestTrainIdx2, \r
+                     int& bestImgIdx1, int& bestImgIdx2)\r
+{\r
+    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
      {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
+        Dist dist;\r
+\r
+        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
          {\r
-            Dist dist;\r
+            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
  \r
-            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
-            {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
  \r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+            if (loadX < query.cols)\r
+            {\r
+                T val;\r
  \r
-                if (loadX < query.cols)\r
-                {\r
-                    T val;\r
+                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
  \r
-                    ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val);\r
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+            }\r
  \r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
+            __syncthreads();\r
  \r
-                __syncthreads();\r
+            #pragma unroll\r
+            for (int j = 0; j < BLOCK_SIZE; ++j)\r
+                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
  \r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+            __syncthreads();\r
+        }\r
  \r
-                __syncthreads();\r
-            }\r
+        typename Dist::result_type distVal = dist;\r
  \r
-            typename Dist::result_type distVal = dist;\r
+        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
  \r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+        {\r
+            if (distVal < bestDistance1)\r
+            {\r
+                bestImgIdx2   = bestImgIdx1;\r
+                bestDistance2 = bestDistance1;\r
+                bestTrainIdx2 = bestTrainIdx1;\r
  \r
-            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
+                bestImgIdx1   = imgIdx;\r
+                bestDistance1 = distVal;\r
+                bestTrainIdx1 = trainIdx;\r
+            }\r
+            else if (distVal < bestDistance2)\r
              {\r
-                if (distVal < bestDistance1)\r
-                {\r
-                    bestImgIdx2   = bestImgIdx1;\r
-                    bestDistance2 = bestDistance1;\r
-                    bestTrainIdx2 = bestTrainIdx1;\r
-\r
-                    bestImgIdx1   = imgIdx;\r
-                    bestDistance1 = distVal;\r
-                    bestTrainIdx1 = trainIdx;\r
-                }\r
-                else if (distVal < bestDistance2)\r
-                {\r
-                    bestImgIdx2   = imgIdx;\r
-                    bestDistance2 = distVal;\r
-                    bestTrainIdx2 = trainIdx;\r
-                }\r
+                bestImgIdx2   = imgIdx;\r
+                bestDistance2 = distVal;\r
+                bestTrainIdx2 = trainIdx;\r
              }\r
          }\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
+    float myBestDistance1 = numeric_limits<float>::max();\r
+    float myBestDistance2 = numeric_limits<float>::max();\r
+    int myBestTrainIdx1 = -1;\r
+    int myBestTrainIdx2 = -1;\r
  \r
-        loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
+    loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-               const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
-               cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+           const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
+           cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-        int myBestImgIdx1 = -1;\r
-        int myBestImgIdx2 = -1;\r
+    float myBestDistance1 = numeric_limits<float>::max();\r
+    float myBestDistance2 = numeric_limits<float>::max();\r
+    int myBestTrainIdx1 = -1;\r
+    int myBestTrainIdx2 = -1;\r
+    int myBestImgIdx1 = -1;\r
+    int myBestImgIdx2 = -1;\r
  \r
-        Mask m = mask;\r
+    Mask m = mask;\r
  \r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
-        }\r
+    for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+    {\r
+        const DevMem2D_<T> train = trains[imgIdx];\r
+        m.next();\r
+        loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
+    }\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
+        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
+        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-               const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
-               cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+           const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
+           cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // knnMatch 2 dispatcher\r
+///////////////////////////////////////////////////////////////////////////////\r
+// knnMatch 2 dispatcher\r
  \r
-    template <typename Dist, typename T, typename Mask> \r
-    void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                          const DevMem2Db& trainIdx, const DevMem2Db& distance, \r
-                          int cc, cudaStream_t stream)\r
+template <typename Dist, typename T, typename Mask> \r
+void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+                      const DevMem2Db& trainIdx, const DevMem2Db& distance, \r
+                      int cc, cudaStream_t stream)\r
+{\r
+    if (query.cols <= 64)\r
      {\r
-        if (query.cols <= 64)\r
-        {\r
-            matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }*/\r
-        else\r
-        {\r
-            match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
+        matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
      }\r
-\r
-    template <typename Dist, typename T, typename Mask> \r
-    void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                          const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
-                          int cc, cudaStream_t stream)\r
+    else if (query.cols <= 128)\r
      {\r
-        if (query.cols <= 64)\r
-        {\r
-            matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }*/\r
-        else\r
-        {\r
-            match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
+        matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
      }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Calc distance kernel\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
-    __global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
+    /*else if (query.cols <= 256)\r
      {\r
-        extern __shared__ int smem[];\r
+        matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }\r
+    else if (query.cols <= 512)\r
+    {            \r
+        matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }\r
+    else if (query.cols <= 1024)\r
+    {            \r
+        matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }*/\r
+    else\r
+    {\r
+        match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }\r
+}\r
  \r
-        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
-        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
+template <typename Dist, typename T, typename Mask> \r
+void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+                      const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+                      int cc, cudaStream_t stream)\r
+{\r
+    if (query.cols <= 64)\r
+    {\r
+        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }\r
+    else if (query.cols <= 128)\r
+    {\r
+        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }\r
+    /*else if (query.cols <= 256)\r
+    {\r
+        matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }\r
+    else if (query.cols <= 512)\r
+    {            \r
+        matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }\r
+    else if (query.cols <= 1024)\r
+    {            \r
+        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }*/\r
+    else\r
+    {\r
+        match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
+    }\r
+}\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Calc distance kernel\r
  \r
-        Dist dist;\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
+__global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        #pragma unroll\r
-        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
-        {\r
-            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
+    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
  \r
-            if (loadX < query.cols)\r
-            {\r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
-            }\r
-            else\r
-            {                \r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-            }\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-            __syncthreads();\r
+    Dist dist;\r
  \r
-            #pragma unroll\r
-            for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+    #pragma unroll\r
+    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+    {\r
+        const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
  \r
-            __syncthreads();\r
+        if (loadX < query.cols)\r
+        {\r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
+        }\r
+        else\r
+        {                \r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
          }\r
  \r
-        if (queryIdx < query.rows && trainIdx < train.rows)\r
-        {\r
-            float distVal = numeric_limits<float>::max();\r
+        __syncthreads();\r
  \r
-            if (mask(queryIdx, trainIdx))\r
-                distVal = (typename Dist::result_type)dist;\r
+        #pragma unroll\r
+        for (int j = 0; j < BLOCK_SIZE; ++j)\r
+            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
  \r
-            allDist.ptr(queryIdx)[trainIdx] = distVal;\r
-        }\r
+        __syncthreads();\r
      }\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
+    if (queryIdx < query.rows && trainIdx < train.rows)\r
      {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+        float distVal = numeric_limits<float>::max();\r
  \r
-        calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
-        cudaSafeCall( cudaGetLastError() );\r
+        if (mask(queryIdx, trainIdx))\r
+            distVal = (typename Dist::result_type)dist;\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+        allDist.ptr(queryIdx)[trainIdx] = distVal;\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
-    __global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
-        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        Dist dist;\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
-        {\r
-            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
+__global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-            if (loadX < query.cols)\r
-            {\r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
-            }\r
-            else\r
-            {                \r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-            }\r
+    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
+    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
  \r
-            __syncthreads();\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-            #pragma unroll\r
-            for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+    Dist dist;\r
  \r
-            __syncthreads();\r
-        }\r
+    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
+    {\r
+        const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
  \r
-        if (queryIdx < query.rows && trainIdx < train.rows)\r
+        if (loadX < query.cols)\r
          {\r
-            float distVal = numeric_limits<float>::max();\r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
+        }\r
+        else\r
+        {                \r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+        }\r
  \r
-            if (mask(queryIdx, trainIdx))\r
-                distVal = (typename Dist::result_type)dist;\r
+        __syncthreads();\r
  \r
-            allDist.ptr(queryIdx)[trainIdx] = distVal;\r
-        }\r
+        #pragma unroll\r
+        for (int j = 0; j < BLOCK_SIZE; ++j)\r
+            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+\r
+        __syncthreads();\r
      }\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
+    if (queryIdx < query.rows && trainIdx < train.rows)\r
      {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+        float distVal = numeric_limits<float>::max();\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
-        cudaSafeCall( cudaGetLastError() );\r
+        if (mask(queryIdx, trainIdx))\r
+            distVal = (typename Dist::result_type)dist;\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+        allDist.ptr(queryIdx)[trainIdx] = distVal;\r
      }\r
+}\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Calc Distance dispatcher\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
  \r
-    template <typename Dist, typename T, typename Mask> \r
-    void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                                const DevMem2Df& allDist, \r
-                                int cc, cudaStream_t stream)\r
-    {\r
-        if (query.cols <= 64)\r
-        {\r
-            calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);\r
-        }*/\r
-        else\r
-        {\r
-            calcDistance<16, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-    }\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // find knn match kernel\r
+    calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-    template <int BLOCK_SIZE> \r
-    __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)\r
-    {\r
-        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;\r
-        __shared__ float s_dist[SMEM_SIZE];\r
-        __shared__ int s_trainIdx[SMEM_SIZE];\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-        const int queryIdx = blockIdx.x;\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Calc Distance dispatcher\r
  \r
-        float* allDistRow = allDist.ptr(queryIdx);\r
+template <typename Dist, typename T, typename Mask> \r
+void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+                            const DevMem2Df& allDist, \r
+                            int cc, cudaStream_t stream)\r
+{\r
+    if (query.cols <= 64)\r
+    {\r
+        calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);\r
+    }\r
+    else if (query.cols <= 128)\r
+    {\r
+        calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);\r
+    }\r
+    /*else if (query.cols <= 256)\r
+    {\r
+        calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);\r
+    }\r
+    else if (query.cols <= 512)\r
+    {            \r
+        calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);\r
+    }\r
+    else if (query.cols <= 1024)\r
+    {            \r
+        calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);\r
+    }*/\r
+    else\r
+    {\r
+        calcDistance<16, Dist>(query, train, mask, allDist, stream);\r
+    }\r
+}\r
  \r
-        float dist = numeric_limits<float>::max();\r
-        int bestIdx = -1;\r
-        \r
-        for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)\r
-        {\r
-            float reg = allDistRow[i];\r
-            if (reg < dist)\r
-            {\r
-                dist = reg;\r
-                bestIdx = i;\r
-            }\r
-        }\r
+///////////////////////////////////////////////////////////////////////////////\r
+// find knn match kernel\r
  \r
-        s_dist[threadIdx.x] = dist;\r
-        s_trainIdx[threadIdx.x] = bestIdx;\r
-        __syncthreads();\r
+template <int BLOCK_SIZE> \r
+__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)\r
+{\r
+    const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;\r
+    __shared__ float s_dist[SMEM_SIZE];\r
+    __shared__ int s_trainIdx[SMEM_SIZE];\r
  \r
-        reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());\r
+    const int queryIdx = blockIdx.x;\r
  \r
-        if (threadIdx.x == 0)\r
-        {\r
-            if (dist < numeric_limits<float>::max())\r
-            {\r
-                allDistRow[bestIdx] = numeric_limits<float>::max();\r
-                trainIdx.ptr(queryIdx)[i] = bestIdx;\r
-                distance.ptr(queryIdx)[i] = dist;\r
-            }\r
-        }\r
-    }\r
+    float* allDistRow = allDist.ptr(queryIdx);\r
  \r
-    template <int BLOCK_SIZE> \r
-    void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)\r
+    float dist = numeric_limits<float>::max();\r
+    int bestIdx = -1;\r
+    \r
+    for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)\r
      {\r
-        const dim3 block(BLOCK_SIZE, 1, 1);\r
-        const dim3 grid(trainIdx.rows, 1, 1);\r
-\r
-        for (int i = 0; i < k; ++i)\r
+        float reg = allDistRow[i];\r
+        if (reg < dist)\r
          {\r
-            findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);\r
-            cudaSafeCall( cudaGetLastError() );\r
+            dist = reg;\r
+            bestIdx = i;\r
          }\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
      }\r
  \r
-    void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)\r
-    {\r
-        findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);\r
-    }\r
+    s_dist[threadIdx.x] = dist;\r
+    s_trainIdx[threadIdx.x] = bestIdx;\r
+    __syncthreads();\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // knn match Dispatcher\r
+    reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());\r
  \r
-    template <typename Dist, typename T, typename Mask>\r
-    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
-        int cc, cudaStream_t stream)\r
+    if (threadIdx.x == 0)\r
      {\r
-        if (k == 2)\r
-        {\r
-            match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);\r
-        }\r
-        else\r
+        if (dist < numeric_limits<float>::max())\r
          {\r
-            calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);\r
-            findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);\r
+            allDistRow[bestIdx] = numeric_limits<float>::max();\r
+            trainIdx.ptr(queryIdx)[i] = bestIdx;\r
+            distance.ptr(queryIdx)[i] = dist;\r
          }\r
-    }     \r
-    \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // knn match caller\r
-\r
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (mask.data)\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
-        else\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
      }\r
+}\r
  \r
-    template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+template <int BLOCK_SIZE> \r
+void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, 1, 1);\r
+    const dim3 grid(trainIdx.rows, 1, 1);\r
  \r
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,\r
-        int cc, cudaStream_t stream)\r
+    for (int i = 0; i < k; ++i)\r
      {\r
-        if (mask.data)\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
-        else\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
+        findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);\r
+        cudaSafeCall( cudaGetLastError() );\r
      }\r
  \r
-    //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,\r
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (mask.data)\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
-        else\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
-    }\r
+void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)\r
+{\r
+    findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);\r
+}\r
  \r
-    template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+///////////////////////////////////////////////////////////////////////////////\r
+// knn match Dispatcher\r
  \r
-    template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
-        int cc, cudaStream_t stream)\r
+template <typename Dist, typename T, typename Mask>\r
+void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask, \r
+    const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (k == 2)\r
      {\r
-        if (masks.data)\r
-            match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
-        else\r
-            match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
+        match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);\r
      }\r
-    \r
-    template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
-        int cc, cudaStream_t stream)\r
+    else\r
      {\r
-        if (masks.data)\r
-            match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
-        else\r
-            match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
+        calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);\r
+        findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);\r
      }\r
-    \r
-    //template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    \r
-    template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (masks.data)\r
-            match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
-        else\r
-            match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
-    }\r
-    \r
-    template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2Hamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-}}}\r
+}     \r
+\r
+///////////////////////////////////////////////////////////////////////////////\r
+// knn match caller\r
+\r
+template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
+    const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
+    else\r
+        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
+}\r
+\r
+template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
+    const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,\r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
+    else\r
+        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
+}\r
+\r
+//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,\r
+    const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
+    else\r
+        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
+}\r
+\r
+template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+    const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (masks.data)\r
+        match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
+    else\r
+        match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
+}\r
+\r
+template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+//template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+template void match2L1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+    const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (masks.data)\r
+        match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
+    else\r
+        match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
+}\r
+\r
+//template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+//template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+//template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+//template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+//template void match2L2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+    const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (masks.data)\r
+        match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
+    else\r
+        match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
+}\r
+\r
+template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+//template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+//template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+template void match2Hamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
+\r
+} // namespace bf_knnmatch\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu

index e46939f..0ab56be 100644 (file)
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@@ -45,734 +45,736 @@
  #include "opencv2/gpu/device/vec_distance.hpp"\r
  #include "opencv2/gpu/device/datamov_utils.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace bf_match\r
+namespace bf_match {\r
+\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Reduction\r
+\r
+template <int BLOCK_SIZE> \r
+__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)\r
  {\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Reduction\r
+    s_distance += threadIdx.y * BLOCK_SIZE;\r
+    s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
  \r
-    template <int BLOCK_SIZE> \r
-    __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)\r
-    {\r
-        s_distance += threadIdx.y * BLOCK_SIZE;\r
-        s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
+    s_distance[threadIdx.x] = bestDistance;\r
+    s_trainIdx[threadIdx.x] = bestTrainIdx;\r
  \r
-        s_distance[threadIdx.x] = bestDistance;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx;\r
+    __syncthreads();\r
  \r
-        __syncthreads();\r
+    reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());\r
+}\r
  \r
-        reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());\r
-    }\r
+template <int BLOCK_SIZE> \r
+__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
+{\r
+    s_distance += threadIdx.y * BLOCK_SIZE;\r
+    s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
+    s_imgIdx   += threadIdx.y * BLOCK_SIZE;\r
  \r
-    template <int BLOCK_SIZE> \r
-    __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
-    {\r
-        s_distance += threadIdx.y * BLOCK_SIZE;\r
-        s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
-        s_imgIdx   += threadIdx.y * BLOCK_SIZE;\r
+    s_distance[threadIdx.x] = bestDistance;\r
+    s_trainIdx[threadIdx.x] = bestTrainIdx;\r
+    s_imgIdx  [threadIdx.x] = bestImgIdx;\r
  \r
-        s_distance[threadIdx.x] = bestDistance;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx;\r
-        s_imgIdx  [threadIdx.x] = bestImgIdx;\r
+    __syncthreads();\r
  \r
-        __syncthreads();\r
+    reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());\r
+}\r
  \r
-        reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());\r
-    }\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match Unrolled Cached\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match Unrolled Cached\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
+__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
+{\r
+    #pragma unroll\r
+    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+    {\r
+        const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+        s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;\r
+    }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
-    __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+                                   typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+                                   float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
+{\r
+    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
      {\r
+        Dist dist;\r
+\r
          #pragma unroll\r
          for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
          {\r
              const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-            s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;\r
-        }\r
-    }\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                                       typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                                       float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
-    {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
-        {\r
-            Dist dist;\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
  \r
-            #pragma unroll\r
-            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+            if (loadX < train.cols)\r
              {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+                T val;\r
  \r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-\r
-                if (loadX < train.cols)\r
-                {\r
-                    T val;\r
-\r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
+                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+            }\r
  \r
-                __syncthreads();\r
+            __syncthreads();\r
  \r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+            #pragma unroll\r
+            for (int j = 0; j < BLOCK_SIZE; ++j)\r
+                dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
  \r
-                __syncthreads();\r
-            }\r
+            __syncthreads();\r
+        }\r
  \r
-            typename Dist::result_type distVal = dist;\r
+        typename Dist::result_type distVal = dist;\r
  \r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
  \r
-            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
-            {\r
-                bestImgIdx = imgIdx;\r
-                bestDistance = distVal;\r
-                bestTrainIdx = trainIdx;\r
-            }\r
+        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
+        {\r
+            bestImgIdx = imgIdx;\r
+            bestDistance = distVal;\r
+            bestTrainIdx = trainIdx;\r
          }\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
  \r
-        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
+    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
  \r
-        float myBestDistance = numeric_limits<float>::max();\r
-        int myBestTrainIdx = -1;\r
+    float myBestDistance = numeric_limits<float>::max();\r
+    int myBestTrainIdx = -1;\r
  \r
-        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
+    loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = myBestTrainIdx;\r
-            bestDistance[queryIdx] = myBestDistance;\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+        bestDistance[queryIdx] = myBestDistance;\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                             const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
-                             cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+                         const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+                         cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
-                                        int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
+                                    int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
  \r
-        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
+    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
  \r
-        float myBestDistance = numeric_limits<float>::max();\r
-        int myBestTrainIdx = -1;\r
-        int myBestImgIdx = -1;\r
+    float myBestDistance = numeric_limits<float>::max();\r
+    int myBestTrainIdx = -1;\r
+    int myBestImgIdx = -1;\r
  \r
-        Mask m = mask;\r
+    Mask m = mask;\r
  \r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
-        }\r
+    for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+    {\r
+        const DevMem2D_<T> train = trains[imgIdx];\r
+        m.next();\r
+        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
+    }\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = myBestTrainIdx;\r
-            bestImgIdx[queryIdx] = myBestImgIdx;\r
-            bestDistance[queryIdx] = myBestDistance;\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+        bestImgIdx[queryIdx] = myBestImgIdx;\r
+        bestDistance[queryIdx] = myBestDistance;\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
-                             cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+                         const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+                         cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match Unrolled\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match Unrolled\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                                 typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                                 float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+                             typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
+{\r
+    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
      {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
-        {\r
-            Dist dist;\r
+        Dist dist;\r
  \r
-            #pragma unroll\r
-            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
-            {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+        #pragma unroll\r
+        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+        {\r
+            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
  \r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
  \r
-                if (loadX < query.cols)\r
-                {\r
-                    T val;\r
+            if (loadX < query.cols)\r
+            {\r
+                T val;\r
  \r
-                    ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val);\r
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
  \r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
+                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+            }\r
  \r
-                __syncthreads();\r
+            __syncthreads();\r
  \r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+            #pragma unroll\r
+            for (int j = 0; j < BLOCK_SIZE; ++j)\r
+                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
  \r
-                __syncthreads();\r
-            }\r
+            __syncthreads();\r
+        }\r
  \r
-            typename Dist::result_type distVal = dist;\r
+        typename Dist::result_type distVal = dist;\r
  \r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
  \r
-            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
-            {\r
-                bestImgIdx = imgIdx;\r
-                bestDistance = distVal;\r
-                bestTrainIdx = trainIdx;\r
-            }\r
+        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
+        {\r
+            bestImgIdx = imgIdx;\r
+            bestDistance = distVal;\r
+            bestTrainIdx = trainIdx;\r
          }\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
-    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
+__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        float myBestDistance = numeric_limits<float>::max();\r
-        int myBestTrainIdx = -1;\r
+    float myBestDistance = numeric_limits<float>::max();\r
+    int myBestTrainIdx = -1;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        \r
-        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    \r
+    loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = myBestTrainIdx;\r
-            bestDistance[queryIdx] = myBestDistance;\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+        bestDistance[queryIdx] = myBestDistance;\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                       const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
-                       cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+                   const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+                   cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
-    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
-                                  int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
+__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
+                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        float myBestDistance = numeric_limits<float>::max();\r
-        int myBestTrainIdx = -1;\r
-        int myBestImgIdx = -1;\r
+    float myBestDistance = numeric_limits<float>::max();\r
+    int myBestTrainIdx = -1;\r
+    int myBestImgIdx = -1;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        Mask m = mask;\r
-        \r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
-        }\r
+    Mask m = mask;\r
+    \r
+    for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+    {\r
+        const DevMem2D_<T> train = trains[imgIdx];\r
+        m.next();\r
+        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
+    }\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = myBestTrainIdx;\r
-            bestImgIdx[queryIdx] = myBestImgIdx;\r
-            bestDistance[queryIdx] = myBestDistance;\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+        bestImgIdx[queryIdx] = myBestImgIdx;\r
+        bestDistance[queryIdx] = myBestDistance;\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                       const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
-                       cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+                   const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+                   cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                         typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                         float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
+                     typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
+                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)\r
+{\r
+    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
      {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
-        {\r
-            Dist dist;\r
+        Dist dist;\r
  \r
-            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
-            {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
+        {\r
+            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
  \r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
  \r
-                if (loadX < query.cols)\r
-                {\r
-                    T val;\r
+            if (loadX < query.cols)\r
+            {\r
+                T val;\r
  \r
-                    ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val);\r
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);\r
+                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
  \r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
+                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
+            }\r
  \r
-                __syncthreads();\r
+            __syncthreads();\r
  \r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+            #pragma unroll\r
+            for (int j = 0; j < BLOCK_SIZE; ++j)\r
+                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
  \r
-                __syncthreads();\r
-            }\r
+            __syncthreads();\r
+        }\r
  \r
-            typename Dist::result_type distVal = dist;\r
+        typename Dist::result_type distVal = dist;\r
  \r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
+        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
  \r
-            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
-            {\r
-                bestImgIdx = imgIdx;\r
-                bestDistance = distVal;\r
-                bestTrainIdx = trainIdx;\r
-            }\r
+        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))\r
+        {\r
+            bestImgIdx = imgIdx;\r
+            bestDistance = distVal;\r
+            bestTrainIdx = trainIdx;\r
          }\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
-    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
+__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        float myBestDistance = numeric_limits<float>::max();\r
-        int myBestTrainIdx = -1;\r
+    float myBestDistance = numeric_limits<float>::max();\r
+    int myBestTrainIdx = -1;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        \r
-        loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    \r
+    loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = myBestTrainIdx;\r
-            bestDistance[queryIdx] = myBestDistance;\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+        bestDistance[queryIdx] = myBestDistance;\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-               const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
-               cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+           const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+           cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
-    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
-                          int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
+__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, \r
+                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)\r
+{\r
+    extern __shared__ int smem[];\r
  \r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
+    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
  \r
-        float myBestDistance = numeric_limits<float>::max();\r
-        int myBestTrainIdx = -1;\r
-        int myBestImgIdx = -1;\r
+    float myBestDistance = numeric_limits<float>::max();\r
+    int myBestTrainIdx = -1;\r
+    int myBestImgIdx = -1;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        Mask m = mask;\r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
-        }\r
+    Mask m = mask;\r
+    for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
+    {\r
+        const DevMem2D_<T> train = trains[imgIdx];\r
+        m.next();\r
+        loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);\r
+    }\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
+    float* s_distance = (float*)(smem);\r
+    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
  \r
-        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);\r
+    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);\r
  \r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = myBestTrainIdx;\r
-            bestImgIdx[queryIdx] = myBestImgIdx;\r
-            bestDistance[queryIdx] = myBestDistance;\r
-        }\r
+    if (queryIdx < query.rows && threadIdx.x == 0)\r
+    {\r
+        bestTrainIdx[queryIdx] = myBestTrainIdx;\r
+        bestImgIdx[queryIdx] = myBestImgIdx;\r
+        bestDistance[queryIdx] = myBestDistance;\r
      }\r
+}\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-               const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
-               cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+           const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+           cudaStream_t stream)\r
+{\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
  \r
-        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
  \r
-        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match dispatcher\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match dispatcher\r
  \r
-    template <typename Dist, typename T, typename Mask> \r
-    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                         const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
-                         int cc, cudaStream_t stream)\r
+template <typename Dist, typename T, typename Mask> \r
+void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
+                     const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+                     int cc, cudaStream_t stream)\r
+{\r
+    if (query.cols <= 64)\r
      {\r
-        if (query.cols <= 64)\r
-        {\r
-            matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);\r
-        }*/\r
-        else\r
-        {\r
-            match<16, Dist>(query, train, mask, trainIdx, distance, stream);\r
-        }\r
+        matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);\r
      }\r
-\r
-    template <typename Dist, typename T, typename Mask> \r
-    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                         const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
-                         int cc, cudaStream_t stream)\r
+    else if (query.cols <= 128)\r
      {\r
-        if (query.cols <= 64)\r
-        {\r
-            matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
-        }*/\r
-        else\r
-        {\r
-            match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
-        }\r
+        matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);\r
      }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match caller\r
-\r
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
-                                           const DevMem2Di& trainIdx, const DevMem2Df& distance,\r
-                                           int cc, cudaStream_t stream)\r
+    /*else if (query.cols <= 256)\r
      {\r
-        if (mask.data)\r
-        {\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
-                trainIdx, distance, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
-                trainIdx, distance, \r
-                cc, stream);\r
-        }\r
+        matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);\r
      }\r
-\r
-    template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
-                                           const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
-                                           int cc, cudaStream_t stream)\r
+    else if (query.cols <= 512)\r
+    {            \r
+        matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);\r
+    }\r
+    else if (query.cols <= 1024)\r
+    {            \r
+        matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);\r
+    }*/\r
+    else\r
      {\r
-        if (mask.data)\r
-        {\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
-                trainIdx, distance, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
-                trainIdx, distance, \r
-                cc, stream);\r
-        }\r
+        match<16, Dist>(query, train, mask, trainIdx, distance, stream);\r
      }\r
+}\r
  \r
-    //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
-                                                const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
-                                                int cc, cudaStream_t stream)\r
+template <typename Dist, typename T, typename Mask> \r
+void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
+                     const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+                     int cc, cudaStream_t stream)\r
+{\r
+    if (query.cols <= 64)\r
      {\r
-        if (mask.data)\r
-        {\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
-                trainIdx, distance, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
-                trainIdx, distance, \r
-                cc, stream);\r
-        }\r
+        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
      }\r
-\r
-    template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-                                           const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
-                                           int cc, cudaStream_t stream)\r
+    else if (query.cols <= 128)\r
      {\r
-        if (masks.data)\r
-        {\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
-                trainIdx, imgIdx, distance, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
-                trainIdx, imgIdx, distance, \r
-                cc, stream);\r
-        }\r
+        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+    }\r
+    /*else if (query.cols <= 256)\r
+    {\r
+        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+    }\r
+    else if (query.cols <= 512)\r
+    {            \r
+        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
      }\r
+    else if (query.cols <= 1024)\r
+    {            \r
+        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+    }*/\r
+    else\r
+    {\r
+        match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);\r
+    }\r
+}\r
  \r
-    template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match caller\r
  \r
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-                                           const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
-                                           int cc, cudaStream_t stream)\r
+template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+                                       const DevMem2Di& trainIdx, const DevMem2Df& distance,\r
+                                       int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
      {\r
-        if (masks.data)\r
-        {\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
-                trainIdx, imgIdx, distance, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
-                trainIdx, imgIdx, distance, \r
-                cc, stream);\r
-        }\r
+        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
+            trainIdx, distance, \r
+            cc, stream);\r
      }\r
+    else\r
+    {\r
+        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
+            trainIdx, distance, \r
+            cc, stream);\r
+    }\r
+}\r
+\r
+template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+                                       const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+                                       int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+    {\r
+        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
+            trainIdx, distance, \r
+            cc, stream);\r
+    }\r
+    else\r
+    {\r
+        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
+            trainIdx, distance, \r
+            cc, stream);\r
+    }\r
+}\r
+\r
+//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, \r
+                                            const DevMem2Di& trainIdx, const DevMem2Df& distance, \r
+                                            int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+    {\r
+        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), \r
+            trainIdx, distance, \r
+            cc, stream);\r
+    }\r
+    else\r
+    {\r
+        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), \r
+            trainIdx, distance, \r
+            cc, stream);\r
+    }\r
+}\r
  \r
-    //template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); /* explicit instantiations of the single-train overload: uchar, ushort and int are emitted; schar and short are commented out */\r
+//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
  \r
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-                                                const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
-                                                int cc, cudaStream_t stream)\r
+template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+                                       const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+                                       int cc, cudaStream_t stream)\r
+{ /* L1 matching against a collection of train matrices: trains.ptr() is reinterpreted as an array of DevMem2D_<T> and trains.cols is passed as its length; everything is forwarded to matchDispatcher< L1Dist<T> >. */\r
+    if (masks.data) /* mask data present: wrap masks.data in MaskCollection */\r
      {\r
-        if (masks.data)\r
-        {\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
-                trainIdx, imgIdx, distance, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
-                trainIdx, imgIdx, distance, \r
-                cc, stream);\r
-        }\r
+        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
+            trainIdx, imgIdx, distance, \r
+            cc, stream);\r
+    }\r
+    else /* no mask data: pass WithOutMask() instead */\r
+    {\r
+        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
+            trainIdx, imgIdx, distance, \r
+            cc, stream);\r
      }\r
+}\r
+\r
+template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); /* explicit instantiations of the collection overload: uchar, ushort, short, int and float are emitted; schar is commented out */\r
+//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+                                       const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+                                       int cc, cudaStream_t stream)\r
+{ /* L2 matching against a collection of train matrices: trains.ptr() is reinterpreted as an array of DevMem2D_<T> and trains.cols is passed as its length; everything is forwarded to matchDispatcher<L2Dist>. */\r
+    if (masks.data) /* mask data present: wrap masks.data in MaskCollection */\r
+    {\r
+        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
+            trainIdx, imgIdx, distance, \r
+            cc, stream);\r
+    }\r
+    else /* no mask data: pass WithOutMask() instead */\r
+    {\r
+        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
+            trainIdx, imgIdx, distance, \r
+            cc, stream);\r
+    }\r
+}\r
+\r
+//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); /* only the float instantiation of the collection overload is emitted; the integer variants above are commented out */\r
+\r
+template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
+                                            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, \r
+                                            int cc, cudaStream_t stream)\r
+{ /* Hamming matching against a collection of train matrices: trains.ptr() is reinterpreted as an array of DevMem2D_<T> and trains.cols is passed as its length; everything is forwarded to matchDispatcher<HammingDist>. */\r
+    if (masks.data) /* mask data present: wrap masks.data in MaskCollection */\r
+    {\r
+        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), \r
+            trainIdx, imgIdx, distance, \r
+            cc, stream);\r
+    }\r
+    else /* no mask data: pass WithOutMask() instead */\r
+    {\r
+        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), \r
+            trainIdx, imgIdx, distance, \r
+            cc, stream);\r
+    }\r
+}\r
+\r
+template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); /* explicit instantiations of the collection overload: uchar, ushort and int are emitted; schar and short are commented out */\r
+//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
+\r
+} // namespace bf_match\r
  \r
-    template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu

index e350075..519ed7f 100644 (file)
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@ -45,421 +45,423 @@
  #include "opencv2/gpu/device/vec_distance.hpp"\r
  #include "opencv2/gpu/device/datamov_utils.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace bf_radius_match\r
+namespace bf_radius_match {\r
+\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match Unrolled\r
+\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>\r
+__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,\r
+    PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) /* Radius-match kernel: each thread accumulates Dist between one query row and one train row and appends the pair to the query's result row when the distance is below maxDistance. The body is compiled only for __CUDA_ARCH__ >= 110. */\r
  {\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match Unrolled\r
+    #if __CUDA_ARCH__ >= 110\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>\r
-    __global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,\r
-        PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)\r
-    {\r
-        #if __CUDA_ARCH__ >= 110\r
+    extern __shared__ int smem[]; /* launch-sized shared buffer; split below into a query tile and a train tile */\r
+\r
+    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; /* query row compared by this thread */\r
+    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; /* train row compared by this thread */\r
  \r
-        extern __shared__ int smem[];\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); /* train tile starts BLOCK_SIZE * BLOCK_SIZE ints after the query tile */\r
  \r
-        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
-        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
+    Dist dist;\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    #pragma unroll\r
+    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) /* fixed trip count: only the first MAX_DESC_LEN columns are visited, so callers presumably guarantee query.cols <= MAX_DESC_LEN - TODO confirm */\r
+    {\r
+        const int loadX = threadIdx.x + i * BLOCK_SIZE; /* descriptor column loaded by this thread for tile i */\r
  \r
-        Dist dist;\r
+        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; /* zero-fill first, so columns at or beyond query.cols reach reduceIter as 0 */\r
+        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
  \r
-        #pragma unroll\r
-        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
+        if (loadX < query.cols)\r
          {\r
-            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+            T val;\r
  \r
-            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+            ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); /* row index is clamped to query.rows - 1 */\r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
  \r
-            if (loadX < query.cols)\r
-            {\r
-                T val;\r
+            ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); /* here threadIdx.y selects the train row (clamped to train.rows - 1) */\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; /* stored column-major: index is column * BLOCK_SIZE + row */\r
+        }\r
  \r
-                ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val);\r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+        __syncthreads(); /* both tiles are fully written before any thread reads them */\r
  \r
-                ForceGlob<T>::Load(train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-            }\r
+        #pragma unroll\r
+        for (int j = 0; j < BLOCK_SIZE; ++j)\r
+            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); /* column j of query row threadIdx.y against column j of train row threadIdx.x */\r
  \r
-            __syncthreads();\r
+        __syncthreads(); /* all reads are done before the next iteration overwrites the tiles */\r
+    }\r
  \r
-            #pragma unroll\r
-            for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+    float distVal = (typename Dist::result_type)dist; /* final distance via Dist's conversion to result_type */\r
  \r
-            __syncthreads();\r
+    if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)\r
+    {\r
+        unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); /* atomically takes the next slot index for this query row */\r
+        if (ind < maxCount) /* matches beyond maxCount are not stored, but nMatches[queryIdx] has already been incremented for them */\r
+        {\r
+            bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;\r
+            if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; /* bestImgIdx is written only when the template flag is set */\r
+            bestDistance.ptr(queryIdx)[ind] = distVal;\r
          }\r
+    }\r
  \r
-        float distVal = (typename Dist::result_type)dist;\r
+    #endif\r
+}\r
  \r
-        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)\r
-        {\r
-            unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
-            if (ind < maxCount)\r
-            {\r
-                bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;\r
-                if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;\r
-                bestDistance.ptr(queryIdx)[ind] = distVal;\r
-            }\r
-        }\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
+void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
+    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)\r
+{ /* Host launcher for the matchUnrolled kernel against a single train matrix (SAVE_IMG_IDX = false). */\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE); /* BLOCK_SIZE x BLOCK_SIZE threads per block */\r
+    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); /* grid x spans train rows, grid y spans query rows */\r
  \r
-        #endif\r
-    }\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); /* room for two BLOCK_SIZE * BLOCK_SIZE int tiles (query and train) */\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, \r
+        trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); /* imgIdx is 0 and bestImgIdx is a default PtrStepi() since no image index is saved; trainIdx.cols becomes the kernel's maxCount */\r
+    cudaSafeCall( cudaGetLastError() ); /* checks the launch for errors */\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+    if (stream == 0) /* default stream: wait for the device before returning */\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}   \r
  \r
-        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, \r
-            trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);\r
-        cudaSafeCall( cudaGetLastError() );\r
+template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T> \r
+void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    cudaStream_t stream)\r
+{ /* Host launcher for matching against n train matrices: one matchUnrolled launch per train matrix with SAVE_IMG_IDX = true, all writing into the same trainIdx/imgIdx/distance/nMatches buffers. */\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE); /* BLOCK_SIZE x BLOCK_SIZE threads per block */\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }   \r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); /* room for two BLOCK_SIZE * BLOCK_SIZE int tiles (query and train) */\r
  \r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T> \r
-    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        cudaStream_t stream)\r
+    for (int i = 0; i < n; ++i) /* i is also passed to the kernel as imgIdx */\r
      {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+        const DevMem2D_<T> train = trains[i];\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); /* grid is recomputed per train matrix because train.rows can differ */\r
  \r
-        for (int i = 0; i < n; ++i)\r
+        if (masks != 0 && masks[i].data) /* use SingleMask only when a mask array exists and entry i has data */\r
          {\r
-            const DevMem2D_<T> train = trains[i];\r
-\r
-            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
-\r
-            if (masks != 0 && masks[i].data)\r
-            {\r
-                matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), \r
-                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
-            }\r
-            else\r
-            {\r
-                matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), \r
-                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
-            }\r
-            cudaSafeCall( cudaGetLastError() );\r
+            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), \r
+                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
          }\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+        else\r
+        {\r
+            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), \r
+                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
+        }\r
+        cudaSafeCall( cudaGetLastError() ); /* checked after every launch */\r
      }\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match\r
+    if (stream == 0) /* default stream: wait once, after all launches were issued */\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>\r
-    __global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,\r
-        PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)\r
-    {\r
-        #if __CUDA_ARCH__ >= 110\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match\r
  \r
-        extern __shared__ int smem[];\r
+template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>\r
+__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,\r
+    PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)\r
+{ /* Radius-match kernel for any descriptor length: each thread accumulates Dist between one query row and one train row and appends the pair to the query's result row when the distance is below maxDistance. The body is compiled only for __CUDA_ARCH__ >= 110. */\r
+    #if __CUDA_ARCH__ >= 110\r
  \r
-        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
-        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
+    extern __shared__ int smem[]; /* launch-sized shared buffer; split below into a query tile and a train tile */\r
  \r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
+    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; /* query row compared by this thread */\r
+    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; /* train row compared by this thread */\r
  \r
-        Dist dist;\r
+    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
+    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); /* train tile starts BLOCK_SIZE * BLOCK_SIZE ints after the query tile */\r
  \r
-        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
-        {\r
-            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
+    Dist dist;\r
+\r
+    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) /* endi is query.cols / BLOCK_SIZE rounded up, so every descriptor column is visited */\r
+    {\r
+        const int loadX = threadIdx.x + i * BLOCK_SIZE; /* descriptor column loaded by this thread for tile i */\r
  \r
-            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
+        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; /* zero-fill first, so columns at or beyond query.cols reach reduceIter as 0 */\r
+        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
  \r
-            if (loadX < query.cols)\r
-            {\r
-                T val;\r
+        if (loadX < query.cols)\r
+        {\r
+            T val;\r
  \r
-                ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val);\r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
+            ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); /* row index is clamped to query.rows - 1 */\r
+            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
  \r
-                ForceGlob<T>::Load(train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-            }\r
+            ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); /* here threadIdx.y selects the train row (clamped to train.rows - 1) */\r
+            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; /* stored column-major: index is column * BLOCK_SIZE + row */\r
+        }\r
  \r
-            __syncthreads();\r
+        __syncthreads(); /* both tiles are fully written before any thread reads them */\r
  \r
-            #pragma unroll\r
-            for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
+        #pragma unroll\r
+        for (int j = 0; j < BLOCK_SIZE; ++j)\r
+            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); /* column j of query row threadIdx.y against column j of train row threadIdx.x */\r
  \r
-            __syncthreads();\r
-        }\r
+        __syncthreads(); /* all reads are done before the next iteration overwrites the tiles */\r
+    }\r
  \r
-        float distVal = (typename Dist::result_type)dist;\r
+    float distVal = (typename Dist::result_type)dist; /* final distance via Dist's conversion to result_type */\r
  \r
-        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)\r
+    if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)\r
+    {\r
+        unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); /* atomically takes the next slot index for this query row */\r
+        if (ind < maxCount) /* matches beyond maxCount are not stored, but nMatches[queryIdx] has already been incremented for them */\r
          {\r
-            unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
-            if (ind < maxCount)\r
-            {\r
-                bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;\r
-                if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;\r
-                bestDistance.ptr(queryIdx)[ind] = distVal;\r
-            }\r
+            bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;\r
+            if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; /* bestImgIdx is written only when the template flag is set */\r
+            bestDistance.ptr(queryIdx)[ind] = distVal;\r
          }\r
-\r
-        #endif\r
      }\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
+    #endif\r
+}\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
+void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
+    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    cudaStream_t stream)\r
+{ /* Host launcher for the match kernel against a single train matrix (SAVE_IMG_IDX = false). */\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE); /* BLOCK_SIZE x BLOCK_SIZE threads per block */\r
+    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); /* grid x spans train rows, grid y spans query rows */\r
  \r
-        match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, \r
-            trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); /* room for two BLOCK_SIZE * BLOCK_SIZE int tiles (query and train) */\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, \r
+        trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); /* imgIdx is 0 and bestImgIdx is a default PtrStepi() since no image index is saved; trainIdx.cols becomes the kernel's maxCount */\r
+    cudaSafeCall( cudaGetLastError() ); /* checks the launch for errors */\r
  \r
-    template <int BLOCK_SIZE, typename Dist, typename T> \r
-    void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
+    if (stream == 0) /* default stream: wait for the device before returning */\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
+template <int BLOCK_SIZE, typename Dist, typename T> \r
+void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    cudaStream_t stream)\r
+{ /* Host launcher for matching against n train matrices: one match launch per train matrix with SAVE_IMG_IDX = true, all writing into the same trainIdx/imgIdx/distance/nMatches buffers. */\r
+    const dim3 block(BLOCK_SIZE, BLOCK_SIZE); /* BLOCK_SIZE x BLOCK_SIZE threads per block */\r
  \r
-        for (int i = 0; i < n; ++i)\r
-        {\r
-            const DevMem2D_<T> train = trains[i];\r
-\r
-            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
-\r
-            if (masks != 0 && masks[i].data)\r
-            {\r
-                match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), \r
-                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
-            }\r
-            else\r
-            {\r
-                match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), \r
-                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
-            }\r
-            cudaSafeCall( cudaGetLastError() );\r
-        }\r
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); /* room for two BLOCK_SIZE * BLOCK_SIZE int tiles (query and train) */\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    for (int i = 0; i < n; ++i) /* i is also passed to the kernel as imgIdx; SingleMask(masks[i]) is used when a mask array exists and entry i has data, WithOutMask() otherwise */\r
+    {\r
+        const DevMem2D_<T> train = trains[i];\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match dispatcher\r
+        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); /* grid is recomputed per train matrix because train.rows can differ */\r
  \r
-    template <typename Dist, typename T, typename Mask> \r
-    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
-                         const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-                         int cc, cudaStream_t stream)\r
-    {\r
-        if (query.cols <= 64)\r
-        {\r
-            matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
+        if (masks != 0 && masks[i].data)\r
          {\r
-            matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+            match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), \r
+                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
          }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
-        }*/\r
          else\r
          {\r
-            match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+            match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), \r
+                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);\r
          }\r
+        cudaSafeCall( cudaGetLastError() ); /* checked after every launch */\r
      }\r
  \r
-    template <typename Dist, typename T> \r
-    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
-                         const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-                         int cc, cudaStream_t stream)\r
+    if (stream == 0) /* default stream: wait once, after all launches were issued */\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Match dispatcher\r
+\r
+template <typename Dist, typename T, typename Mask> \r
+void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, \r
+                     const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+                     int cc, cudaStream_t stream)\r
+{\r
+    if (query.cols <= 64)\r
      {\r
-        if (query.cols <= 64)\r
-        {\r
-            matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
-        }*/\r
-        else\r
-        {\r
-            match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
-        }\r
-    } \r
-    \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Radius Match caller\r
-\r
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        int cc, cudaStream_t stream)\r
+        matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+    }\r
+    else if (query.cols <= 128)\r
      {\r
-        if (mask.data)\r
-        {\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
-                trainIdx, distance, nMatches, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
-                trainIdx, distance, nMatches, \r
-                cc, stream);\r
-        }\r
+        matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
      }\r
-\r
-    template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        int cc, cudaStream_t stream)\r
+    /*else if (query.cols <= 256)\r
      {\r
-        if (mask.data)\r
-        {\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
-                trainIdx, distance, nMatches, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
-                trainIdx, distance, nMatches, \r
-                cc, stream);\r
-        }\r
+        matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
      }\r
+    else if (query.cols <= 512)\r
+    {            \r
+        matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+    }\r
+    else if (query.cols <= 1024)\r
+    {            \r
+        matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+    }*/\r
+    else\r
+    {\r
+        match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);\r
+    }\r
+}\r
  \r
-    //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        int cc, cudaStream_t stream)\r
+template <typename Dist, typename T> \r
+void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+                     const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+                     int cc, cudaStream_t stream)\r
+{\r
+    if (query.cols <= 64)\r
      {\r
-        if (mask.data)\r
-        {\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
-                trainIdx, distance, nMatches, \r
-                cc, stream);\r
-        }\r
-        else\r
-        {\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
-                trainIdx, distance, nMatches, \r
-                cc, stream);\r
-        }\r
+        matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
      }\r
+    else if (query.cols <= 128)\r
+    {\r
+        matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+    }\r
+    /*else if (query.cols <= 256)\r
+    {\r
+        matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+    }\r
+    else if (query.cols <= 512)\r
+    {            \r
+        matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+    }\r
+    else if (query.cols <= 1024)\r
+    {            \r
+        matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+    }*/\r
+    else\r
+    {\r
+        match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);\r
+    }\r
+} \r
  \r
-    template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Radius Match caller\r
  \r
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        int cc, cudaStream_t stream)\r
+template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
      {\r
-        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
-            trainIdx, imgIdx, distance, nMatches, \r
+        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
+            trainIdx, distance, nMatches, \r
              cc, stream);\r
      }\r
-\r
-    template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        int cc, cudaStream_t stream)\r
+    else\r
      {\r
-        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
-            trainIdx, imgIdx, distance, nMatches, \r
+        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
+            trainIdx, distance, nMatches, \r
              cc, stream);\r
      }\r
-\r
-    //template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
-        int cc, cudaStream_t stream)\r
+}\r
+\r
+template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+    {\r
+        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
+            trainIdx, distance, nMatches, \r
+            cc, stream);\r
+    }\r
+    else\r
+    {\r
+        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
+            trainIdx, distance, nMatches, \r
+            cc, stream);\r
+    }\r
+}\r
+\r
+//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, \r
+    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+    {\r
+        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), \r
+            trainIdx, distance, nMatches, \r
+            cc, stream);\r
+    }\r
+    else\r
      {\r
-        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
-            trainIdx, imgIdx, distance, nMatches, \r
+        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), \r
+            trainIdx, distance, nMatches, \r
              cc, stream);\r
      }\r
+}\r
+\r
+template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
+        trainIdx, imgIdx, distance, nMatches, \r
+        cc, stream);\r
+}\r
+\r
+template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
+        trainIdx, imgIdx, distance, nMatches, \r
+        cc, stream);\r
+}\r
+\r
+//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchL2_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+\r
+template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, \r
+    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, \r
+    int cc, cudaStream_t stream)\r
+{\r
+    matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, \r
+        trainIdx, imgIdx, distance, nMatches, \r
+        cc, stream);\r
+}\r
+\r
+template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+template void matchHamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
+\r
+} // namespace bf_radius_match\r
  \r
-    template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu

index 173c156..4d3d9bc 100644 (file)
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ b/modules/gpu/src/cuda/bilateral_filter.cu
@@ -43,191 +43,186 @@
  #include "internal_shared.hpp"\r
  #include "opencv2/gpu/device/limits.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace bf_krnls\r
+namespace bilateral_filter {\r
+\r
+__constant__ float* ctable_color; /* colour-weight table pointer, copied from table_color by load_constants; the kernel indexes it by the DistRgbMax distance */\r
+__constant__ float* ctable_space; /* spatial-weight table pointer (table_space.data); the kernel indexes it as row ::abs(y-yi), column ::abs(x-xi) */\r
+__constant__ size_t ctable_space_step; /* row stride of ctable_space in floats: load_constants stores table_space.step / sizeof(float) */\r
+\r
+__constant__ int cndisp; /* copy of ndisp; not read by the kernel in this file */\r
+__constant__ int cradius; /* half-size of the window scanned around (x, y), clamped to the image bounds by the kernel */\r
+\r
+__constant__ short cedge_disc; /* a pixel is filtered only if its disparity differs from a 4-neighbour by at least this much */\r
+__constant__ short cmax_disc; /* upper clamp applied to each disparity difference before it is weighted into the cost */\r
+\r
+void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)\r
  {\r
-    __constant__ float* ctable_color;\r
-    __constant__ float* ctable_space;\r
-    __constant__ size_t ctable_space_step;\r
+    cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );\r
+    size_t table_space_step = table_space.step / sizeof(float);\r
+    cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );\r
  \r
-    __constant__ int cndisp;\r
-    __constant__ int cradius;\r
+    cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );\r
  \r
-    __constant__ short cedge_disc;\r
-    __constant__ short cmax_disc;\r
+    cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );\r
  }\r
  \r
-namespace cv { namespace gpu { namespace bf \r
+template <int channels>\r
+struct DistRgbMax\r
  {\r
-    void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)\r
+    static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)\r
      {\r
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );\r
-        size_t table_space_step = table_space.step / sizeof(float);\r
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );\r
-\r
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );\r
-\r
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );\r
+        uchar x = ::abs(a[0] - b[0]);\r
+        uchar y = ::abs(a[1] - b[1]);\r
+        uchar z = ::abs(a[2] - b[2]);\r
+        return (::max(::max(x, y), z));\r
      }\r
-}}}\r
+};\r
  \r
-namespace bf_krnls\r
+template <>\r
+struct DistRgbMax<1>\r
  {\r
-    template <int channels>\r
-    struct DistRgbMax\r
+    static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)\r
      {\r
-        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)\r
-        {\r
-            uchar x = abs(a[0] - b[0]);\r
-            uchar y = abs(a[1] - b[1]);\r
-            uchar z = abs(a[2] - b[2]);\r
-            return (max(max(x, y), z));\r
-        }\r
-    };\r
+        return ::abs(a[0] - b[0]);\r
+    }\r
+};\r
  \r
-    template <>\r
-    struct DistRgbMax<1>\r
-    {\r
-        static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)\r
-        {\r
-            return abs(a[0] - b[0]);\r
-        }\r
-    };\r
+template <int channels, typename T>\r
+__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)\r
+{\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+    const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);\r
  \r
-    template <int channels, typename T>\r
-    __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)\r
-    {\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-        const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);\r
+    T dp[5];\r
  \r
-        T dp[5];\r
+    if (y > 0 && y < h - 1 && x > 0 && x < w - 1)\r
+    {\r
+        dp[0] = *(disp + (y  ) * disp_step + x + 0);\r
+        dp[1] = *(disp + (y-1) * disp_step + x + 0);\r
+        dp[2] = *(disp + (y  ) * disp_step + x - 1);\r
+        dp[3] = *(disp + (y+1) * disp_step + x + 0);\r
+        dp[4] = *(disp + (y  ) * disp_step + x + 1);\r
  \r
-        if (y > 0 && y < h - 1 && x > 0 && x < w - 1)\r
+        if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)            \r
          {\r
-            dp[0] = *(disp + (y  ) * disp_step + x + 0);\r
-            dp[1] = *(disp + (y-1) * disp_step + x + 0);\r
-            dp[2] = *(disp + (y  ) * disp_step + x - 1);\r
-            dp[3] = *(disp + (y+1) * disp_step + x + 0);\r
-            dp[4] = *(disp + (y  ) * disp_step + x + 1);\r
+            const int ymin = ::max(0, y - cradius);\r
+            const int xmin = ::max(0, x - cradius);\r
+            const int ymax = ::min(h - 1, y + cradius);\r
+            const int xmax = ::min(w - 1, x + cradius);\r
  \r
-            if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)            \r
-            {\r
-                const int ymin = max(0, y - cradius);\r
-                const int xmin = max(0, x - cradius);\r
-                const int ymax = min(h - 1, y + cradius);\r
-                const int xmax = min(w - 1, x + cradius);\r
+            float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};\r
  \r
-                float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};\r
+            const uchar* ic = img + y * img_step + channels * x;\r
  \r
-                const uchar* ic = img + y * img_step + channels * x;\r
+            for(int yi = ymin; yi <= ymax; yi++)\r
+            {\r
+                const T* disp_y = disp + yi * disp_step;\r
  \r
-                for(int yi = ymin; yi <= ymax; yi++)\r
+                for(int xi = xmin; xi <= xmax; xi++)\r
                  {\r
-                    const T* disp_y = disp + yi * disp_step;\r
-\r
-                    for(int xi = xmin; xi <= xmax; xi++)\r
-                    {\r
-                        const uchar* in = img + yi * img_step + channels * xi;\r
+                    const uchar* in = img + yi * img_step + channels * xi;\r
  \r
-                        uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);\r
+                    uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);\r
  \r
-                        const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];\r
+                    const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];\r
  \r
-                        const T disp_reg = disp_y[xi];\r
+                    const T disp_reg = disp_y[xi];\r
  \r
-                        cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;\r
-                        cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;\r
-                        cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;\r
-                        cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;\r
-                        cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;\r
-                    }\r
+                    cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;\r
+                    cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;\r
+                    cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;\r
+                    cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;\r
+                    cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;\r
                  }\r
+            }\r
  \r
-                float minimum = numeric_limits<float>::max();\r
-                int id = 0;\r
+            float minimum = numeric_limits<float>::max();\r
+            int id = 0;\r
  \r
-                if (cost[0] < minimum)\r
-                {\r
-                    minimum = cost[0];\r
-                    id = 0;\r
-                }\r
-                if (cost[1] < minimum)\r
-                {\r
-                    minimum = cost[1];\r
-                    id = 1;\r
-                }\r
-                if (cost[2] < minimum)\r
-                {\r
-                    minimum = cost[2];\r
-                    id = 2;\r
-                }\r
-                if (cost[3] < minimum)\r
-                {\r
-                    minimum = cost[3];\r
-                    id = 3;\r
-                }\r
-                if (cost[4] < minimum)\r
-                {\r
-                    minimum = cost[4];\r
-                    id = 4;\r
-                }\r
-\r
-                *(disp + y * disp_step + x) = dp[id];\r
+            if (cost[0] < minimum)\r
+            {\r
+                minimum = cost[0];\r
+                id = 0;\r
              }\r
+            if (cost[1] < minimum)\r
+            {\r
+                minimum = cost[1];\r
+                id = 1;\r
+            }\r
+            if (cost[2] < minimum)\r
+            {\r
+                minimum = cost[2];\r
+                id = 2;\r
+            }\r
+            if (cost[3] < minimum)\r
+            {\r
+                minimum = cost[3];\r
+                id = 3;\r
+            }\r
+            if (cost[4] < minimum)\r
+            {\r
+                minimum = cost[4];\r
+                id = 4;\r
+            }\r
+\r
+            *(disp + y * disp_step + x) = dp[id];\r
          }\r
      }\r
  }\r
  \r
-namespace cv { namespace gpu { namespace bf \r
+template <typename T>     \r
+void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
  {\r
-    template <typename T>     \r
-    void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
-        grid.x = divUp(disp.cols, threads.x << 1);\r
-        grid.y = divUp(disp.rows, threads.y);\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
+    grid.x = divUp(disp.cols, threads.x << 1);\r
+    grid.y = divUp(disp.rows, threads.y);\r
  \r
-        switch (channels)\r
+    switch (channels)\r
+    {\r
+    case 1:\r
+        for (int i = 0; i < iters; ++i)\r
          {\r
-        case 1:\r
-            for (int i = 0; i < iters; ++i)\r
-            {\r
-                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
-                cudaSafeCall( cudaGetLastError() );\r
-                bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
-                cudaSafeCall( cudaGetLastError() );\r
-            }\r
-            break;\r
-        case 3:\r
-            for (int i = 0; i < iters; ++i)\r
-            {\r
-                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
-                cudaSafeCall( cudaGetLastError() );\r
-                bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
-                cudaSafeCall( cudaGetLastError() );\r
-            }\r
-            break;\r
-        default:\r
-            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
+            bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+            cudaSafeCall( cudaGetLastError() );\r
+\r
+            bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+            cudaSafeCall( cudaGetLastError() );\r
          }\r
+        break;\r
+    case 3:\r
+        for (int i = 0; i < iters; ++i)\r
+        {\r
+            bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+            cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream != 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+            bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);\r
+            cudaSafeCall( cudaGetLastError() );\r
+        }\r
+        break;\r
+    default:\r
+        cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
      }\r
  \r
-    void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream)\r
-    {\r
-        bilateral_filter_caller(disp, img, channels, iters, stream);\r
-    }\r
+    if (stream == 0) /* default stream: block until the queued kernels finish; a caller-supplied stream is left asynchronous (same rule as the blend.cu callers) */\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream)\r
-    {\r
-        bilateral_filter_caller(disp, img, channels, iters, stream);\r
-    }\r
-}}}\r
+void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
+{\r
+    bilateral_filter_caller(disp, img, channels, iters, stream);\r
+}\r
+\r
+void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)\r
+{\r
+    bilateral_filter_caller(disp, img, channels, iters, stream);\r
+}\r
+\r
+} // namespace bilateral_filter\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/blend.cu b/modules/gpu/src/cuda/blend.cu

index 4b29a70..fca1b96 100644 (file)
--- a/modules/gpu/src/cuda/blend.cu
+++ b/modules/gpu/src/cuda/blend.cu
@@ -42,81 +42,77 @@
  \r
  #include "internal_shared.hpp"\r
  \r
-using namespace cv::gpu;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu \r
+namespace blend {\r
+\r
+template <typename T>\r
+__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,\r
+                                  const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)\r
  {\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-    template <typename T>\r
-    __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,\r
-                                      const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)\r
+    if (y < rows && x < cols)\r
      {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+        int x_ = x / cn;\r
+        float w1 = weights1.ptr(y)[x_];\r
+        float w2 = weights2.ptr(y)[x_];\r
+        T p1 = img1.ptr(y)[x];\r
+        T p2 = img2.ptr(y)[x];\r
+        result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);\r
+    }\r
+}      \r
  \r
-        if (y < rows && x < cols)\r
-        {\r
-            int x_ = x / cn;\r
-            float w1 = weights1.ptr(y)[x_];\r
-            float w2 = weights2.ptr(y)[x_];\r
-            T p1 = img1.ptr(y)[x];\r
-            T p2 = img2.ptr(y)[x];\r
-            result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);\r
-        }\r
-    }  \r
+template <typename T>\r
+void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)\r
+{\r
+    dim3 threads(16, 16);\r
+    dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));\r
+    \r
+    blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-    template <typename T>\r
-    void blendLinearCaller(int rows, int cols, int cn, const PtrStep<T>& img1, const PtrStep<T>& img2, \r
-                           const PtrStepf& weights1, const PtrStepf& weights2, PtrStep<T> result, cudaStream_t stream)\r
-    {\r
-        dim3 threads(16, 16);\r
-        dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));\r
-        \r
-        blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);\r
+template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);\r
  \r
-    template void blendLinearCaller<uchar>(int, int, int, const PtrStep<uchar>&, const PtrStep<uchar>&, \r
-                                           const PtrStepf&, const PtrStepf&, PtrStep<uchar>, cudaStream_t stream);\r
-    template void blendLinearCaller<float>(int, int, int, const PtrStep<float>&, const PtrStep<float>&, \r
-                                           const PtrStepf&, const PtrStepf&, PtrStep<float>, cudaStream_t stream);\r
  \r
+__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,\r
+                                      const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)\r
+{\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-    __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,\r
-                                          const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)\r
+    if (y < rows && x < cols)\r
      {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        if (y < rows && x < cols)\r
-        {\r
-            float w1 = weights1.ptr(y)[x];\r
-            float w2 = weights2.ptr(y)[x];\r
-            float sum_inv = 1.f / (w1 + w2 + 1e-5f);\r
-            w1 *= sum_inv;\r
-            w2 *= sum_inv;\r
-            uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];\r
-            uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];\r
-            ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,\r
-                                                      p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);\r
-        }\r
+        float w1 = weights1.ptr(y)[x];\r
+        float w2 = weights2.ptr(y)[x];\r
+        float sum_inv = 1.f / (w1 + w2 + 1e-5f);\r
+        w1 *= sum_inv;\r
+        w2 *= sum_inv;\r
+        uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];\r
+        uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];\r
+        ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,\r
+                                                  p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);\r
      }\r
+}\r
  \r
+void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)\r
+{\r
+    dim3 threads(16, 16);\r
+    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+    \r
+    blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-    void blendLinearCaller8UC4(int rows, int cols, const PtrStepb& img1, const PtrStepb& img2, \r
-                               const PtrStepf& weights1, const PtrStepf& weights2, PtrStepb result, cudaStream_t stream)\r
-    {\r
-        dim3 threads(16, 16);\r
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
-        \r
-        blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+} // namespace blend \r
  \r
-}}
-\ No newline at end of file
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu

index 2a30393..1cdf191 100644 (file)
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -44,153 +44,149 @@
  #include "opencv2/gpu/device/transform.hpp"
  #include "opencv2/gpu/device/functional.hpp"
  
-#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
+BEGIN_OPENCV_DEVICE_NAMESPACE
  
-using namespace cv::gpu::device;
+#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
  
-namespace cv { namespace gpu
+namespace transform_points
  {
-    namespace transform_points
-    {
-        __constant__ float3 crot0;
-        __constant__ float3 crot1;
-        __constant__ float3 crot2;
-        __constant__ float3 ctransl;
+    __constant__ float3 crot0;
+    __constant__ float3 crot1;
+    __constant__ float3 crot2;
+    __constant__ float3 ctransl;
  
-        struct TransformOp : unary_function<float3, float3>
-        {
-            __device__ __forceinline__ float3 operator()(const float3& p) const
-            {
-                return make_float3(
-                        crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
-                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
-                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
-            }
-        };
-
-        void call(const DevMem2D_<float3> src, const float* rot,
-                  const float* transl, DevMem2D_<float3> dst,
-                  cudaStream_t stream)
+    struct TransformOp : unary_function<float3, float3>
+    {
+        __device__ __forceinline__ float3 operator()(const float3& p) const
          {
-            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-            transform(src, dst, TransformOp(), stream);
+            return make_float3(
+                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
+                    crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
+                    crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
          }
-    } // namespace transform_points
-
+    };
  
-    namespace project_points
+    void call(const DevMem2D_<float3> src, const float* rot,
+              const float* transl, DevMem2D_<float3> dst,
+              cudaStream_t stream)
      {
-        __constant__ float3 crot0;
-        __constant__ float3 crot1;
-        __constant__ float3 crot2;
-        __constant__ float3 ctransl;
-        __constant__ float3 cproj0;
-        __constant__ float3 cproj1;
-
-        struct ProjectOp : unary_function<float3, float3>
-        {
-            __device__ __forceinline__ float2 operator()(const float3& p) const
-            {
-                // Rotate and translate in 3D
-                float3 t = make_float3(
-                        crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
-                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
-                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
-                // Project on 2D plane
-                return make_float2(
-                        (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
-                        (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
-            }
-        };
-
-        void call(const DevMem2D_<float3> src, const float* rot,
-                  const float* transl, const float* proj, DevMem2D_<float2> dst,
-                  cudaStream_t stream)
+        cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
+        cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
+        cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
+        cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
+        OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream);
+    }
+} // namespace transform_points
+
+namespace project_points
+{
+    __constant__ float3 crot0;
+    __constant__ float3 crot1;
+    __constant__ float3 crot2;
+    __constant__ float3 ctransl;
+    __constant__ float3 cproj0;
+    __constant__ float3 cproj1;
+
+    struct ProjectOp : unary_function<float3, float2> // result type matches operator(), which returns the projected float2
+    {
+        __device__ __forceinline__ float2 operator()(const float3& p) const
          {
-            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
-            cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
-            transform(src, dst, ProjectOp(), stream);
+            // Rotate and translate in 3D
+            float3 t = make_float3(
+                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
+                    crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
+                    crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+            // Project on 2D plane
+            return make_float2(
+                    (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
+                    (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
          }
-    } // namespace project_points
+    };
  
+    void call(const DevMem2D_<float3> src, const float* rot,
+              const float* transl, const float* proj, DevMem2D_<float2> dst,
+              cudaStream_t stream)
+    {
+        cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
+        cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
+        cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
+        cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
+        cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
+        cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
+        OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
+    }
+} // namespace project_points
+
+namespace solve_pnp_ransac
+{
+    __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
+    __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
  
-    namespace solve_pnp_ransac
+    int maxNumIters()
      {
-        __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
-        __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
+        return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
+    }
  
-        int maxNumIters()
-        {
-            return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
-        }
+    __device__ __forceinline__ float sqr(float x)
+    {
+        return x * x;
+    }
+
+    __global__ void computeHypothesisScoresKernel(
+            const int num_points, const float3* object, const float2* image,
+            const float dist_threshold, int* g_num_inliers)
+    {
+        const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
+        const float3 &transl_vec = ctransl_vectors[blockIdx.x];
+        int num_inliers = 0;
  
-        __device__ __forceinline__ float sqr(float x)
+        for (int i = threadIdx.x; i < num_points; i += blockDim.x)
          {
-            return x * x;
+            float3 p = object[i];
+            p = make_float3(
+                    rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
+                    rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
+                    rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
+            p.x /= p.z;
+            p.y /= p.z;
+            float2 image_p = image[i];
+            if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
+                ++num_inliers;
          }
  
-        __global__ void computeHypothesisScoresKernel(
-                const int num_points, const float3* object, const float2* image,
-                const float dist_threshold, int* g_num_inliers)
+        extern __shared__ float s_num_inliers[];
+        s_num_inliers[threadIdx.x] = num_inliers;
+        __syncthreads();
+
+        for (int step = blockDim.x / 2; step > 0; step >>= 1)
          {
-            const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
-            const float3 &transl_vec = ctransl_vectors[blockIdx.x];
-            int num_inliers = 0;
-
-            for (int i = threadIdx.x; i < num_points; i += blockDim.x)
-            {
-                float3 p = object[i];
-                p = make_float3(
-                        rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
-                        rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
-                        rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
-                p.x /= p.z;
-                p.y /= p.z;
-                float2 image_p = image[i];
-                if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
-                    ++num_inliers;
-            }
-
-            extern __shared__ float s_num_inliers[];
-            s_num_inliers[threadIdx.x] = num_inliers;
+            if (threadIdx.x < step)
+                s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
              __syncthreads();
-
-            for (int step = blockDim.x / 2; step > 0; step >>= 1)
-            {
-                if (threadIdx.x < step)
-                    s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
-                __syncthreads();
-            }
-
-            if (threadIdx.x == 0)
-                g_num_inliers[blockIdx.x] = s_num_inliers[0];
          }
  
-        void computeHypothesisScores(
-                const int num_hypotheses, const int num_points, const float* rot_matrices,
-                const float3* transl_vectors, const float3* object, const float2* image,
-                const float dist_threshold, int* hypothesis_scores)
-        {
-            cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
-            cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
+        if (threadIdx.x == 0)
+            g_num_inliers[blockIdx.x] = s_num_inliers[0];
+    }
  
-            dim3 threads(256);
-            dim3 grid(num_hypotheses);
-            int smem_size = threads.x * sizeof(float);
+    void computeHypothesisScores(
+            const int num_hypotheses, const int num_points, const float* rot_matrices,
+            const float3* transl_vectors, const float3* object, const float2* image,
+            const float dist_threshold, int* hypothesis_scores)
+    {
+        cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
+        cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
  
-            computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
-                    num_points, object, image, dist_threshold, hypothesis_scores);
-            cudaSafeCall( cudaGetLastError() );
+        dim3 threads(256);
+        dim3 grid(num_hypotheses);
+        int smem_size = threads.x * sizeof(float);
  
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    } // namespace solvepnp_ransac
+        computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
+                num_points, object, image, dist_threshold, hypothesis_scores);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+} // namespace solve_pnp_ransac
  
-}} // namespace cv { namespace gpu
+END_OPENCV_DEVICE_NAMESPACE
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu

index 4ea26fc..0c0f5d9 100644 (file)
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -44,339 +44,370 @@
  #include <algorithm>\r
  #include "internal_shared.hpp"\r
  \r
-using namespace cv::gpu;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace canny\r
+namespace canny {\r
+\r
+__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
  {\r
-    __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
-    {\r
-        __shared__ int smem[16][18];\r
+    __shared__ int smem[16][18];\r
  \r
-        const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
+    const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (i < rows)\r
+    if (i < rows)\r
+    {\r
+        smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];\r
+        if (threadIdx.x == 0)\r
          {\r
-            smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];\r
-            if (threadIdx.x == 0)\r
-            {\r
-                smem[threadIdx.y][0] = src.ptr(i)[max(j - 1, 0)];\r
-                smem[threadIdx.y][17] = src.ptr(i)[min(j + 16, cols - 1)];\r
-            }\r
-            __syncthreads();\r
+            smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];\r
+            smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];\r
+        }\r
+        __syncthreads();\r
  \r
-            if (j < cols)\r
-            {\r
-                dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];\r
-                dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];\r
-            }\r
+        if (j < cols)\r
+        {\r
+            dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];\r
+            dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];\r
          }\r
      }\r
+}\r
  \r
-    void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
-    {\r
-        dim3 block(16, 16, 1);\r
-        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)\r
+{\r
+    dim3 block(16, 16, 1);\r
+    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
  \r
-        calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        cudaSafeCall(cudaThreadSynchronize());\r
-    }\r
+    cudaSafeCall(cudaThreadSynchronize());\r
+}\r
  \r
-    struct L1\r
+struct L1\r
+{\r
+    static __device__ __forceinline__ float calc(int x, int y)\r
      {\r
-        static __device__ __forceinline__ float calc(int x, int y)\r
-        {\r
-            return abs(x) + abs(y);\r
-        }\r
-    };\r
-    struct L2\r
+        return ::abs(x) + ::abs(y);\r
+    }\r
+};\r
+struct L2\r
+{\r
+    static __device__ __forceinline__ float calc(int x, int y)\r
      {\r
-        static __device__ __forceinline__ float calc(int x, int y)\r
-        {\r
-            return sqrtf(x * x + y * y);\r
-        }\r
-    };\r
+        return ::sqrtf(x * x + y * y);\r
+    }\r
+};\r
  \r
-    template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, \r
-        PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
-    {\r
-        __shared__ int sdx[18][16];\r
-        __shared__ int sdy[18][16];\r
+template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, \r
+    PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
+{\r
+    __shared__ int sdx[18][16];\r
+    __shared__ int sdy[18][16];\r
  \r
-        const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
+    const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (j < cols)\r
+    if (j < cols)\r
+    {\r
+        sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];\r
+        sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];\r
+        if (threadIdx.y == 0)\r
          {\r
-            sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];\r
-            sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];\r
-            if (threadIdx.y == 0)\r
-            {\r
-                sdx[0][threadIdx.x] = dx_buf.ptr(max(i - 1, 0))[j];\r
-                sdx[17][threadIdx.x] = dx_buf.ptr(min(i + 16, rows - 1))[j];\r
+            sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];\r
+            sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];\r
  \r
-                sdy[0][threadIdx.x] = dy_buf.ptr(max(i - 1, 0))[j];\r
-                sdy[17][threadIdx.x] = dy_buf.ptr(min(i + 16, rows - 1))[j];\r
-            }\r
-            __syncthreads();\r
+            sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];\r
+            sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];\r
+        }\r
+        __syncthreads();\r
  \r
-            if (i < rows)\r
-            {\r
-                int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];\r
-                int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];\r
+        if (i < rows)\r
+        {\r
+            int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];\r
+            int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];\r
  \r
-                dx.ptr(i)[j] = x;\r
-                dy.ptr(i)[j] = y;\r
+            dx.ptr(i)[j] = x;\r
+            dy.ptr(i)[j] = y;\r
  \r
-                mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);\r
-            }\r
+            mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);\r
          }\r
      }\r
+}\r
  \r
-    void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)\r
-    {\r
-        dim3 block(16, 16, 1);\r
-        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)\r
+{\r
+    dim3 block(16, 16, 1);\r
+    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
  \r
-        if (L2Grad)\r
-            calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);\r
-        else\r
-            calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);\r
+    if (L2Grad)\r
+        calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);\r
+    else\r
+        calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);\r
  \r
-        cudaSafeCall( cudaGetLastError() );\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        cudaSafeCall(cudaThreadSynchronize());\r
-    }\r
+    cudaSafeCall(cudaThreadSynchronize());\r
+}\r
  \r
-    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
-    {\r
-        const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)\r
+{\r
+    const int j = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int i = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (i < rows && j < cols)\r
-            mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);\r
-    }\r
+    if (i < rows && j < cols)\r
+        mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);\r
+}\r
  \r
-    void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)\r
-    {\r
-        dim3 block(16, 16, 1);\r
-        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)\r
+{\r
+    dim3 block(16, 16, 1);\r
+    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
  \r
-        if (L2Grad)\r
-            calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);\r
-        else\r
-            calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);\r
+    if (L2Grad)\r
+        calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);\r
+    else\r
+        calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);\r
  \r
-        cudaSafeCall( cudaGetLastError() );\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        cudaSafeCall(cudaThreadSynchronize());\r
-    }\r
+    cudaSafeCall(cudaThreadSynchronize());\r
+}\r
  \r
  //////////////////////////////////////////////////////////////////////////////////////////\r
-        \r
+    \r
  #define CANNY_SHIFT 15\r
  #define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)\r
  \r
-    __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)\r
-    {\r
-        __shared__ float smem[18][18];\r
+__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)\r
+{\r
+    __shared__ float smem[18][18];\r
  \r
-        const int j = blockIdx.x * 16 + threadIdx.x;\r
-        const int i = blockIdx.y * 16 + threadIdx.y;\r
+    const int j = blockIdx.x * 16 + threadIdx.x;\r
+    const int i = blockIdx.y * 16 + threadIdx.y;\r
  \r
-        const int tid = threadIdx.y * 16 + threadIdx.x;\r
-        const int lx = tid % 18;\r
-        const int ly = tid / 18;\r
+    const int tid = threadIdx.y * 16 + threadIdx.x;\r
+    const int lx = tid % 18;\r
+    const int ly = tid / 18;\r
  \r
-        if (ly < 14)\r
-            smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
+    if (ly < 14)\r
+        smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
  \r
-        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)\r
-            smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];\r
+    if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)\r
+        smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];\r
  \r
-        __syncthreads();\r
+    __syncthreads();\r
  \r
-        if (i < rows && j < cols)\r
-        {\r
-            int x = dx.ptr(i)[j];\r
-            int y = dy.ptr(i)[j];\r
-            const int s = (x ^ y) < 0 ? -1 : 1;\r
-            const float m = smem[threadIdx.y + 1][threadIdx.x + 1];\r
+    if (i < rows && j < cols)\r
+    {\r
+        int x = dx.ptr(i)[j];\r
+        int y = dy.ptr(i)[j];\r
+        const int s = (x ^ y) < 0 ? -1 : 1;\r
+        const float m = smem[threadIdx.y + 1][threadIdx.x + 1];\r
  \r
-            x = abs(x);\r
-            y = abs(y);\r
+        x = ::abs(x);\r
+        y = ::abs(y);\r
  \r
-            // 0 - the pixel can not belong to an edge\r
-            // 1 - the pixel might belong to an edge\r
-            // 2 - the pixel does belong to an edge\r
-            int edge_type = 0;\r
+        // 0 - the pixel can not belong to an edge\r
+        // 1 - the pixel might belong to an edge\r
+        // 2 - the pixel does belong to an edge\r
+        int edge_type = 0;\r
  \r
-            if (m > low_thresh)\r
-            {\r
-                const int tg22x = x * TG22;\r
-                const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);\r
+        if (m > low_thresh)\r
+        {\r
+            const int tg22x = x * TG22;\r
+            const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);\r
  \r
-                y <<= CANNY_SHIFT;\r
+            y <<= CANNY_SHIFT;\r
  \r
-                if (y < tg22x)\r
-                {\r
-                    if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])\r
-                        edge_type = 1 + (int)(m > high_thresh);\r
-                }\r
-                else if( y > tg67x )\r
-                {\r
-                    if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])\r
-                        edge_type = 1 + (int)(m > high_thresh);\r
-                }\r
-                else\r
-                {\r
-                    if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])\r
-                        edge_type = 1 + (int)(m > high_thresh);\r
-                }\r
+            if (y < tg22x)\r
+            {\r
+                if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])\r
+                    edge_type = 1 + (int)(m > high_thresh);\r
+            }\r
+            else if( y > tg67x )\r
+            {\r
+                if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])\r
+                    edge_type = 1 + (int)(m > high_thresh);\r
+            }\r
+            else\r
+            {\r
+                if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])\r
+                    edge_type = 1 + (int)(m > high_thresh);\r
              }\r
-            \r
-            map.ptr(i + 1)[j + 1] = edge_type;\r
          }\r
+        \r
+        map.ptr(i + 1)[j + 1] = edge_type;\r
      }\r
+}\r
  \r
  #undef CANNY_SHIFT\r
  #undef TG22\r
  \r
-    void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)\r
-    {\r
-        dim3 block(16, 16, 1);\r
-        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)\r
+{\r
+    dim3 block(16, 16, 1);\r
+    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
  \r
-        calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        cudaSafeCall(cudaThreadSynchronize());\r
-    }\r
+    cudaSafeCall(cudaThreadSynchronize());\r
+}\r
  \r
  //////////////////////////////////////////////////////////////////////////////////////////\r
  \r
-    __device__ unsigned int counter = 0;\r
-\r
-    __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)\r
-    {\r
-        #if __CUDA_ARCH__ >= 120\r
-\r
-        __shared__ int smem[18][18];\r
+__device__ unsigned int counter = 0;\r
  \r
-        const int j = blockIdx.x * 16 + threadIdx.x;\r
-        const int i = blockIdx.y * 16 + threadIdx.y;\r
+__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)\r
+{\r
+    #if __CUDA_ARCH__ >= 120\r
  \r
-        const int tid = threadIdx.y * 16 + threadIdx.x;\r
-        const int lx = tid % 18;\r
-        const int ly = tid / 18; \r
+    __shared__ int smem[18][18];\r
  \r
-        if (ly < 14)\r
-            smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
+    const int j = blockIdx.x * 16 + threadIdx.x;\r
+    const int i = blockIdx.y * 16 + threadIdx.y;\r
  \r
-        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)\r
-            smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];\r
+    const int tid = threadIdx.y * 16 + threadIdx.x;\r
+    const int lx = tid % 18;\r
+    const int ly = tid / 18; \r
  \r
-        __syncthreads();\r
+    if (ly < 14)\r
+        smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];\r
  \r
-        if (i < rows && j < cols)\r
-        {\r
-            int n;\r
+    if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)\r
+        smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];\r
  \r
-            #pragma unroll\r
-            for (int k = 0; k < 16; ++k)\r
-            {\r
-                n = 0;\r
+    __syncthreads();\r
  \r
-                if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)\r
-                {\r
-                    n += smem[threadIdx.y    ][threadIdx.x    ] == 2;\r
-                    n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;\r
-                    n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;\r
-                    \r
-                    n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;\r
-                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;\r
-                    \r
-                    n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;\r
-                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;\r
-                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;\r
-                }\r
-\r
-                if (n > 0)\r
-                    smem[threadIdx.y + 1][threadIdx.x + 1] = 2;\r
-            }\r
-\r
-            const int e = smem[threadIdx.y + 1][threadIdx.x + 1];\r
-\r
-            map.ptr(i + 1)[j + 1] = e;\r
+    if (i < rows && j < cols)\r
+    {\r
+        int n;\r
  \r
+        #pragma unroll\r
+        for (int k = 0; k < 16; ++k)\r
+        {\r
              n = 0;\r
  \r
-            if (e == 2)\r
+            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)\r
              {\r
-                n += smem[threadIdx.y    ][threadIdx.x    ] == 1;\r
-                n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;\r
-                n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;\r
+                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;\r
+                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;\r
+                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;\r
                  \r
-                n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;\r
-                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;\r
+                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;\r
+                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;\r
                  \r
-                n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;\r
-                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;\r
-                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;\r
+                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;\r
+                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;\r
+                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;\r
              }\r
  \r
              if (n > 0)\r
-            {\r
-                const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));\r
-                st[ind] = make_ushort2(j + 1, i + 1);\r
-            }\r
+                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;\r
          }\r
  \r
-        #endif\r
-    }\r
+        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];\r
  \r
-    void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)\r
-    {\r
-        dim3 block(16, 16, 1);\r
-        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+        map.ptr(i + 1)[j + 1] = e;\r
  \r
-        edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);\r
-        cudaSafeCall( cudaGetLastError() );\r
+        n = 0;\r
  \r
-        cudaSafeCall(cudaThreadSynchronize());\r
+        if (e == 2)\r
+        {\r
+            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;\r
+            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;\r
+            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;\r
+            \r
+            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;\r
+            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;\r
+            \r
+            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;\r
+            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;\r
+            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;\r
+        }\r
+\r
+        if (n > 0)\r
+        {\r
+            const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));\r
+            st[ind] = make_ushort2(j + 1, i + 1);\r
+        }\r
      }\r
  \r
-    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};\r
-    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};\r
+    #endif\r
+}\r
  \r
-    __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)\r
-    {\r
-        #if __CUDA_ARCH__ >= 120\r
+void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)\r
+{\r
+    dim3 block(16, 16, 1);\r
+    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
  \r
-        const int stack_size = 512;\r
-        \r
-        __shared__ unsigned int s_counter;\r
-        __shared__ unsigned int s_ind;\r
-        __shared__ ushort2 s_st[stack_size];\r
+    edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (threadIdx.x == 0)\r
-            s_counter = 0;\r
-        __syncthreads();\r
+    cudaSafeCall(cudaThreadSynchronize());\r
+}\r
+\r
+__constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};\r
+__constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};\r
+\r
+__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)\r
+{\r
+    #if __CUDA_ARCH__ >= 120\r
+\r
+    const int stack_size = 512;\r
+    \r
+    __shared__ unsigned int s_counter;\r
+    __shared__ unsigned int s_ind;\r
+    __shared__ ushort2 s_st[stack_size];\r
+\r
+    if (threadIdx.x == 0)\r
+        s_counter = 0;\r
+    __syncthreads();\r
  \r
-        int ind = blockIdx.y * gridDim.x + blockIdx.x;\r
+    int ind = blockIdx.y * gridDim.x + blockIdx.x;\r
  \r
-        if (ind < count)\r
+    if (ind < count)\r
+    {\r
+        ushort2 pos = st1[ind];\r
+\r
+        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)\r
          {\r
-            ushort2 pos = st1[ind];\r
+            if (threadIdx.x < 8)\r
+            {\r
+                pos.x += c_dx[threadIdx.x];\r
+                pos.y += c_dy[threadIdx.x];\r
+\r
+                if (map.ptr(pos.y)[pos.x] == 1)\r
+                {\r
+                    map.ptr(pos.y)[pos.x] = 2;\r
+\r
+                    ind = atomicInc(&s_counter, (unsigned int)(-1));\r
+\r
+                    s_st[ind] = pos;\r
+                }\r
+            }\r
+            __syncthreads();\r
  \r
-            if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)\r
+            while (s_counter > 0 && s_counter <= stack_size - blockDim.x)\r
              {\r
-                if (threadIdx.x < 8)\r
+                const int subTaskIdx = threadIdx.x >> 3;\r
+                const int portion = ::min(s_counter, blockDim.x >> 3);\r
+\r
+                pos.x = pos.y = 0;\r
+\r
+                if (subTaskIdx < portion)\r
+                    pos = s_st[s_counter - 1 - subTaskIdx];\r
+                __syncthreads();\r
+                    \r
+                if (threadIdx.x == 0)\r
+                    s_counter -= portion;\r
+                __syncthreads();\r
+                 \r
+                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)\r
                  {\r
-                    pos.x += c_dx[threadIdx.x];\r
-                    pos.y += c_dy[threadIdx.x];\r
+                    pos.x += c_dx[threadIdx.x & 7];\r
+                    pos.y += c_dy[threadIdx.x & 7];\r
  \r
                      if (map.ptr(pos.y)[pos.x] == 1)\r
                      {\r
@@ -388,103 +419,75 @@ namespace cv { namespace gpu { namespace canny
                      }\r
                  }\r
                  __syncthreads();\r
+            }\r
  \r
-                while (s_counter > 0 && s_counter <= stack_size - blockDim.x)\r
+            if (s_counter > 0)\r
+            {\r
+                if (threadIdx.x == 0)\r
                  {\r
-                    const int subTaskIdx = threadIdx.x >> 3;\r
-                    const int portion = min(s_counter, blockDim.x >> 3);\r
-\r
-                    pos.x = pos.y = 0;\r
-\r
-                    if (subTaskIdx < portion)\r
-                        pos = s_st[s_counter - 1 - subTaskIdx];\r
-                    __syncthreads();\r
-                        \r
-                    if (threadIdx.x == 0)\r
-                        s_counter -= portion;\r
-                    __syncthreads();\r
-                     \r
-                    if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)\r
-                    {\r
-                        pos.x += c_dx[threadIdx.x & 7];\r
-                        pos.y += c_dy[threadIdx.x & 7];\r
-\r
-                        if (map.ptr(pos.y)[pos.x] == 1)\r
-                        {\r
-                            map.ptr(pos.y)[pos.x] = 2;\r
-\r
-                            ind = atomicInc(&s_counter, (unsigned int)(-1));\r
-\r
-                            s_st[ind] = pos;\r
-                        }\r
-                    }\r
-                    __syncthreads();\r
+                    ind = atomicAdd(&counter, s_counter);\r
+                    s_ind = ind - s_counter;\r
                  }\r
+                __syncthreads();\r
  \r
-                if (s_counter > 0)\r
-                {\r
-                    if (threadIdx.x == 0)\r
-                    {\r
-                        ind = atomicAdd(&counter, s_counter);\r
-                        s_ind = ind - s_counter;\r
-                    }\r
-                    __syncthreads();\r
-\r
-                    ind = s_ind;\r
+                ind = s_ind;\r
  \r
-                    for (int i = threadIdx.x; i < s_counter; i += blockDim.x)\r
-                    {\r
-                        st2[ind + i] = s_st[i];\r
-                    }\r
+                for (int i = threadIdx.x; i < s_counter; i += blockDim.x)\r
+                {\r
+                    st2[ind + i] = s_st[i];\r
                  }\r
              }\r
          }\r
-\r
-        #endif\r
      }\r
  \r
-    void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)\r
-    {\r
-        void* counter_ptr;\r
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, "cv::gpu::canny::counter") );\r
-        \r
-        unsigned int count;\r
-        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
+    #endif\r
+}\r
  \r
-        while (count > 0)\r
-        {\r
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );\r
+void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)\r
+{\r
+    void* counter_ptr;\r
+    cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );\r
+    \r
+    unsigned int count;\r
+    cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
  \r
-            dim3 block(128, 1, 1);\r
-            dim3 grid(min(count, 65535u), divUp(count, 65535), 1);\r
-            edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);\r
-            cudaSafeCall( cudaGetLastError() );\r
+    while (count > 0)\r
+    {\r
+        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );\r
  \r
-            cudaSafeCall(cudaThreadSynchronize());\r
+        dim3 block(128, 1, 1);\r
+        dim3 grid(min(count, 65535u), divUp(count, 65535), 1);\r
+        edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);\r
+        cudaSafeCall( cudaGetLastError() );\r
  \r
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
+        cudaSafeCall(cudaThreadSynchronize());\r
  \r
-            std::swap(st1, st2);\r
-        }\r
+        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );\r
+\r
+        std::swap(st1, st2);\r
      }\r
+}\r
  \r
-    __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)\r
-    {\r
-        const int j = blockIdx.x * 16 + threadIdx.x;\r
-        const int i = blockIdx.y * 16 + threadIdx.y;\r
+__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)\r
+{\r
+    const int j = blockIdx.x * 16 + threadIdx.x;\r
+    const int i = blockIdx.y * 16 + threadIdx.y;\r
  \r
-        if (i < rows && j < cols)\r
-            dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));\r
-    }\r
+    if (i < rows && j < cols)\r
+        dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));\r
+}\r
  \r
-    void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)\r
-    {\r
-        dim3 block(16, 16, 1);\r
-        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
+void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)\r
+{\r
+    dim3 block(16, 16, 1);\r
+    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);\r
  \r
-        getEdges<<<grid, block>>>(map, dst, rows, cols);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    getEdges<<<grid, block>>>(map, dst, rows, cols);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        cudaSafeCall(cudaThreadSynchronize());\r
-    }\r
-}}}\r
+    cudaSafeCall(cudaThreadSynchronize());\r
+}\r
+\r
+} // namespace canny\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu

index dfa7429..4da3f77 100644 (file)
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@@ -44,336 +44,337 @@
  #include "opencv2/gpu/device/transform.hpp"\r
  #include "opencv2/gpu/device/color.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template <> struct TransformFunctorTraits<bgra_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_rgba_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_x = 8 };\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<bgra_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<bgra_to_bgr555_traits::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<rgba_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<rgba_to_bgr555_traits::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<bgra_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<bgra_to_bgr565_traits::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<rgba_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<rgba_to_bgr565_traits::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<bgr555_to_bgra_traits::functor_type> : DefaultTransformFunctorTraits<bgr555_to_bgra_traits::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<bgr555_to_rgba_traits::functor_type> : DefaultTransformFunctorTraits<bgr555_to_rgba_traits::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<bgr565_to_bgra_traits::functor_type> : DefaultTransformFunctorTraits<bgr565_to_bgra_traits::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<bgr565_to_rgba_traits::functor_type> : DefaultTransformFunctorTraits<bgr565_to_rgba_traits::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<gray_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<gray_to_bgra_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<gray_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<gray_to_bgr555_traits::functor_type>\r
-    {\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<gray_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<gray_to_bgr565_traits::functor_type>\r
-    {\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<bgra_to_yuv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_yuv4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<rgba_to_yuv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_yuv4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<yuv4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<yuv4_to_bgra_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<yuv4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<yuv4_to_rgba_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<bgra_to_YCrCb4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_YCrCb4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<rgba_to_YCrCb4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_YCrCb4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<YCrCb4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<YCrCb4_to_bgra_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<YCrCb4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<YCrCb4_to_rgba_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };    \r
-\r
-    template <> struct TransformFunctorTraits<bgra_to_xyz4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_xyz4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<rgba_to_xyz4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_xyz4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<xyz4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<xyz4_to_bgra_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<xyz4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<xyz4_to_rgba_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<bgra_to_hsv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_hsv4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<rgba_to_hsv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_hsv4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<hsv4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<hsv4_to_bgra_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<hsv4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<hsv4_to_rgba_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<bgra_to_hls4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_hls4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<rgba_to_hls4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_hls4_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<hls4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<hls4_to_bgra_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits<hls4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<hls4_to_rgba_traits<uchar>::functor_type>\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \\r
-        void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \\r
-        { \\r
-            traits::functor_type functor = traits::create_functor(); \\r
-            typedef typename traits::functor_type::argument_type src_t; \\r
-            typedef typename traits::functor_type::result_type   dst_t; \\r
-            transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \\r
-        }\r
-\r
-    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \\r
-        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)\r
-\r
-    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \\r
-        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \\r
-        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \\r
-        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)\r
-\r
-    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \\r
-        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \\r
-        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \\r
-        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \\r
-        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)\r
-\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)\r
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)\r
-\r
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR\r
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE\r
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL\r
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F\r
-}}}\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_x = 8 };\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)\r
+{\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)\r
+{\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};    \r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \\r
+    void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \\r
+    { \\r
+        traits::functor_type functor = traits::create_functor(); \\r
+        typedef typename traits::functor_type::argument_type src_t; \\r
+        typedef typename traits::functor_type::result_type   dst_t; \\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \\r
+    }\r
+\r
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \\r
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)\r
+\r
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \\r
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \\r
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \\r
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)\r
+\r
+#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \\r
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \\r
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \\r
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \\r
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)\r
+\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)\r
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)\r
+\r
+#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR\r
+#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE\r
+#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL\r
+#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu

index 29b828e..c16ca82 100644 (file)
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
@@ -47,8 +47,7 @@
  #include "opencv2/gpu/device/limits.hpp"\r
  #include "opencv2/gpu/device/border_interpolate.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
  #define MAX_KERNEL_SIZE 16\r
  #define BLOCK_DIM_X 16\r
@@ -56,195 +55,195 @@ using namespace cv::gpu::device;
  #define RESULT_STEPS 8\r
  #define HALO_STEPS 1\r
  \r
-namespace filter_column\r
+namespace column_filter {\r
+\r
+__constant__ float c_kernel[MAX_KERNEL_SIZE];\r
+\r
+void loadKernel(const float kernel[], int ksize)\r
  {\r
-    __constant__ float c_kernel[MAX_KERNEL_SIZE];\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
+}\r
  \r
-    void loadKernel(const float kernel[], int ksize)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
-    }\r
+template <int KERNEL_SIZE, typename T, typename D, typename B>\r
+__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)\r
+{\r
+    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
  \r
-    template <int KERNEL_SIZE, typename T, typename D, typename B>\r
-    __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)\r
-    {\r
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
+    __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];\r
  \r
-        __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];\r
+    //Offset to the upper halo edge\r
+    const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;\r
+    const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;\r
  \r
-        //Offset to the upper halo edge\r
-        const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;\r
-        const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;\r
+    if (x < src.cols)\r
+    {\r
+        const T* src_col = src.ptr() + x;\r
  \r
-        if (x < src.cols)\r
-        {\r
-            const T* src_col = src.ptr() + x;\r
+        //Main data\r
+        #pragma unroll\r
+        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
+            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);\r
  \r
-            //Main data\r
-            #pragma unroll\r
-            for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
-                smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);\r
+        //Upper halo\r
+        #pragma unroll\r
+        for(int i = 0; i < HALO_STEPS; ++i)\r
+            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);\r
  \r
-            //Upper halo\r
-            #pragma unroll\r
-            for(int i = 0; i < HALO_STEPS; ++i)\r
-                smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);\r
+        //Lower halo\r
+        #pragma unroll\r
+        for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)\r
+            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]=  b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);\r
  \r
-            //Lower halo\r
-            #pragma unroll\r
-            for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)\r
-                smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]=  b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);\r
+        __syncthreads();\r
  \r
-            __syncthreads();\r
+        #pragma unroll\r
+        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
+        {\r
+            sum_t sum = VecTraits<sum_t>::all(0);\r
  \r
              #pragma unroll\r
-            for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
-            {\r
-                sum_t sum = VecTraits<sum_t>::all(0);\r
-\r
-                #pragma unroll\r
-                for(int j = 0; j < KERNEL_SIZE; ++j)\r
-                    sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];\r
+            for(int j = 0; j < KERNEL_SIZE; ++j)\r
+                sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];\r
  \r
-                int dstY = y + i * BLOCK_DIM_Y;\r
+            int dstY = y + i * BLOCK_DIM_Y;\r
  \r
-                if (dstY < src.rows)\r
-                    dst.ptr(dstY)[x] = saturate_cast<D>(sum);\r
-            }\r
+            if (dstY < src.rows)\r
+                dst.ptr(dstY)[x] = saturate_cast<D>(sum);\r
          }\r
      }\r
  }\r
  \r
-namespace cv { namespace gpu { namespace filters\r
-{\r
-    template <int ksize, typename T, typename D, template<typename> class B>\r
-    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)\r
-    {        \r
-        const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
-        const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));\r
+template <int ksize, typename T, typename D, template<typename> class B>\r
+void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)\r
+{        \r
+    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
+    const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));\r
  \r
-        B<T> b(src.rows);\r
+    B<T> b(src.rows);\r
  \r
-        filter_column::linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <typename T, typename D>\r
-    void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)\r
+template <typename T, typename D>\r
+void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);\r
+    static const caller_t callers[5][17] = \r
      {\r
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);\r
-        static const caller_t callers[5][17] = \r
          {\r
-            {\r
-                0, \r
-                linearColumnFilter_caller<1 , T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<2 , T, D, BrdColReflect101>,\r
-                linearColumnFilter_caller<3 , T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<4 , T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<5 , T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<6 , T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<7 , T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<8 , T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<9 , T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<10, T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<11, T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<12, T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<13, T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<14, T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<15, T, D, BrdColReflect101>, \r
-                linearColumnFilter_caller<16, T, D, BrdColReflect101> \r
-            },\r
-            {\r
-                0, \r
-                linearColumnFilter_caller<1 , T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<2 , T, D, BrdColReplicate>,\r
-                linearColumnFilter_caller<3 , T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<4 , T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<5 , T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<6 , T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<7 , T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<8 , T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<9 , T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<10, T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<11, T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<12, T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<13, T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<14, T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<15, T, D, BrdColReplicate>, \r
-                linearColumnFilter_caller<16, T, D, BrdColReplicate>\r
-            },\r
-            {\r
-                0, \r
-                linearColumnFilter_caller<1 , T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<2 , T, D, BrdColConstant>,\r
-                linearColumnFilter_caller<3 , T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<4 , T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<5 , T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<6 , T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<7 , T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<8 , T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<9 , T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<10, T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<11, T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<12, T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<13, T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<14, T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<15, T, D, BrdColConstant>, \r
-                linearColumnFilter_caller<16, T, D, BrdColConstant> \r
-            },\r
-            {\r
-                0, \r
-                linearColumnFilter_caller<1 , T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<2 , T, D, BrdColReflect>,\r
-                linearColumnFilter_caller<3 , T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<4 , T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<5 , T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<6 , T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<7 , T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<8 , T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<9 , T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<10, T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<11, T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<12, T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<13, T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<14, T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<15, T, D, BrdColReflect>, \r
-                linearColumnFilter_caller<16, T, D, BrdColReflect>\r
-            },\r
-            {\r
-                0, \r
-                linearColumnFilter_caller<1 , T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<2 , T, D, BrdColWrap>,\r
-                linearColumnFilter_caller<3 , T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<4 , T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<5 , T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<6 , T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<7 , T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<8 , T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<9 , T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<10, T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<11, T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<12, T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<13, T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<14, T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<15, T, D, BrdColWrap>, \r
-                linearColumnFilter_caller<16, T, D, BrdColWrap>,\r
-            }\r
-        };\r
-        \r
-        filter_column::loadKernel(kernel, ksize);\r
-\r
-        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);\r
-    }\r
+            0, \r
+            linearColumnFilter_caller<1 , T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<2 , T, D, BrdColReflect101>,\r
+            linearColumnFilter_caller<3 , T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<4 , T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<5 , T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<6 , T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<7 , T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<8 , T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<9 , T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<10, T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<11, T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<12, T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<13, T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<14, T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<15, T, D, BrdColReflect101>, \r
+            linearColumnFilter_caller<16, T, D, BrdColReflect101> \r
+        },\r
+        {\r
+            0, \r
+            linearColumnFilter_caller<1 , T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<2 , T, D, BrdColReplicate>,\r
+            linearColumnFilter_caller<3 , T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<4 , T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<5 , T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<6 , T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<7 , T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<8 , T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<9 , T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<10, T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<11, T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<12, T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<13, T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<14, T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<15, T, D, BrdColReplicate>, \r
+            linearColumnFilter_caller<16, T, D, BrdColReplicate>\r
+        },\r
+        {\r
+            0, \r
+            linearColumnFilter_caller<1 , T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<2 , T, D, BrdColConstant>,\r
+            linearColumnFilter_caller<3 , T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<4 , T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<5 , T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<6 , T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<7 , T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<8 , T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<9 , T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<10, T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<11, T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<12, T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<13, T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<14, T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<15, T, D, BrdColConstant>, \r
+            linearColumnFilter_caller<16, T, D, BrdColConstant> \r
+        },\r
+        {\r
+            0, \r
+            linearColumnFilter_caller<1 , T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<2 , T, D, BrdColReflect>,\r
+            linearColumnFilter_caller<3 , T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<4 , T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<5 , T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<6 , T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<7 , T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<8 , T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<9 , T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<10, T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<11, T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<12, T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<13, T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<14, T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<15, T, D, BrdColReflect>, \r
+            linearColumnFilter_caller<16, T, D, BrdColReflect>\r
+        },\r
+        {\r
+            0, \r
+            linearColumnFilter_caller<1 , T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<2 , T, D, BrdColWrap>,\r
+            linearColumnFilter_caller<3 , T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<4 , T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<5 , T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<6 , T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<7 , T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<8 , T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<9 , T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<10, T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<11, T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<12, T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<13, T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<14, T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<15, T, D, BrdColWrap>, \r
+            linearColumnFilter_caller<16, T, D, BrdColWrap>,\r
+        }\r
+    };\r
+    \r
+    loadKernel(kernel, ksize);\r
+\r
+    callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);\r
+}\r
+\r
+template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+\r
+} // namespace column_filter\r
  \r
-    template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    //template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    //template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/copy_make_border.cu b/modules/gpu/src/cuda/copy_make_border.cu

index 89af4e0..5603742 100644 (file)
--- a/modules/gpu/src/cuda/copy_make_border.cu
+++ b/modules/gpu/src/cuda/copy_make_border.cu
@@ -43,85 +43,87 @@
  #include "internal_shared.hpp"\r
  #include "opencv2/gpu/device/border_interpolate.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+namespace copy_make_border {\r
+\r
+template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)\r
  {\r
-    template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)\r
-    {\r
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+    if (x < dst.cols && y < dst.rows)\r
+        dst.ptr(y)[x] = src(y - top, x - left);\r
+}\r
+\r
+template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher\r
+{\r
+    static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left, \r
+        const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)\r
+    {        \r
+        dim3 block(32, 8);\r
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+        B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));\r
+        BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);\r
  \r
-        if (x < dst.cols && y < dst.rows)\r
-            dst.ptr(y)[x] = src(y - top, x - left);\r
+        copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);\r
+        cudaSafeCall( cudaGetLastError() );\r
+\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
      }\r
+};\r
  \r
-    template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher\r
+template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, \r
+    const T* borderValue, cudaStream_t stream)\r
+{\r
+    typedef typename TypeVec<T, cn>::vec_type vec_type;\r
+\r
+    typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);\r
+\r
+    static const caller_t callers[5] = \r
      {\r
-        static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left, \r
-            const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)\r
-        {        \r
-            dim3 block(32, 8);\r
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+        CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call, \r
+        CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call, \r
+        CopyMakeBorderDispatcher<BrdConstant, vec_type>::call, \r
+        CopyMakeBorderDispatcher<BrdReflect, vec_type>::call, \r
+        CopyMakeBorderDispatcher<BrdWrap, vec_type>::call \r
+    };\r
  \r
-            B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));\r
-            BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);\r
+    callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);\r
+}\r
  \r
-            copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);\r
-            cudaSafeCall( cudaGetLastError() );\r
+template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
  \r
-            if (stream == 0)\r
-                cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
+//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
  \r
-    template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, \r
-        const T* borderValue, cudaStream_t stream)\r
-    {\r
-        typedef typename TypeVec<T, cn>::vec_type vec_type;\r
+template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
  \r
-        typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
  \r
-        static const caller_t callers[5] = \r
-        {\r
-            CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call, \r
-            CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call, \r
-            CopyMakeBorderDispatcher<BrdConstant, vec_type>::call, \r
-            CopyMakeBorderDispatcher<BrdReflect, vec_type>::call, \r
-            CopyMakeBorderDispatcher<BrdWrap, vec_type>::call \r
-        };\r
+//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
  \r
-        callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);\r
-    }\r
+template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
+//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
+template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
+\r
+} // namespace copy_make_border\r
  \r
-    template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
-    template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
-    template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);\r
-    \r
-    //template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);\r
-    \r
-    template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
-    template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
-    template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);\r
-    \r
-    template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
-    template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
-    template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);\r
-    \r
-    //template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);\r
-    \r
-    template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
-    //template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
-    template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
-    template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu

index cd72468..7920cc8 100644 (file)
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -40,2040 +40,2041 @@
  //\r
  //M*/\r
  \r
+#include "internal_shared.hpp"\r
  #include "opencv2/gpu/device/functional.hpp"\r
  #include "opencv2/gpu/device/vec_math.hpp"\r
  #include "opencv2/gpu/device/transform.hpp"\r
  #include "opencv2/gpu/device/limits.hpp"\r
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
-#include "internal_shared.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // add\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    template <typename T, typename D> struct Add : binary_function<T, T, D>\r
-    {\r
-        __device__ __forceinline__ D operator ()(T a, T b) const\r
-        {\r
-            return saturate_cast<D>(a + b);\r
-        }\r
-    };\r
+//////////////////////////////////////////////////////////////////////////\r
+// add\r
  \r
-    template <> struct TransformFunctorTraits< Add<ushort, ushort> > : DefaultTransformFunctorTraits< Add<ushort, ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Add<short, short> > : DefaultTransformFunctorTraits< Add<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Add<int, int> > : DefaultTransformFunctorTraits< Add<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Add<float, float> > : DefaultTransformFunctorTraits< Add<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <typename T, typename D> void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+template <typename T, typename D> struct Add : binary_function<T, T, D>\r
+{\r
+    __device__ __forceinline__ D operator ()(T a, T b) const\r
      {\r
-        if (mask.data)\r
-            transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, mask, Add<T, D>(), stream);\r
-        else\r
-            transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, Add<T, D>(), stream);\r
+        return saturate_cast<D>(a + b);\r
      }\r
+};\r
  \r
-    template void add_gpu<uchar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<uchar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<schar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<ushort, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<ushort, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<ushort, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<ushort, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<ushort, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<short, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<short, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<short, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<short, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<short, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<int, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<int, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<int, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<int, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<int, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<float, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<float, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<float, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<float, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<float, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<double, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    template <typename T, typename D> struct AddScalar : unary_function<T, D>\r
-    {\r
-        AddScalar(double val_) : val(val_) {}\r
-        __device__ __forceinline__ D operator ()(T a) const\r
-        {\r
-            return saturate_cast<D>(a + val);\r
-        }\r
-        const double val;\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits< AddScalar<ushort, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, ushort>  >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddScalar<short, short> > : DefaultTransformFunctorTraits< AddScalar<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddScalar<int, int> > : DefaultTransformFunctorTraits< AddScalar<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddScalar<float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+template <> struct TransformFunctorTraits< Add<ushort, ushort> > : DefaultTransformFunctorTraits< Add<ushort, ushort> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Add<short, short> > : DefaultTransformFunctorTraits< Add<short, short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Add<int, int> > : DefaultTransformFunctorTraits< Add<int, int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Add<float, float> > : DefaultTransformFunctorTraits< Add<float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    template <typename T, typename D> void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+template <typename T, typename D> void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, mask, Add<T, D>(), stream);\r
+    else\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, Add<T, D>(), stream);\r
+}\r
+\r
+template void add_gpu<uchar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<uchar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<schar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<ushort, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<ushort, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<ushort, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<ushort, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<ushort, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<short, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<short, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<short, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<short, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<short, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<int, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<int, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<int, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<int, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<int, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<float, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<float, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<float, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<float, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<float, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<double, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+template <typename T, typename D> struct AddScalar : unary_function<T, D>\r
+{\r
+    AddScalar(double val_) : val(val_) {}\r
+    __device__ __forceinline__ D operator ()(T a) const\r
      {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
-        AddScalar<T, D> op(val);\r
-        if (mask.data)\r
-            transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, mask, op, stream);\r
-        else\r
-            transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
+        return saturate_cast<D>(a + val);\r
      }\r
+    const double val;\r
+};\r
  \r
-    template void add_gpu<uchar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<uchar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<schar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<ushort, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<ushort, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<ushort, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<ushort, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<ushort, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<short, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<short, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<short, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<short, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<short, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<int, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<int, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<int, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<int, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<int, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<float, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<float, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<float, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<float, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<float, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void add_gpu<double, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void add_gpu<double, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void add_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // subtract\r
-\r
-    template <typename T, typename D> struct Subtract : binary_function<T, T, D>\r
-    {\r
-        __device__ __forceinline__ D operator ()(T a, T b) const\r
-        {\r
-            return saturate_cast<D>(a - b);\r
-        }\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits< Subtract<ushort, ushort> > : DefaultTransformFunctorTraits< Subtract<ushort, ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Subtract<short, short> > : DefaultTransformFunctorTraits< Subtract<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Subtract<int, int> > : DefaultTransformFunctorTraits< Subtract<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Subtract<float, float> > : DefaultTransformFunctorTraits< Subtract<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+template <> struct TransformFunctorTraits< AddScalar<ushort, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, ushort>  >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< AddScalar<short, short> > : DefaultTransformFunctorTraits< AddScalar<short, short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< AddScalar<int, int> > : DefaultTransformFunctorTraits< AddScalar<int, int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< AddScalar<float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    template <typename T, typename D> void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+template <typename T, typename D> void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+{\r
+    cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
+    AddScalar<T, D> op(val);\r
+    if (mask.data)\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, mask, op, stream);\r
+    else\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
+}\r
+\r
+template void add_gpu<uchar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<uchar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<schar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<ushort, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<ushort, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<ushort, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<ushort, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<ushort, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<short, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<short, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<short, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<short, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<short, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<int, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<int, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<int, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<int, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<int, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<float, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<float, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<float, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<float, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<float, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void add_gpu<double, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void add_gpu<double, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void add_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//////////////////////////////////////////////////////////////////////////\r
+// subtract\r
+\r
+template <typename T, typename D> struct Subtract : binary_function<T, T, D>\r
+{\r
+    __device__ __forceinline__ D operator ()(T a, T b) const\r
      {\r
-        if (mask.data)\r
-            transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, mask, Subtract<T, D>(), stream);\r
-        else\r
-            transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, Subtract<T, D>(), stream);\r
+        return saturate_cast<D>(a - b);\r
      }\r
+};\r
  \r
-    template void subtract_gpu<uchar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<uchar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<schar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<ushort, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<ushort, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<ushort, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<ushort, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<ushort, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<short, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<short, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<short, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<short, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<short, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<int, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<int, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<int, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<int, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<int, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<float, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<float, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<float, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<float, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<float, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<double, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    template <typename T, typename D> struct SubtractScalar : unary_function<T, D>\r
-    {\r
-        SubtractScalar(double val_) : val(val_) {}\r
-        __device__ __forceinline__ D operator ()(T a) const\r
-        {\r
-            return saturate_cast<D>(a - val);\r
-        }\r
-        const double val;\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits< SubtractScalar<ushort, ushort> > : DefaultTransformFunctorTraits< SubtractScalar<ushort, ushort>  >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< SubtractScalar<short, short> > : DefaultTransformFunctorTraits< SubtractScalar<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< SubtractScalar<int, int> > : DefaultTransformFunctorTraits< SubtractScalar<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< SubtractScalar<float, float> > : DefaultTransformFunctorTraits< SubtractScalar<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+template <> struct TransformFunctorTraits< Subtract<ushort, ushort> > : DefaultTransformFunctorTraits< Subtract<ushort, ushort> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Subtract<short, short> > : DefaultTransformFunctorTraits< Subtract<short, short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Subtract<int, int> > : DefaultTransformFunctorTraits< Subtract<int, int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Subtract<float, float> > : DefaultTransformFunctorTraits< Subtract<float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    template <typename T, typename D> void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+template <typename T, typename D> void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+{\r
+    if (mask.data)\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, mask, Subtract<T, D>(), stream);\r
+    else\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, Subtract<T, D>(), stream);\r
+}\r
+\r
+template void subtract_gpu<uchar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<uchar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<schar, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<ushort, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<ushort, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<ushort, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<ushort, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<ushort, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<short, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<short, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<short, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<short, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<short, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<int, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<int, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<int, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<int, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<int, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<float, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<float, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<float, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<float, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<float, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<double, uchar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, schar>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, short>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, int>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, float>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+template <typename T, typename D> struct SubtractScalar : unary_function<T, D>\r
+{\r
+    SubtractScalar(double val_) : val(val_) {}\r
+    __device__ __forceinline__ D operator ()(T a) const\r
      {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
-        SubtractScalar<T, D> op(val);\r
-        if (mask.data)\r
-            transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, mask, op, stream);\r
-        else\r
-            transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
+        return saturate_cast<D>(a - val);\r
      }\r
+    const double val;\r
+};\r
  \r
-    template void subtract_gpu<uchar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<uchar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<schar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<ushort, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<ushort, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<ushort, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<ushort, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<ushort, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<short, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<short, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<short, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<short, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<short, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<int, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<int, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<int, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<int, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<int, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<float, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<float, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<float, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<float, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<float, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //template void subtract_gpu<double, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    //template void subtract_gpu<double, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-    template void subtract_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // multiply\r
-\r
-    struct multiply_8uc4_32f : binary_function<uint, float, uint>\r
-    {\r
-        __device__ __forceinline__ uint operator ()(uint a, float b) const\r
-        {\r
-            uint res = 0;\r
-\r
-            res |= (saturate_cast<uchar>((0xffu & (a      )) * b)      );\r
-            res |= (saturate_cast<uchar>((0xffu & (a >>  8)) * b) <<  8);\r
-            res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);\r
-            res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);\r
-\r
-            return res;\r
-        }\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<multiply_8uc4_32f> : DefaultTransformFunctorTraits<multiply_8uc4_32f>\r
-    {\r
-        enum { smart_block_dim_x = 8 };\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 8 };\r
-    };\r
-\r
-    void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
-    {\r
-        transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);\r
-    }\r
+template <> struct TransformFunctorTraits< SubtractScalar<ushort, ushort> > : DefaultTransformFunctorTraits< SubtractScalar<ushort, ushort>  >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< SubtractScalar<short, short> > : DefaultTransformFunctorTraits< SubtractScalar<short, short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< SubtractScalar<int, int> > : DefaultTransformFunctorTraits< SubtractScalar<int, int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< SubtractScalar<float, float> > : DefaultTransformFunctorTraits< SubtractScalar<float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    struct multiply_16sc4_32f : binary_function<short4, float, short4>\r
+template <typename T, typename D> void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream)\r
+{\r
+    cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
+    SubtractScalar<T, D> op(val);\r
+    if (mask.data)\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, mask, op, stream);\r
+    else\r
+        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
+}\r
+\r
+template void subtract_gpu<uchar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<uchar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<schar, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<ushort, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<ushort, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<ushort, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<ushort, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<ushort, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<short, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<short, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<short, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<short, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<short, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<int, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<int, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<int, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<int, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<int, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<float, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<float, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<float, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<float, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<float, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//template void subtract_gpu<double, uchar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, schar>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, short>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, int>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+//template void subtract_gpu<double, float>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+template void subtract_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+//////////////////////////////////////////////////////////////////////////\r
+// multiply\r
+\r
+struct multiply_8uc4_32f : binary_function<uint, float, uint>\r
+{\r
+    __device__ __forceinline__ uint operator ()(uint a, float b) const\r
      {\r
-        __device__ __forceinline__ short4 operator ()(short4 a, float b) const\r
-        {\r
-            return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),\r
-                               saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));\r
-        }\r
-    };\r
+        uint res = 0;\r
  \r
-    template <> struct TransformFunctorTraits<multiply_16sc4_32f> : DefaultTransformFunctorTraits<multiply_16sc4_32f>\r
-    {\r
-        enum { smart_block_dim_x = 8 };\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 8 };\r
-    };\r
+        res |= (saturate_cast<uchar>((0xffu & (a      )) * b)      );\r
+        res |= (saturate_cast<uchar>((0xffu & (a >>  8)) * b) <<  8);\r
+        res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);\r
+        res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);\r
  \r
-    void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream)\r
-    {\r
-        transform(static_cast< DevMem2D_<short4> >(src1), src2, \r
-                  static_cast< DevMem2D_<short4> >(dst), multiply_16sc4_32f(), stream);\r
+        return res;\r
      }\r
+};\r
  \r
-    template <typename T, typename D> struct Multiply : binary_function<T, T, D>\r
-    {\r
-        Multiply(double scale_) : scale(scale_) {}\r
-        __device__ __forceinline__ D operator ()(T a, T b) const\r
-        {\r
-            return saturate_cast<D>(scale * a * b);\r
-        }\r
-        const double scale;\r
-    };\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f)\r
+{\r
+    enum { smart_block_dim_x = 8 };\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 8 };\r
+};\r
  \r
-    template <> struct TransformFunctorTraits< Multiply<ushort, ushort> > : DefaultTransformFunctorTraits< Multiply<ushort, ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Multiply<short, short> > : DefaultTransformFunctorTraits< Multiply<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Multiply<int, int> > : DefaultTransformFunctorTraits< Multiply<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Multiply<float, float> > : DefaultTransformFunctorTraits< Multiply<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
+{\r
+    transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);\r
+}\r
  \r
-    template <typename T, typename D> void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
+struct multiply_16sc4_32f : binary_function<short4, float, short4>\r
+{\r
+    __device__ __forceinline__ short4 operator ()(short4 a, float b) const\r
      {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
-        Multiply<T, D> op(scale);\r
-        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
+        return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),\r
+                           saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));\r
      }\r
+};\r
  \r
-    template void multiply_gpu<uchar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<uchar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<schar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<ushort, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<ushort, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<ushort, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<ushort, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<ushort, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<short, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<short, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<short, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<short, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<short, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<int, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<int, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<int, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<int, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<int, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<float, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<float, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<float, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<float, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<float, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<double, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    \r
-    template <typename T, typename D> struct MultiplyScalar : unary_function<T, D>\r
-    {\r
-        MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}\r
-        __device__ __forceinline__ D operator ()(T a) const\r
-        {\r
-            return saturate_cast<D>(scale * a * val);\r
-        }\r
-        const double val;\r
-        const double scale;\r
-    };\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f)\r
+{\r
+    enum { smart_block_dim_x = 8 };\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 8 };\r
+};\r
  \r
-    template <> struct TransformFunctorTraits< MultiplyScalar<ushort, ushort> > : DefaultTransformFunctorTraits< MultiplyScalar<ushort, ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< MultiplyScalar<short, short> > : DefaultTransformFunctorTraits< MultiplyScalar<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< MultiplyScalar<int, int> > : DefaultTransformFunctorTraits< MultiplyScalar<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< MultiplyScalar<float, float> > : DefaultTransformFunctorTraits< MultiplyScalar<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream)\r
+{\r
+    transform(static_cast< DevMem2D_<short4> >(src1), src2, \r
+              static_cast< DevMem2D_<short4> >(dst), multiply_16sc4_32f(), stream);\r
+}\r
  \r
-    template <typename T, typename D> void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
+template <typename T, typename D> struct Multiply : binary_function<T, T, D>\r
+{\r
+    Multiply(double scale_) : scale(scale_) {}\r
+    __device__ __forceinline__ D operator ()(T a, T b) const\r
      {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
-        MultiplyScalar<T, D> op(val, scale);\r
-        transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
+        return saturate_cast<D>(scale * a * b);\r
      }\r
+    const double scale;\r
+};\r
  \r
-    template void multiply_gpu<uchar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<uchar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<schar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<ushort, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<ushort, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<ushort, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<ushort, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<ushort, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<short, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<short, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<short, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<short, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<short, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<int, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<int, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<int, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<int, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<int, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<float, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<float, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<float, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<float, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<float, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void multiply_gpu<double, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void multiply_gpu<double, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void multiply_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // divide\r
-\r
-    struct divide_8uc4_32f : binary_function<uchar4, float, uchar4>\r
-    {\r
-        __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const\r
-        {\r
-            return b != 0 ? make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b),\r
-                                        saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b)) \r
-                          : make_uchar4(0,0,0,0);\r
-        }\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<divide_8uc4_32f> : DefaultTransformFunctorTraits<divide_8uc4_32f>\r
-    {\r
-        enum { smart_block_dim_x = 8 };\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 8 };\r
-    };\r
+template <> struct TransformFunctorTraits< Multiply<ushort, ushort> > : DefaultTransformFunctorTraits< Multiply<ushort, ushort> >\r
+{ /* Traits for the same-type ushort multiply functor; presumably these two enums replace values inherited from DefaultTransformFunctorTraits (their meaning is defined outside this chunk - confirm). */\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Multiply<short, short> > : DefaultTransformFunctorTraits< Multiply<short, short> >\r
+{ /* Same two constants for the short -> short functor. */\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Multiply<int, int> > : DefaultTransformFunctorTraits< Multiply<int, int> >\r
+{ /* Same two constants for the int -> int functor. */\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Multiply<float, float> > : DefaultTransformFunctorTraits< Multiply<float, float> >\r
+{ /* Same two constants for the float -> float functor. */\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
+template <typename T, typename D> void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
+{ /* Two-source multiply: views src1/src2 as DevMem2D_<T> and dst as DevMem2D_<D>, then runs the Multiply<T, D> functor built from scale through transform() on the given stream. */\r
+    cudaSafeCall( cudaSetDoubleForDevice(&scale) ); /* NOTE(review): may rewrite scale in place before it is captured by the functor; presumably a device double-precision workaround - confirm. */\r
+    Multiply<T, D> op(scale);\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
+}\r
+\r
+template void multiply_gpu<uchar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<uchar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<schar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<ushort, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<ushort, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<ushort, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<ushort, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<ushort, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<short, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<short, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<short, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<short, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<short, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<int, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<int, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<int, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<int, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<int, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<float, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<float, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<float, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<float, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<float, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<double, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+template <typename T, typename D> struct MultiplyScalar : unary_function<T, D>\r
+{ /* Unary device functor: maps an input a of type T to saturate_cast<D>(scale * a * val). */\r
+    MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}\r
+    __device__ __forceinline__ D operator ()(T a) const\r
      {\r
-        transform(static_cast< DevMem2D_<uchar4> >(src1), src2, static_cast< DevMem2D_<uchar4> >(dst), divide_8uc4_32f(), stream);\r
+        return saturate_cast<D>(scale * a * val);\r
      }\r
+    const double val; /* scalar operand multiplied into every element */\r
+    const double scale; /* extra factor applied to every product */\r
+};\r
  \r
+template <> struct TransformFunctorTraits< MultiplyScalar<ushort, ushort> > : DefaultTransformFunctorTraits< MultiplyScalar<ushort, ushort> >\r
+{ /* Traits for the same-type ushort scalar-multiply functor; presumably these two enums replace values inherited from DefaultTransformFunctorTraits (their meaning is defined outside this chunk - confirm). */\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< MultiplyScalar<short, short> > : DefaultTransformFunctorTraits< MultiplyScalar<short, short> >\r
+{ /* Same two constants for the short -> short functor. */\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< MultiplyScalar<int, int> > : DefaultTransformFunctorTraits< MultiplyScalar<int, int> >\r
+{ /* Same two constants for the int -> int functor. */\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< MultiplyScalar<float, float> > : DefaultTransformFunctorTraits< MultiplyScalar<float, float> >\r
+{ /* Same two constants for the float -> float functor. */\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    struct divide_16sc4_32f : binary_function<short4, float, short4>\r
-    {\r
-        __device__ __forceinline__ short4 operator ()(short4 a, float b) const\r
-        {\r
-            return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<uchar>(a.y / b),\r
-                                        saturate_cast<short>(a.z / b), saturate_cast<uchar>(a.w / b))\r
-                          : make_short4(0,0,0,0);\r
-        }\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits<divide_16sc4_32f> : DefaultTransformFunctorTraits<divide_16sc4_32f>\r
-    {\r
-        enum { smart_block_dim_x = 8 };\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 8 };\r
-    };\r
-\r
-    void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream)\r
+template <typename T, typename D> void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
+{ /* Scalar multiply: views src1 as DevMem2D_<T> and dst as DevMem2D_<D>, then runs MultiplyScalar<T, D>(val, scale) through transform() on the given stream. */\r
+    cudaSafeCall( cudaSetDoubleForDevice(&val) ); /* NOTE(review): may rewrite val in place before it is captured by the functor; presumably a device double-precision workaround - confirm. */\r
+    cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
+    MultiplyScalar<T, D> op(val, scale);\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
+}\r
+\r
+template void multiply_gpu<uchar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<uchar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<schar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<ushort, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<ushort, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<ushort, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<ushort, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<ushort, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<short, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<short, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<short, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<short, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<short, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<int, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<int, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<int, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<int, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<int, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<float, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<float, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<float, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<float, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<float, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void multiply_gpu<double, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void multiply_gpu<double, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void multiply_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//////////////////////////////////////////////////////////////////////////\r
+// divide\r
+\r
+struct divide_8uc4_32f : binary_function<uchar4, float, uchar4>\r
+{ /* Binary device functor: divides each of the four uchar channels of a by b and saturates to uchar. */\r
+    __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const\r
      {\r
-        transform(static_cast< DevMem2D_<short4> >(src1), src2, static_cast< DevMem2D_<short4> >(dst), divide_16sc4_32f(), stream);\r
+        return b != 0 ? make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b),\r
+                                    saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b)) \r
+                      : make_uchar4(0,0,0,0); /* a zero divisor yields an all-zero pixel instead of dividing */\r
      }\r
+};\r
  \r
-    template <typename T, typename D> struct Divide : binary_function<T, T, D>\r
-    {\r
-        Divide(double scale_) : scale(scale_) {}\r
-        __device__ __forceinline__ D operator ()(T a, T b) const\r
-        {\r
-            return b != 0 ? saturate_cast<D>(scale * a / b) : 0;\r
-        }\r
-        const double scale;\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits< Divide<ushort, ushort> > : DefaultTransformFunctorTraits< Divide<ushort, ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Divide<short, short> > : DefaultTransformFunctorTraits< Divide<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Divide<int, int> > : DefaultTransformFunctorTraits< Divide<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Divide<float, float> > : DefaultTransformFunctorTraits< Divide<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-\r
-    template <typename T, typename D> void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
-    {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
-        Divide<T, D> op(scale);\r
-        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
-    }\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f)\r
+{ /* Traits body for divide_8uc4_32f. NOTE(review): the macro presumably expands to the TransformFunctorTraits specialization header it replaces in this patch - confirm against its definition. */\r
+    enum { smart_block_dim_x = 8 };\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 8 };\r
+};\r
  \r
-    template void divide_gpu<uchar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<uchar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<schar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<ushort, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<ushort, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<ushort, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<ushort, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<ushort, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<short, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<short, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<short, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<short, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<short, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<int, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<int, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<int, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<int, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<int, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<float, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<float, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<float, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<float, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<float, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<double, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    \r
-    template <typename T, typename D> struct DivideScalar : unary_function<T, D>\r
-    {\r
-        DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {}\r
-        __device__ __forceinline__ D operator ()(T a) const\r
-        {\r
-            return saturate_cast<D>(scale * a / val);\r
-        }\r
-        const double val;\r
-        const double scale;\r
-    };\r
+void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)\r
+{ /* Runs divide_8uc4_32f over (src1, src2) into dst through transform() on the given stream. NOTE(review): both static_casts target the parameters' own type, and this call is not prefixed with OPENCV_DEVICE_NAMESPACE_ as the multiply_gpu calls are - confirm both are intended. */\r
+    transform(static_cast< DevMem2D_<uchar4> >(src1), src2, static_cast< DevMem2D_<uchar4> >(dst), divide_8uc4_32f(), stream);\r
+}\r
  \r
-    template <> struct TransformFunctorTraits< DivideScalar<ushort, ushort> > : DefaultTransformFunctorTraits< DivideScalar<ushort, ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< DivideScalar<short, short> > : DefaultTransformFunctorTraits< DivideScalar<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< DivideScalar<int, int> > : DefaultTransformFunctorTraits< DivideScalar<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< DivideScalar<float, float> > : DefaultTransformFunctorTraits< DivideScalar<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
  \r
-    template <typename T, typename D> void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream)\r
+struct divide_16sc4_32f : binary_function<short4, float, short4>\r
+{ /* Binary device functor: divides each of the four short channels of a by b and saturates every channel to short. */\r
+    __device__ __forceinline__ short4 operator ()(short4 a, float b) const\r
      {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
-        DivideScalar<T, D> op(val, scale);\r
-        transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream);\r
+        return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),\r
+                                    saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))\r
+                      : make_short4(0,0,0,0); /* a zero divisor yields an all-zero pixel instead of dividing */\r
      }\r
+};\r
  \r
-    template void divide_gpu<uchar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<uchar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<schar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<ushort, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<ushort, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<ushort, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<ushort, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<ushort, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<short, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<short, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<short, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<short, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<short, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<int, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<int, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<int, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<int, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<int, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<float, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<float, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<float, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<float, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<float, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<double, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    //template void divide_gpu<double, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    template void divide_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-\r
-    template <typename T, typename D> struct Reciprocal : unary_function<T, D>\r
-    {\r
-        Reciprocal(double scale_) : scale(scale_) {}\r
-        __device__ __forceinline__ D operator ()(T a) const\r
-        {\r
-            return a != 0 ? saturate_cast<D>(scale / a) : 0;\r
-        }\r
-        const double scale;\r
-    };\r
+DEFINE_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f) // NOTE(review): smart_* values presumably tune the transform kernel for this functor - confirm against the macro/transform definition\r
+{\r
+    enum { smart_block_dim_x = 8 };\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 8 };\r
+};\r
  \r
-    template <> struct TransformFunctorTraits< Reciprocal<ushort, ushort> > : DefaultTransformFunctorTraits< Reciprocal<ushort, ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Reciprocal<short, short> > : DefaultTransformFunctorTraits< Reciprocal<short, short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Reciprocal<int, int> > : DefaultTransformFunctorTraits< Reciprocal<int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Reciprocal<float, float> > : DefaultTransformFunctorTraits< Reciprocal<float, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream) // applies divide_16sc4_32f over (src1, src2) into dst\r
+{\r
+    transform(static_cast< DevMem2D_<short4> >(src1), src2, static_cast< DevMem2D_<short4> >(dst), divide_16sc4_32f(), stream); // stream is forwarded unchanged to transform\r
+}\r
  \r
-    template <typename T, typename D> void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+template <typename T, typename D> struct Divide : binary_function<T, T, D> // binary functor: scaled quotient of two T values, saturated to D\r
+{\r
+    Divide(double scale_) : scale(scale_) {}\r
+    __device__ __forceinline__ D operator ()(T a, T b) const\r
      {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&scalar) );\r
-        Reciprocal<T, D> op(scalar);\r
-        transform((DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
+        return b != 0 ? saturate_cast<D>(scale * a / b) : 0; // zero divisor yields 0 instead of dividing\r
      }\r
+    const double scale; // multiplier applied to a before the division\r
+};\r
  \r
-    template void divide_gpu<uchar, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<uchar, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<uchar, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<uchar, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<uchar, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<uchar, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<uchar, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<schar, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<schar, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<schar, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<schar, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<schar, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<schar, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<schar, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<ushort, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<ushort, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<ushort, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<ushort, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<ushort, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<ushort, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<ushort, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<short, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<short, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<short, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<short, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<short, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<short, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<short, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<int, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<int, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<int, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<int, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<int, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<int, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<int, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<float, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<float, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<float, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<float, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<float, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<float, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<float, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    //template void divide_gpu<double, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<double, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<double, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<double, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<double, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void divide_gpu<double, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void divide_gpu<double, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // absdiff\r
-\r
-    template <typename T> struct Absdiff : binary_function<T, T, T>\r
-    {\r
-        static __device__ __forceinline__ int abs(int a)\r
-        {\r
-            return ::abs(a);\r
-        }\r
-        static __device__ __forceinline__ float abs(float a)\r
-        {\r
-            return ::fabsf(a);\r
-        }\r
-        static __device__ __forceinline__ double abs(double a)\r
-        {\r
-            return ::fabs(a);\r
-        }\r
-\r
-        __device__ __forceinline__ T operator ()(T a, T b) const\r
-        {\r
-            return saturate_cast<T>(abs(a - b));\r
-        }\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits< Absdiff<ushort> > : DefaultTransformFunctorTraits< Absdiff<ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Absdiff<short> > : DefaultTransformFunctorTraits< Absdiff<short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Absdiff<int> > : DefaultTransformFunctorTraits< Absdiff<int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Absdiff<float> > : DefaultTransformFunctorTraits< Absdiff<float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+template <> struct TransformFunctorTraits< Divide<ushort, ushort> > : DefaultTransformFunctorTraits< Divide<ushort, ushort> > // NOTE(review): overrides two defaults for same-type Divide; smart_* semantics live in the transform implementation - confirm there\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Divide<short, short> > : DefaultTransformFunctorTraits< Divide<short, short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Divide<int, int> > : DefaultTransformFunctorTraits< Divide<int, int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Divide<float, float> > : DefaultTransformFunctorTraits< Divide<float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    template <typename T> void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+template <typename T, typename D> void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream) // element-wise scale * src1 / src2 via the Divide functor\r
+{\r
+    cudaSafeCall( cudaSetDoubleForDevice(&scale) ); // CUDA runtime call on the local copy of scale; per CUDA docs it adjusts a double for devices lacking double support - confirm\r
+    Divide<T, D> op(scale);\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream); // byte views are reinterpreted as T inputs and a D output\r
+}\r
+\r
+template void divide_gpu<uchar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<uchar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<schar, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<ushort, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<ushort, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<ushort, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<ushort, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<ushort, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<ushort, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<ushort, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<short, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<short, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<short, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<short, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<short, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<short, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<short, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<int, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<int, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<int, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<int, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<int, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<int, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<int, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<float, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<float, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<float, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<float, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<float, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<float, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<float, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<double, uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<double, double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+template <typename T, typename D> struct DivideScalar : unary_function<T, D> // unary functor: scale * a / val, saturated to D\r
+{\r
+    DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {}\r
+    __device__ __forceinline__ D operator ()(T a) const\r
      {\r
-        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, Absdiff<T>(), stream);\r
+        return saturate_cast<D>(scale * a / val); // NOTE(review): no val == 0 guard here, unlike Divide - presumably the caller rejects a zero scalar; confirm\r
      }\r
+    const double val; // scalar divisor\r
+    const double scale; // multiplier applied to a before the division\r
+};\r
  \r
-    //template void absdiff_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void absdiff_gpu<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void absdiff_gpu<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void absdiff_gpu<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void absdiff_gpu<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    //template void absdiff_gpu<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void absdiff_gpu<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template <typename T> struct AbsdiffScalar : unary_function<T, T>\r
-    {\r
-        AbsdiffScalar(double val_) : val(val_) {}\r
-        __device__ __forceinline__ T operator ()(T a) const\r
-        {\r
-            return saturate_cast<T>(::fabs(a - val));\r
-        }\r
-        double val;\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits< AbsdiffScalar<ushort> > : DefaultTransformFunctorTraits< AbsdiffScalar<ushort> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AbsdiffScalar<short> > : DefaultTransformFunctorTraits< AbsdiffScalar<short> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AbsdiffScalar<int> > : DefaultTransformFunctorTraits< AbsdiffScalar<int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AbsdiffScalar<float> > : DefaultTransformFunctorTraits< AbsdiffScalar<float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+template <> struct TransformFunctorTraits< DivideScalar<ushort, ushort> > : DefaultTransformFunctorTraits< DivideScalar<ushort, ushort> > // NOTE(review): overrides two defaults for same-type DivideScalar; smart_* semantics live in the transform implementation - confirm there\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< DivideScalar<short, short> > : DefaultTransformFunctorTraits< DivideScalar<short, short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< DivideScalar<int, int> > : DefaultTransformFunctorTraits< DivideScalar<int, int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< DivideScalar<float, float> > : DefaultTransformFunctorTraits< DivideScalar<float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    template <typename T> void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream)\r
+template <typename T, typename D> void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream) // element-wise scale * src1 / val via the DivideScalar functor\r
+{\r
+    cudaSafeCall( cudaSetDoubleForDevice(&val) ); // CUDA runtime call on the local copies of val and scale; per CUDA docs it adjusts a double for devices lacking double support - confirm\r
+    cudaSafeCall( cudaSetDoubleForDevice(&scale) );\r
+    DivideScalar<T, D> op(val, scale);\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<D>)dst, op, stream); // byte views are reinterpreted as a T input and a D output\r
+}\r
+\r
+template void divide_gpu<uchar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<uchar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<uchar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<schar, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<schar, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<ushort, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<ushort, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<ushort, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<ushort, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<ushort, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<ushort, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<ushort, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<short, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<short, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<short, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<short, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<short, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<short, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<short, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<int, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<int, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<int, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<int, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<int, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<int, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<int, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<float, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<float, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<float, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<float, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<float, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<float, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<float, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+//template void divide_gpu<double, uchar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, schar >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, ushort>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, short >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, int   >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+//template void divide_gpu<double, float >(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template void divide_gpu<double, double>(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+// Unary functor computing scale / a, saturated to the destination type D.\r
+// A zero input produces 0 instead of performing the division.\r
+template <typename T, typename D> struct Reciprocal : unary_function<T, D>\r
+{\r
+    Reciprocal(double scale_) : scale(scale_) {}\r
+    __device__ __forceinline__ D operator ()(T a) const\r
      {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
-        AbsdiffScalar<T> op(val);\r
-        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, stream);\r
+        // Division-by-zero guard: the result for a == 0 is defined as 0.\r
+        return a != 0 ? saturate_cast<D>(scale / a) : 0;\r
      }\r
+    const double scale;\r
+};\r
  \r
-    template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);                  \r
-    template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void absdiff_gpu<int   >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);                  \r
-    //template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);                  \r
-    template void absdiff_gpu<double>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    //////////////////////////////////////////////////////////////////////////////////////\r
-    // Compare\r
-\r
-    template <typename T> struct Equal : binary_function<T, T, uchar>\r
-    {\r
-        __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
-        {\r
-            return static_cast<uchar>((src1 == src2) * 255);\r
-        }\r
-    };\r
-    template <typename T> struct NotEqual : binary_function<T, T, uchar>\r
-    {\r
-        __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
-        {\r
-            return static_cast<uchar>((src1 != src2) * 255);\r
-        }\r
-    };\r
-    template <typename T> struct Less : binary_function<T, T, uchar>\r
-    {\r
-        __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
-        {\r
-            return static_cast<uchar>((src1 < src2) * 255);\r
-        }\r
-    };\r
-    template <typename T> struct LessEqual : binary_function<T, T, uchar>\r
-    {\r
-        __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
-        {\r
-            return static_cast<uchar>((src1 <= src2) * 255);\r
-        }\r
-    };\r
-\r
-    template <> struct TransformFunctorTraits< Equal<int> > : DefaultTransformFunctorTraits< Equal<int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Equal<float> > : DefaultTransformFunctorTraits< Equal<float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< NotEqual<int> > : DefaultTransformFunctorTraits< NotEqual<int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< NotEqual<float> > : DefaultTransformFunctorTraits< NotEqual<float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Less<int> > : DefaultTransformFunctorTraits< Less<int> >\r
+// Transform-traits overrides for the same-type Reciprocal functors (ushort, short, int, float).\r
+// NOTE(review): the code that reads smart_block_dim_y / smart_shift is not part of this hunk -\r
+// confirm their exact meaning against the transform implementation.\r
+template <> struct TransformFunctorTraits< Reciprocal<ushort, ushort> > : DefaultTransformFunctorTraits< Reciprocal<ushort, ushort> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Reciprocal<short, short> > : DefaultTransformFunctorTraits< Reciprocal<short, short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Reciprocal<int, int> > : DefaultTransformFunctorTraits< Reciprocal<int, int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Reciprocal<float, float> > : DefaultTransformFunctorTraits< Reciprocal<float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+\r
+// Host-side launcher for scalar / matrix division: applies Reciprocal<T, D>(scalar) to every\r
+// element of src2 and stores the result in dst (0 where the src2 element is 0).\r
+// T is the element type read from src2, D the element type written to dst.\r
+template <typename T, typename D> void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    // NOTE(review): cudaSetDoubleForDevice is defined elsewhere; presumably it adapts the value\r
+    // for devices without native double support - confirm.\r
+    cudaSafeCall( cudaSetDoubleForDevice(&scalar) );\r
+    Reciprocal<T, D> op(scalar);\r
+    // The byte matrices are reinterpreted as typed matrices before the element-wise transform.\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src2, (DevMem2D_<D>)dst, op, stream);\r
+}\r
+\r
+// Explicit instantiations of the scalar / matrix divide_gpu overload, grouped by source type T.\r
+// Commented-out <T, D> combinations are intentionally left uninstantiated.\r
+template void divide_gpu<uchar, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<uchar, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<uchar, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<uchar, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<uchar, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<uchar, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<uchar, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+//template void divide_gpu<schar, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<schar, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<schar, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<schar, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<schar, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<schar, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<schar, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+//template void divide_gpu<ushort, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<ushort, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<ushort, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<ushort, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<ushort, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<ushort, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<ushort, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+//template void divide_gpu<short, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<short, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<short, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<short, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<short, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<short, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<short, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+//template void divide_gpu<int, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<int, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<int, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<int, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<int, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<int, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<int, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+//template void divide_gpu<float, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<float, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<float, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<float, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<float, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<float, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<float, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+//template void divide_gpu<double, uchar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<double, schar >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<double, ushort>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<double, short >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<double, int   >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void divide_gpu<double, float >(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void divide_gpu<double, double>(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+//////////////////////////////////////////////////////////////////////////\r
+// absdiff\r
+\r
+// Binary functor returning |a - b| saturated to T.\r
+// The static abs() overloads pick the integer, float or double absolute value according to\r
+// the (promoted) type of a - b.\r
+template <typename T> struct Absdiff : binary_function<T, T, T>\r
+{\r
+    static __device__ __forceinline__ int abs(int a)\r
      {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< Less<float> > : DefaultTransformFunctorTraits< Less<float> >\r
+        return ::abs(a);\r
+    }\r
+    static __device__ __forceinline__ float abs(float a)\r
      {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< LessEqual<int> > : DefaultTransformFunctorTraits< LessEqual<int> >\r
+        return ::fabsf(a);\r
+    }\r
+    static __device__ __forceinline__ double abs(double a)\r
      {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< LessEqual<float> > : DefaultTransformFunctorTraits< LessEqual<float> >\r
+        return ::fabs(a);\r
+    }\r
+\r
+    __device__ __forceinline__ T operator ()(T a, T b) const\r
      {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
+        // Unqualified call on purpose: it resolves to the static overloads above, so the\r
+        // float/double paths never depend on which global ::abs overloads are visible.\r
+        return saturate_cast<T>(abs(a - b));\r
+    }\r
+};\r
+\r
+// Transform-traits overrides for Absdiff on ushort, short, int and float.\r
+// NOTE(review): the code that reads smart_block_dim_y / smart_shift is not part of this hunk -\r
+// confirm their exact meaning against the transform implementation.\r
+template <> struct TransformFunctorTraits< Absdiff<ushort> > : DefaultTransformFunctorTraits< Absdiff<ushort> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Absdiff<short> > : DefaultTransformFunctorTraits< Absdiff<short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Absdiff<int> > : DefaultTransformFunctorTraits< Absdiff<int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Absdiff<float> > : DefaultTransformFunctorTraits< Absdiff<float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    template <template <typename> class Op, typename T> void compare(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+// Host-side launcher for matrix-matrix absolute difference: dst = |src1 - src2| element-wise.\r
+// All three byte matrices are reinterpreted as matrices of element type T.\r
+template <typename T> void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, Absdiff<T>(), stream);\r
+}\r
+\r
+// Explicit instantiations of the matrix-matrix absdiff_gpu.\r
+// The uchar, int and float lines are commented out, so those types are not instantiated here.\r
+//template void absdiff_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void absdiff_gpu<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void absdiff_gpu<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void absdiff_gpu<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void absdiff_gpu<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+//template void absdiff_gpu<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void absdiff_gpu<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+// Unary functor returning |a - val| saturated to T.\r
+// val is a double, so the difference and ::fabs are evaluated in double precision.\r
+template <typename T> struct AbsdiffScalar : unary_function<T, T>\r
+{\r
+    AbsdiffScalar(double val_) : val(val_) {}\r
+    __device__ __forceinline__ T operator ()(T a) const\r
      {\r
-        Op<T> op;\r
-        transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);\r
+        return saturate_cast<T>(::fabs(a - val));\r
      }\r
+    double val;\r
+};\r
+\r
+// Transform-traits overrides for AbsdiffScalar on ushort, short, int and float.\r
+// NOTE(review): the code that reads smart_block_dim_y / smart_shift is not part of this hunk -\r
+// confirm their exact meaning against the transform implementation.\r
+template <> struct TransformFunctorTraits< AbsdiffScalar<ushort> > : DefaultTransformFunctorTraits< AbsdiffScalar<ushort> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< AbsdiffScalar<short> > : DefaultTransformFunctorTraits< AbsdiffScalar<short> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< AbsdiffScalar<int> > : DefaultTransformFunctorTraits< AbsdiffScalar<int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< AbsdiffScalar<float> > : DefaultTransformFunctorTraits< AbsdiffScalar<float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+// Host-side launcher for matrix-scalar absolute difference: dst = |src1 - val| element-wise.\r
+// src1 and dst are both reinterpreted as matrices of element type T.\r
+template <typename T> void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    // NOTE(review): cudaSetDoubleForDevice is defined elsewhere; presumably it adapts the value\r
+    // for devices without native double support - confirm.\r
+    cudaSafeCall( cudaSetDoubleForDevice(&val) );\r
+    AbsdiffScalar<T> op(val);\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, stream);\r
+}\r
+\r
+// Explicit instantiations of the matrix-scalar absdiff_gpu.\r
+// The float line is commented out, so that type is not instantiated here.\r
+template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);                  \r
+template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void absdiff_gpu<int   >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);                  \r
+//template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);                  \r
+template void absdiff_gpu<double>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+//////////////////////////////////////////////////////////////////////////////////////\r
+// Compare\r
+\r
+// Comparison functor: yields 255 when src1 == src2 and 0 otherwise.\r
+template <typename T> struct Equal : binary_function<T, T, uchar>\r
+{\r
+    __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
      {\r
-        compare<Equal, T>(src1, src2, dst, stream);\r
+        return static_cast<uchar>((src1 == src2) * 255);\r
      }\r
-    template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+};\r
+// Comparison functor: yields 255 when src1 != src2 and 0 otherwise.\r
+template <typename T> struct NotEqual : binary_function<T, T, uchar>\r
+{\r
+    __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
      {\r
-        compare<NotEqual, T>(src1, src2, dst, stream);\r
+        return static_cast<uchar>((src1 != src2) * 255);\r
      }\r
-    template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+};\r
+// Comparison functor: yields 255 when src1 < src2 and 0 otherwise.\r
+template <typename T> struct Less : binary_function<T, T, uchar>\r
+{\r
+    __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
      {\r
-        compare<Less, T>(src1, src2, dst, stream);\r
+        return static_cast<uchar>((src1 < src2) * 255);\r
      }\r
-    template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+};\r
+// Comparison functor: yields 255 when src1 <= src2 and 0 otherwise.\r
+template <typename T> struct LessEqual : binary_function<T, T, uchar>\r
+{\r
+    __device__ __forceinline__ uchar operator()(T src1, T src2) const\r
      {\r
-        compare<LessEqual, T>(src1, src2, dst, stream);\r
+        return static_cast<uchar>((src1 <= src2) * 255);\r
      }\r
-    \r
-    template void compare_eq<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_eq<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_eq<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_eq<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_eq<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_eq<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_eq<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    \r
-    template void compare_ne<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_ne<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_ne<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_ne<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_ne<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_ne<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_ne<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    \r
-    template void compare_lt<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_lt<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_lt<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_lt<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_lt<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_lt<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_lt<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    \r
-    template void compare_le<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_le<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_le<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_le<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_le<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_le<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void compare_le<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // Unary bitwise logical matrix operations\r
-\r
-    enum { UN_OP_NOT };\r
-\r
-    template <typename T, int opid>\r
-    struct UnOp;\r
-\r
-    template <typename T>\r
-    struct UnOp<T, UN_OP_NOT>\r
-    { \r
-        static __device__ __forceinline__ T call(T v) { return ~v; }\r
-    };\r
+};\r
  \r
+// Transform-traits overrides for the comparison functors on int and float.\r
+// NOTE(review): the code that reads smart_block_dim_y / smart_shift is not part of this hunk -\r
+// confirm their exact meaning against the transform implementation.\r
+template <> struct TransformFunctorTraits< Equal<int> > : DefaultTransformFunctorTraits< Equal<int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Equal<float> > : DefaultTransformFunctorTraits< Equal<float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< NotEqual<int> > : DefaultTransformFunctorTraits< NotEqual<int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< NotEqual<float> > : DefaultTransformFunctorTraits< NotEqual<float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Less<int> > : DefaultTransformFunctorTraits< Less<int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< Less<float> > : DefaultTransformFunctorTraits< Less<float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< LessEqual<int> > : DefaultTransformFunctorTraits< LessEqual<int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+template <> struct TransformFunctorTraits< LessEqual<float> > : DefaultTransformFunctorTraits< LessEqual<float> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
  \r
-    template <int opid>\r
-    __global__ void bitwiseUnOpKernel(int rows, int width, const PtrStepb src, PtrStepb dst)\r
-    {\r
-        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;\r
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+// Shared launcher for the element-wise comparisons: applies Op<T> to src1 and src2.\r
+// The sources are viewed as matrices of T; dst is passed through unchanged as the byte matrix\r
+// that receives the functor's uchar result.\r
+template <template <typename> class Op, typename T> void compare(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    Op<T> op;\r
+    OPENCV_DEVICE_NAMESPACE_ transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);\r
+}\r
  \r
-        if (y < rows) \r
+// Element-wise src1 == src2; forwards to compare() with the Equal functor.\r
+template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    compare<Equal, T>(src1, src2, dst, stream);\r
+}\r
+// Element-wise src1 != src2; forwards to compare() with the NotEqual functor.\r
+template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    compare<NotEqual, T>(src1, src2, dst, stream);\r
+}\r
+// Element-wise src1 < src2; forwards to compare() with the Less functor.\r
+template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    compare<Less, T>(src1, src2, dst, stream);\r
+}\r
+// Element-wise src1 <= src2; forwards to compare() with the LessEqual functor.\r
+template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    compare<LessEqual, T>(src1, src2, dst, stream);\r
+}\r
+\r
+// Explicit instantiations of compare_eq / compare_ne / compare_lt / compare_le for\r
+// uchar, schar, ushort, short, int, float and double.\r
+template void compare_eq<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_eq<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_eq<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_eq<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_eq<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_eq<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_eq<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void compare_ne<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_ne<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_ne<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_ne<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_ne<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_ne<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_ne<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void compare_lt<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_lt<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_lt<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_lt<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_lt<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_lt<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_lt<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void compare_le<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_le<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_le<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_le<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_le<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_le<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void compare_le<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+//////////////////////////////////////////////////////////////////////////\r
+// Unary bitwise logical matrix operations\r
+\r
+// Operation ids passed as the `opid` template argument to select a unary bitwise operation.\r
+enum { UN_OP_NOT };\r
+\r
+// Primary template is declared only; usable (T, opid) pairs come from the specializations below.\r
+template <typename T, int opid>\r
+struct UnOp;\r
+\r
+// Bitwise NOT: call(v) returns ~v converted back to T.\r
+template <typename T>\r
+struct UnOp<T, UN_OP_NOT>\r
+{ \r
+    static __device__ __forceinline__ T call(T v) { return ~v; }\r
+};\r
+\r
+\r
+// Unmasked unary bitwise kernel working on a byte view of the matrix.\r
+// `width` is the row length in bytes (bitwiseNotCaller passes cols * elem_size1 * cn).\r
+// Each thread handles up to 4 consecutive bytes of row y, starting at byte offset x.\r
+template <int opid>\r
+__global__ void bitwiseUnOpKernel(int rows, int width, const PtrStepb src, PtrStepb dst)\r
+{\r
+    const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;\r
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+    if (y < rows) \r
+    {\r
+        uchar* dst_ptr = dst.ptr(y) + x;\r
+        const uchar* src_ptr = src.ptr(y) + x;\r
+        // All 4 bytes are inside the row: process them with a single uint load/store.\r
+        // NOTE(review): the uint access assumes ptr(y) + x is 4-byte aligned - confirm for ROI views.\r
+        if (x + sizeof(uint) - 1 < width)\r
          {\r
-            uchar* dst_ptr = dst.ptr(y) + x;\r
-            const uchar* src_ptr = src.ptr(y) + x;\r
-            if (x + sizeof(uint) - 1 < width)\r
-            {\r
-                *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);\r
-            }\r
-            else\r
+            *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);\r
+        }\r
+        else\r
+        {\r
+            // Row tail: copy the remaining bytes one at a time; the loop does nothing when x >= width.\r
+            const uchar* src_end = src.ptr(y) + width;\r
+            while (src_ptr < src_end)\r
              {\r
-                const uchar* src_end = src.ptr(y) + width;\r
-                while (src_ptr < src_end)\r
-                {\r
-                    *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);\r
-                }\r
+                *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);\r
              }\r
          }\r
      }\r
+}\r
  \r
  \r
-    template <int opid>\r
-    void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst, \r
-                     cudaStream_t stream)\r
-    {\r
-        dim3 threads(16, 16);\r
-        dim3 grid(divUp(width, threads.x * sizeof(uint)), \r
-                  divUp(rows, threads.y));\r
+// Host launcher for the unmasked unary kernel over a rows x width byte region.\r
+// Uses 16x16 thread blocks; every thread covers sizeof(uint) bytes along x, hence the\r
+// grid width of divUp(width, threads.x * sizeof(uint)).\r
+// The kernel is enqueued on `stream`. When `stream` is the default stream (0) the call\r
+// also blocks until the device has finished; otherwise it returns right after the launch.\r
+template <int opid>\r
+void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst, \r
+                 cudaStream_t stream)\r
+{\r
+    dim3 threads(16, 16);\r
+    dim3 grid(divUp(width, threads.x * sizeof(uint)), \r
+              divUp(rows, threads.y));\r
  \r
-        bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    // Launch on the caller's stream: without the explicit stream argument the kernel runs on the\r
+    // default stream, while the non-zero-stream path below returns without synchronizing.\r
+    bitwiseUnOpKernel<opid><<<grid, threads, 0, stream>>>(rows, width, src, dst);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0) \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0) \r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  \r
-    template <typename T, int opid>\r
-    __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src, \r
-                                      const PtrStepb mask, PtrStepb dst)\r
-    {\r
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+// Masked unary bitwise kernel: one thread per element of type T.\r
+// `cols` is the number of T elements per row (bitwiseMaskNotCaller passes cols * cn), so\r
+// x / cn selects the mask entry of the pixel that element x belongs to.\r
+// Elements whose mask entry is zero are skipped, leaving dst unchanged at that position.\r
+template <typename T, int opid>\r
+__global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src, \r
+                                  const PtrStepb mask, PtrStepb dst)\r
+{\r
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
  \r
-        if (x < cols && y < rows && mask.ptr(y)[x / cn]) \r
-        {\r
-            T* dst_row = (T*)dst.ptr(y);\r
-            const T* src_row = (const T*)src.ptr(y);\r
+    if (x < cols && y < rows && mask.ptr(y)[x / cn]) \r
+    {\r
+        // Rows are stored as bytes; reinterpret them as rows of T before indexing.\r
+        T* dst_row = (T*)dst.ptr(y);\r
+        const T* src_row = (const T*)src.ptr(y);\r
  \r
-            dst_row[x] = UnOp<T, opid>::call(src_row[x]);\r
-        }\r
+        dst_row[x] = UnOp<T, opid>::call(src_row[x]);\r
      }\r
+}\r
  \r
  \r
-    template <typename T, int opid>\r
-    void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src, \r
-                     const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        dim3 threads(16, 16);\r
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+// Host launcher for the masked unary kernel: one thread per T element, 16x16 thread blocks.\r
+// `cols` is the number of T elements per row and `cn` the channel count used to index the mask.\r
+// The kernel is enqueued on `stream`. When `stream` is the default stream (0) the call\r
+// also blocks until the device has finished; otherwise it returns right after the launch.\r
+template <typename T, int opid>\r
+void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src, \r
+                 const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    dim3 threads(16, 16);\r
+    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
  \r
-        bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst); \r
-        cudaSafeCall( cudaGetLastError() );\r
+    // Launch on the caller's stream: without the explicit stream argument the kernel runs on the\r
+    // default stream, while the non-zero-stream path below returns without synchronizing.\r
+    bitwiseUnOpKernel<T, opid><<<grid, threads, 0, stream>>>(rows, cols, cn, src, mask, dst); \r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0) \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0) \r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  \r
-    void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, \r
-                          const PtrStepb src, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);\r
-    }\r
+// Unmasked bitwise NOT entry point.\r
+// Converts the matrix width to bytes (cols * elem_size1 * cn) and runs the byte-wise\r
+// unary kernel, so the element type does not matter here.\r
+void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, \r
+                      const PtrStepb src, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);\r
+}\r
  \r
  \r
-    template <typename T>\r
-    void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, \r
-                              const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);\r
-    }\r
+// Masked bitwise NOT entry point for element type T.\r
+// Passes cols * cn as the per-row element count, plus cn so the kernel can map each\r
+// element back to its mask entry.\r
+template <typename T>\r
+void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, \r
+                          const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);\r
+}\r
  \r
-    template void bitwiseMaskNotCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-    template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-    template void bitwiseMaskNotCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskNotCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskNotCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
  \r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // Binary bitwise logical matrix operations\r
+//////////////////////////////////////////////////////////////////////////\r
+// Binary bitwise logical matrix operations\r
  \r
-    enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };\r
+enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };\r
  \r
-    template <typename T, int opid>\r
-    struct BinOp;\r
+template <typename T, int opid>\r
+struct BinOp;\r
  \r
-    template <typename T>\r
-    struct BinOp<T, BIN_OP_OR>\r
-    { \r
-        static __device__ __forceinline__ T call(T a, T b) { return a | b; } \r
-    };\r
+// Bitwise OR: call(a, b) returns a | b.\r
+template <typename T>\r
+struct BinOp<T, BIN_OP_OR>\r
+{ \r
+    static __device__ __forceinline__ T call(T a, T b) { return a | b; } \r
+};\r
  \r
  \r
-    template <typename T>\r
-    struct BinOp<T, BIN_OP_AND>\r
-    { \r
-        static __device__ __forceinline__ T call(T a, T b) { return a & b; } \r
-    };\r
+// Bitwise AND: call(a, b) returns a & b.\r
+template <typename T>\r
+struct BinOp<T, BIN_OP_AND>\r
+{ \r
+    static __device__ __forceinline__ T call(T a, T b) { return a & b; } \r
+};\r
  \r
-    template <typename T>\r
-    struct BinOp<T, BIN_OP_XOR>\r
-    { \r
-        static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } \r
-    };\r
+// Bitwise XOR: call(a, b) returns a ^ b.\r
+template <typename T>\r
+struct BinOp<T, BIN_OP_XOR>\r
+{ \r
+    static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } \r
+};\r
  \r
  \r
-    template <int opid>\r
-    __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1, \r
-                                       const PtrStepb src2, PtrStepb dst)\r
+// Unmasked binary bitwise kernel working on a byte view of the matrices.\r
+// `width` is the row length in bytes (the *Caller functions pass cols * elem_size1 * cn).\r
+// Each thread handles up to 4 consecutive bytes of row y, starting at byte offset x.\r
+template <int opid>\r
+__global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1, \r
+                                   const PtrStepb src2, PtrStepb dst)\r
+{\r
+    const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;\r
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+    if (y < rows) \r
      {\r
-        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;\r
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+        uchar* dst_ptr = dst.ptr(y) + x;\r
+        const uchar* src1_ptr = src1.ptr(y) + x;\r
+        const uchar* src2_ptr = src2.ptr(y) + x;\r
  \r
-        if (y < rows) \r
+        // All 4 bytes are inside the row: process them with a single uint load/store.\r
+        // NOTE(review): the uint access assumes ptr(y) + x is 4-byte aligned - confirm for ROI views.\r
+        if (x + sizeof(uint) - 1 < width)\r
          {\r
-            uchar* dst_ptr = dst.ptr(y) + x;\r
-            const uchar* src1_ptr = src1.ptr(y) + x;\r
-            const uchar* src2_ptr = src2.ptr(y) + x;\r
-\r
-            if (x + sizeof(uint) - 1 < width)\r
-            {\r
-                *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);\r
-            }\r
-            else\r
+            *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);\r
+        }\r
+        else\r
+        {\r
+            // Row tail: combine the remaining bytes one at a time; the loop does nothing when x >= width.\r
+            const uchar* src1_end = src1.ptr(y) + width;\r
+            while (src1_ptr < src1_end)\r
              {\r
-                const uchar* src1_end = src1.ptr(y) + width;\r
-                while (src1_ptr < src1_end)\r
-                {\r
-                    *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);\r
-                }\r
+                *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);\r
              }\r
          }\r
      }\r
+}\r
  \r
  \r
-    template <int opid>\r
-    void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2, \r
-                      PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        dim3 threads(16, 16);\r
-        dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));\r
+// Host launcher for the unmasked binary kernel over a rows x width byte region.\r
+// Uses 16x16 thread blocks; every thread covers sizeof(uint) bytes along x, hence the\r
+// grid width of divUp(width, threads.x * sizeof(uint)).\r
+// The kernel is enqueued on `stream`. When `stream` is the default stream (0) the call\r
+// also blocks until the device has finished; otherwise it returns right after the launch.\r
+template <int opid>\r
+void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2, \r
+                  PtrStepb dst, cudaStream_t stream)\r
+{\r
+    dim3 threads(16, 16);\r
+    dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));\r
  \r
-        bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    // Launch on the caller's stream: without the explicit stream argument the kernel runs on the\r
+    // default stream, while the non-zero-stream path below returns without synchronizing.\r
+    bitwiseBinOpKernel<opid><<<grid, threads, 0, stream>>>(rows, width, src1, src2, dst);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0) \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0) \r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  \r
-    template <typename T, int opid>\r
-    __global__ void bitwiseBinOpKernel(\r
-            int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
-            const PtrStepb mask, PtrStepb dst)\r
-    {\r
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+// Masked binary bitwise kernel: one thread per element of type T.\r
+// `cols` is the number of T elements per row (the *MaskCaller functions pass cols * cn), so\r
+// x / cn selects the mask entry of the pixel that element x belongs to.\r
+// Elements whose mask entry is zero are skipped, leaving dst unchanged at that position.\r
+template <typename T, int opid>\r
+__global__ void bitwiseBinOpKernel(\r
+        int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+        const PtrStepb mask, PtrStepb dst)\r
+{\r
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
  \r
-        if (x < cols && y < rows && mask.ptr(y)[x / cn]) \r
-        {\r
-            T* dst_row = (T*)dst.ptr(y);\r
-            const T* src1_row = (const T*)src1.ptr(y);\r
-            const T* src2_row = (const T*)src2.ptr(y);\r
+    if (x < cols && y < rows && mask.ptr(y)[x / cn]) \r
+    {\r
+        // Rows are stored as bytes; reinterpret them as rows of T before indexing.\r
+        T* dst_row = (T*)dst.ptr(y);\r
+        const T* src1_row = (const T*)src1.ptr(y);\r
+        const T* src2_row = (const T*)src2.ptr(y);\r
  \r
-            dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);\r
-        }\r
+        dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);\r
      }\r
+}\r
  \r
  \r
-    template <typename T, int opid>\r
-    void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
-                        const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        dim3 threads(16, 16);\r
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
-\r
-        bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);\r
-        cudaSafeCall( cudaGetLastError() );\r
+// Host launcher for the masked binary kernel: one thread per T element, 16x16 thread blocks.\r
+// `cols` is the number of T elements per row and `cn` the channel count used to index the mask.\r
+// The kernel is enqueued on `stream`. When `stream` is the default stream (0) the call\r
+// also blocks until the device has finished; otherwise it returns right after the launch.\r
+template <typename T, int opid>\r
+void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+                    const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    dim3 threads(16, 16);\r
+    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
  \r
-        if (stream == 0) \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    // Launch on the caller's stream: without the explicit stream argument the kernel runs on the\r
+    // default stream, while the non-zero-stream path below returns without synchronizing.\r
+    bitwiseBinOpKernel<T, opid><<<grid, threads, 0, stream>>>(rows, cols, cn, src1, src2, mask, dst);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
+    if (stream == 0) \r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
-                         const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
-    }\r
  \r
+// Unmasked bitwise OR entry point.\r
+// Converts the matrix width to bytes (cols * elem_size1 * cn) and runs the byte-wise binary kernel.\r
+void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
+                     const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
+}\r
  \r
-    template <typename T>\r
-    void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
-                             const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
-    }\r
  \r
-    template void bitwiseMaskOrCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-    template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-    template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+// Masked bitwise OR entry point for element type T.\r
+// Passes cols * cn as the per-row element count, plus cn so the kernel can index the mask.\r
+template <typename T>\r
+void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+                         const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
+}\r
  \r
+template void bitwiseMaskOrCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
  \r
-    void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
-                          const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
-    }\r
  \r
+// Unmasked bitwise AND entry point.\r
+// Converts the matrix width to bytes (cols * elem_size1 * cn) and runs the byte-wise binary kernel.\r
+void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
+                      const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
+}\r
  \r
-    template <typename T>\r
-    void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
-                              const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
-    }\r
  \r
-    template void bitwiseMaskAndCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-    template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-    template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+// Masked bitwise AND entry point for element type T.\r
+// Passes cols * cn as the per-row element count, plus cn so the kernel can index the mask.\r
+template <typename T>\r
+void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+                          const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
+}\r
  \r
+template void bitwiseMaskAndCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
  \r
-    void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
-                          const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
-    }\r
  \r
+// Unmasked bitwise XOR entry point.\r
+// Converts the matrix width to bytes (cols * elem_size1 * cn) and runs the byte-wise binary kernel.\r
+void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, \r
+                      const PtrStepb src2, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);\r
+}\r
  \r
-    template <typename T>\r
-    void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
-                              const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
-    {\r
-        bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
-    }\r
  \r
-    template void bitwiseMaskXorCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-    template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-    template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+// Masked bitwise XOR entry point for element type T.\r
+// Passes cols * cn as the per-row element count, plus cn so the kernel can index the mask.\r
+template <typename T>\r
+void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, \r
+                          const PtrStepb mask, PtrStepb dst, cudaStream_t stream)\r
+{\r
+    bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);\r
+}\r
  \r
+template void bitwiseMaskXorCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
+template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // min/max\r
  \r
-    namespace detail\r
-    {\r
-        template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>\r
-        {\r
-        };\r
-        template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_shift = 4 };\r
-        };\r
-        template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_block_dim_y = 4 };\r
-            enum { smart_shift = 4 };\r
-        };\r
-    }\r
+//////////////////////////////////////////////////////////////////////////\r
+// min/max\r
  \r
-    template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >\r
-    {\r
-    };\r
-    template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >\r
+namespace detail\r
+{\r
+    // Transform traits for min/max functors, selected by element size in bytes.\r
+    // The primary template inherits DefaultTransformFunctorTraits<F> unchanged; the 2-byte\r
+    // specialization sets smart_shift = 4 and the 4-byte one sets smart_block_dim_y = 4 and\r
+    // smart_shift = 4.\r
+    // NOTE(review): presumably these tune the launch shape of the device transform - confirm there.\r
+    template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>\r
      {\r
      };\r
-    template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >\r
+    template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>\r
      {\r
+        enum { smart_shift = 4 };\r
      };\r
-    template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >\r
+    template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>\r
      {\r
+        enum { smart_block_dim_y = 4 };\r
+        enum { smart_shift = 4 };\r
      };\r
-    \r
-    template <typename T>\r
-    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-    {\r
-        transform(src1, src2, dst, minimum<T>(), stream);    \r
-    }\r
-\r
-    template void min_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
-    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
-    template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
-    template void min_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
-    template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
-    template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
-\r
-    template <typename T>\r
-    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-    {\r
-        transform(src1, src2, dst, maximum<T>(), stream);    \r
-    }\r
-    \r
-    template void max_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
-    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
-    template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
-    template void max_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
-    template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
-    template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
-\r
-    template <typename T>\r
-    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-    {\r
-        transform(src1, dst, device::bind2nd(minimum<T>(), src2), stream);    \r
-    }\r
+}\r
  \r
-    template void min_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
-    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
-    template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
-    template void min_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
-    template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
-    template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+// Attach the size-dependent MinMaxTraits to the functors used by min_gpu / max_gpu:\r
+// minimum<T>, maximum<T>, and their binder2nd forms used by the scalar overloads.\r
+// The traits are chosen by sizeof(T).\r
+template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >\r
+{\r
+};\r
+template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >\r
+{\r
+};\r
+template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >\r
+{\r
+};\r
+template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >\r
+{\r
+};\r
  \r
-    template <typename T>\r
-    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-    {\r
-        transform(src1, dst, device::bind2nd(maximum<T>(), src2), stream);    \r
-    }\r
+// Matrix/matrix minimum: forwards src1, src2 and dst to the device-layer transform with a\r
+// minimum<T> functor on `stream`; presumably writes min(src1, src2) per element into dst.\r
+template <typename T>\r
+void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+{\r
+    OPENCV_DEVICE_NAMESPACE_ transform(src1, src2, dst, minimum<T>(), stream);    \r
+}\r
+\r
+template void min_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+template void min_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+\r
+// Matrix/matrix maximum: forwards src1, src2 and dst to the device-layer transform with a\r
+// maximum<T> functor on `stream`; presumably writes max(src1, src2) per element into dst.\r
+template <typename T>\r
+void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+{\r
+    OPENCV_DEVICE_NAMESPACE_ transform(src1, src2, dst, maximum<T>(), stream);    \r
+}\r
+\r
+template void max_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+template void max_gpu<int   >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+\r
+// Matrix/scalar minimum: binds the scalar src2 as the second argument of minimum<T> and\r
+// forwards src1 and dst to the device-layer transform on `stream`; presumably writes\r
+// min(src1, src2) per element into dst.\r
+template <typename T>\r
+void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+{\r
+    OPENCV_DEVICE_NAMESPACE_ transform(src1, dst, device::bind2nd(minimum<T>(), src2), stream);    \r
+}\r
+\r
+template void min_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+template void min_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+\r
+// Matrix/scalar maximum: binds the scalar src2 as the second argument of maximum<T> and\r
+// forwards src1 and dst to the device-layer transform on `stream`; presumably writes\r
+// max(src1, src2) per element into dst.\r
+template <typename T>\r
+void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+{\r
+    OPENCV_DEVICE_NAMESPACE_ transform(src1, dst, device::bind2nd(maximum<T>(), src2), stream);    \r
+}\r
  \r
-    template void max_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
-    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
-    template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
-    template void max_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
-    template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
-    template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
+template void max_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);\r
+template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);\r
+template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);\r
+template void max_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);\r
+template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);\r
+template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);\r
  \r
-    \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // threshold\r
  \r
-    namespace detail\r
-    {\r
-        template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>\r
-        {\r
-        };\r
-        template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_shift = 4 };\r
-        };\r
-        template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_block_dim_y = 4 };\r
-            enum { smart_shift = 4 };\r
-        };\r
-    }\r
+//////////////////////////////////////////////////////////////////////////\r
+// threshold\r
  \r
-    template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >\r
-    {\r
-    };\r
-    template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >\r
-    {\r
-    };\r
-    template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >\r
+// Transform traits for the threshold functors, selected by the size argument\r
+// (instantiated below with sizeof(T)). The generic case inherits the defaults\r
+// unchanged; the 2 and 4 specializations override individual tuning constants.\r
+namespace detail\r
+{\r
+    template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>\r
      {\r
      };\r
-    template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >\r
+    // size == 2: overrides smart_shift only.\r
+    template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>\r
      {\r
+        enum { smart_shift = 4 };\r
      };\r
-    template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >\r
+    // size == 4: overrides both smart_block_dim_y and smart_shift.\r
+    template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>\r
      {\r
+        enum { smart_block_dim_y = 4 };\r
+        enum { smart_shift = 4 };\r
      };\r
+}\r
  \r
-    template <template <typename> class Op, typename T>\r
-    void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
-        cudaStream_t stream)\r
-    {\r
-        Op<T> op(thresh, maxVal);\r
-        transform(src, dst, op, stream);\r
-    }\r
-\r
-    template <typename T>\r
-    void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type,\r
-        cudaStream_t stream)\r
-    {\r
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
-            cudaStream_t stream);\r
+// Bind each of the five threshold functors to detail::ThresholdTraits, keyed on\r
+// sizeof(T), so that transform picks up the size-specific tuning constants.\r
+template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >\r
+{\r
+};\r
+template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >\r
+{\r
+};\r
+template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >\r
+{\r
+};\r
+template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >\r
+{\r
+};\r
+template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >\r
+{\r
+};\r
  \r
-        static const caller_t callers[] = \r
-        {\r
-            threshold_caller<thresh_binary_func, T>, \r
-            threshold_caller<thresh_binary_inv_func, T>, \r
-            threshold_caller<thresh_trunc_func, T>, \r
-            threshold_caller<thresh_to_zero_func, T>, \r
-            threshold_caller<thresh_to_zero_inv_func, T>\r
-        };\r
-\r
-        callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);\r
-    }\r
+// Builds the threshold functor Op<T> from (thresh, maxVal) and runs transform\r
+// from src into dst with it on the given stream. Op is one of the thresh_*_func\r
+// class templates listed in threshold_gpu's dispatch table.\r
+template <template <typename> class Op, typename T>\r
+void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
+    cudaStream_t stream)\r
+{\r
+    Op<T> op(thresh, maxVal);\r
+    OPENCV_DEVICE_NAMESPACE_ transform(src, dst, op, stream);\r
+}\r
  \r
-    template void threshold_gpu<uchar>(const DevMem2Db& src, const DevMem2Db& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);\r
-    template void threshold_gpu<schar>(const DevMem2Db& src, const DevMem2Db& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);\r
-    template void threshold_gpu<ushort>(const DevMem2Db& src, const DevMem2Db& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);\r
-    template void threshold_gpu<short>(const DevMem2Db& src, const DevMem2Db& dst, short thresh, short maxVal, int type, cudaStream_t stream);\r
-    template void threshold_gpu<int>(const DevMem2Db& src, const DevMem2Db& dst, int thresh, int maxVal, int type, cudaStream_t stream);\r
-    template void threshold_gpu<float>(const DevMem2Db& src, const DevMem2Db& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
-    template void threshold_gpu<double>(const DevMem2Db& src, const DevMem2Db& dst, double thresh, double maxVal, int type, cudaStream_t stream);\r
-\r
-\r
-\r
-\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // pow\r
-    \r
-    template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>\r
-    {    \r
-        float power;\r
-        PowOp(float power_) : power(power_) {}\r
-            \r
-        __device__ __forceinline__ T operator()(const T& e) const\r
-        {      \r
-            return saturate_cast<T>(__powf((float)e, power));\r
-        }      \r
-    };\r
+// Host entry point for thresholding: selects a threshold_caller instantiation by\r
+// `type` and invokes it with src/dst converted to DevMem2D_<T>.\r
+// `type` is used directly as an index into a 5-entry table with no range check,\r
+// so callers must pass a value in [0, 4].\r
+template <typename T>\r
+void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type,\r
+    cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, \r
+        cudaStream_t stream);\r
  \r
-    template<typename T> struct PowOp<T, true> : unary_function<T, T>\r
+    // Table order defines the meaning of `type`:\r
+    // 0 binary, 1 binary_inv, 2 trunc, 3 to_zero, 4 to_zero_inv.\r
+    static const caller_t callers[] = \r
      {\r
-        float power;\r
-        PowOp(float power_) : power(power_) {}\r
-\r
-        __device__ __forceinline__ float operator()(const T& e) const\r
-        {\r
-            T res = saturate_cast<T>(__powf((float)e, power));            \r
-            \r
-            if ( (e < 0) && (1 & (int)power) )\r
-                    res *= -1;            \r
-            return res;         \r
-        }\r
+        threshold_caller<thresh_binary_func, T>, \r
+        threshold_caller<thresh_binary_inv_func, T>, \r
+        threshold_caller<thresh_trunc_func, T>, \r
+        threshold_caller<thresh_to_zero_func, T>, \r
+        threshold_caller<thresh_to_zero_inv_func, T>\r
      };\r
  \r
-    template<> struct PowOp<float> : unary_function<float, float>\r
-    {\r
-        float power;\r
-        PowOp(float power_) : power(power_) {}\r
-\r
-        __device__ __forceinline__ float operator()(const float& e) const\r
-        {\r
-            return __powf(::fabs(e), power);\r
-        }\r
-    };\r
+    callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);\r
+}\r
  \r
-    namespace detail\r
-    {\r
-        template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >\r
-        {\r
-        };\r
-        template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
-        {\r
-            enum { smart_block_dim_y = 8 };\r
-            enum { smart_shift = 8 };\r
-        };\r
-        template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
-        {\r
-            enum { smart_shift = 4 };\r
-        };\r
-        template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
-        {\r
-            enum { smart_block_dim_y = 4 };\r
-            enum { smart_shift = 4 };\r
-        };\r
-    }\r
+// Explicit instantiations of threshold_gpu, one per element type\r
+// (uchar, schar, ushort, short, int, float, double).\r
+template void threshold_gpu<uchar>(const DevMem2Db& src, const DevMem2Db& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);\r
+template void threshold_gpu<schar>(const DevMem2Db& src, const DevMem2Db& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);\r
+template void threshold_gpu<ushort>(const DevMem2Db& src, const DevMem2Db& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);\r
+template void threshold_gpu<short>(const DevMem2Db& src, const DevMem2Db& dst, short thresh, short maxVal, int type, cudaStream_t stream);\r
+template void threshold_gpu<int>(const DevMem2Db& src, const DevMem2Db& dst, int thresh, int maxVal, int type, cudaStream_t stream);\r
+template void threshold_gpu<float>(const DevMem2Db& src, const DevMem2Db& dst, float thresh, float maxVal, int type, cudaStream_t stream);\r
+template void threshold_gpu<double>(const DevMem2Db& src, const DevMem2Db& dst, double thresh, double maxVal, int type, cudaStream_t stream);\r
  \r
-    template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>\r
-    {\r
-    };\r
  \r
-    template<typename T>\r
-    void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream)\r
-    {\r
-        transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, PowOp<T>(power), stream);\r
-    }   \r
  \r
-    template void pow_caller<uchar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-    template void pow_caller<schar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-    template void pow_caller<short>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-    template void pow_caller<ushort>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-    template void pow_caller<int>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-    template void pow_caller<float>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
  \r
+//////////////////////////////////////////////////////////////////////////\r
+// pow\r
  \r
-    \r
+// Device functor raising an element to `power`: casts e to float, calls __powf\r
+// and converts the result back with saturate_cast<T>.\r
+// Signed defaults to device::numeric_limits<T>::is_signed; this primary template\r
+// is the one used when no specialization (PowOp<T, true>, PowOp<float>) matches.\r
+template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>\r
+{    \r
+    float power;\r
+    PowOp(float power_) : power(power_) {}\r
+        \r
+    __device__ __forceinline__ T operator()(const T& e) const\r
+    {      \r
+        return saturate_cast<T>(__powf((float)e, power));\r
+    }      \r
+};\r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // addWeighted\r
+// Specialization of PowOp selected when Signed is true.\r
+// Computes saturate_cast<T>(__powf((float)e, power)) and then negates the result\r
+// when e is negative and power, truncated to int, is odd.\r
+// operator() returns T, matching the unary_function<T, T> base and the primary\r
+// template, so the T result is not pushed through an extra float conversion.\r
+// NOTE(review): __powf is called with the signed value itself, while PowOp<float>\r
+// passes ::fabs(e) - confirm whether a negative base is handled as intended here.\r
+template<typename T> struct PowOp<T, true> : unary_function<T, T>\r
+{\r
+    float power;\r
+    PowOp(float power_) : power(power_) {}\r
  \r
-    template <typename T1, typename T2, typename D> struct AddWeighted : binary_function<T1, T2, D>\r
+    __device__ __forceinline__ T operator()(const T& e) const\r
      {\r
-        __host__ __device__ __forceinline__ AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}\r
-\r
-        __device__ __forceinline__ D operator ()(typename TypeTraits<T1>::ParameterType a, typename TypeTraits<T2>::ParameterType b) const\r
-        {\r
-            return saturate_cast<D>(alpha * a + beta * b + gamma);\r
-        }\r
+        T res = saturate_cast<T>(__powf((float)e, power));            \r
+        \r
+        // Restore the sign for a negative input raised to an odd integer power.\r
+        if ( (e < 0) && (1 & (int)power) )\r
+                res *= -1;            \r
+        return res;         \r
+    }\r
+};\r
  \r
-        const double alpha;\r
-        const double beta;\r
-        const double gamma;\r
-    };\r
+// Full specialization of PowOp for float: returns __powf(::fabs(e), power),\r
+// i.e. the power of the absolute value; the sign of e is not restored and no\r
+// saturate_cast is applied.\r
+template<> struct PowOp<float> : unary_function<float, float>\r
+{\r
+    float power;\r
+    PowOp(float power_) : power(power_) {}\r
  \r
-    template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >\r
-    {\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, short> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, short> >\r
-    {\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddWeighted<ushort, short, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, short, ushort> >\r
-    {\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddWeighted<ushort, short, short> > : DefaultTransformFunctorTraits< AddWeighted<ushort, short, short> >\r
-    {\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddWeighted<short, short, ushort> > : DefaultTransformFunctorTraits< AddWeighted<short, short, ushort> >\r
+    __device__ __forceinline__ float operator()(const float& e) const\r
      {\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddWeighted<short, short, short> > : DefaultTransformFunctorTraits< AddWeighted<short, short, short> >\r
-    {\r
-        enum { smart_shift = 4 };\r
-    };\r
+        return __powf(::fabs(e), power);\r
+    }\r
+};\r
  \r
-    template <> struct TransformFunctorTraits< AddWeighted<int, int, int> > : DefaultTransformFunctorTraits< AddWeighted<int, int, int> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddWeighted<int, int, float> > : DefaultTransformFunctorTraits< AddWeighted<int, int, float> >\r
-    {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
-    };\r
-    template <> struct TransformFunctorTraits< AddWeighted<int, float, int> > : DefaultTransformFunctorTraits< AddWeighted<int, float, int> >\r
+// Transform traits for PowOp<T>, selected by the size argument (instantiated\r
+// below with sizeof(T)). The generic case inherits the defaults unchanged.\r
+// Overrides: size 1 -> smart_block_dim_y = 8, smart_shift = 8;\r
+//            size 2 -> smart_shift = 4;\r
+//            size 4 -> smart_block_dim_y = 4, smart_shift = 4.\r
+namespace detail\r
+{\r
+    template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >\r
      {\r
-        enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
      };\r
-    template <> struct TransformFunctorTraits< AddWeighted<int, float, float> > : DefaultTransformFunctorTraits< AddWeighted<int, float, float> >\r
+    template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
      {\r
          enum { smart_block_dim_y = 8 };\r
-        enum { smart_shift = 4 };\r
+        enum { smart_shift = 8 };\r
      };\r
-    template <> struct TransformFunctorTraits< AddWeighted<float, float, int> > : DefaultTransformFunctorTraits< AddWeighted<float, float, float> >\r
+    template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
      {\r
-        enum { smart_block_dim_y = 8 };\r
          enum { smart_shift = 4 };\r
      };\r
-    template <> struct TransformFunctorTraits< AddWeighted<float, float, float> > : DefaultTransformFunctorTraits< AddWeighted<float, float, float> >\r
+    template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >\r
      {\r
-        enum { smart_block_dim_y = 8 };\r
+        enum { smart_block_dim_y = 4 };\r
          enum { smart_shift = 4 };\r
      };\r
+}\r
+\r
+// Bind PowOp<T> to detail::PowOpTraits, keyed on sizeof(T).\r
+template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>\r
+{\r
+};\r
+\r
+// Host entry point for pow: converts src and dst to DevMem2D_<T> and runs\r
+// transform with PowOp<T>(power) on the given stream.\r
+// Note that dst is taken by value while src is taken by const reference.\r
+template<typename T>\r
+void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream)\r
+{\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, PowOp<T>(power), stream);\r
+}   \r
+\r
+// Explicit instantiations of pow_caller (uchar, schar, short, ushort, int, float);\r
+// no double instantiation is provided here.\r
+template void pow_caller<uchar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+template void pow_caller<schar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+template void pow_caller<short>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+template void pow_caller<ushort>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+template void pow_caller<int>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+template void pow_caller<float>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+\r
  \r
-    template <typename T1, typename T2, typename D>\r
-    void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream)\r
-    {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
-        cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
-        cudaSafeCall( cudaSetDoubleForDevice(&gamma) );\r
  \r
-        AddWeighted<T1, T2, D> op(alpha, beta, gamma);\r
  \r
-        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), static_cast< DevMem2D_<D> >(dst), op, stream);\r
+//////////////////////////////////////////////////////////////////////////\r
+// addWeighted\r
+\r
+template <typename T1, typename T2, typename D> struct AddWeighted : binary_function<T1, T2, D>\r
+{\r
+    __host__ __device__ __forceinline__ AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}\r
+\r
+    __device__ __forceinline__ D operator ()(typename TypeTraits<T1>::ParameterType a, typename TypeTraits<T2>::ParameterType b) const\r
+    {\r
+        return saturate_cast<D>(alpha * a + beta * b + gamma);\r
      }\r
  \r
-    template void addWeighted_gpu<uchar, uchar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, uchar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, uchar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, uchar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, uchar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, uchar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, uchar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<uchar, schar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, schar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, schar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, schar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, schar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, schar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, schar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<uchar, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<uchar, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<uchar, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<uchar, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<uchar, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<uchar, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-    template void addWeighted_gpu<schar, schar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, schar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, schar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, schar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, schar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, schar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, schar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<schar, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<schar, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<schar, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<schar, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<schar, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<schar, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-    template void addWeighted_gpu<ushort, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<ushort, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<ushort, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<ushort, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<ushort, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<ushort, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-\r
-\r
-    template void addWeighted_gpu<short, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<short, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<short, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<short, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<short, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    \r
-\r
-    template void addWeighted_gpu<int, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<int, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<int, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<int, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    \r
-\r
-    template void addWeighted_gpu<float, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    template void addWeighted_gpu<float, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<float, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-\r
-    \r
-\r
-    template void addWeighted_gpu<double, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<double, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<double, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<double, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<double, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<double, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void addWeighted_gpu<double, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-}}}\r
+    const double alpha;\r
+    const double beta;\r
+    const double gamma;\r
+};\r
+\r
+// Transform traits for AddWeighted<ushort, ushort, ushort>: everything is inherited\r
+// from DefaultTransformFunctorTraits for the same functor, except smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >\r
+{\r
+    enum\r
+    {\r
+        smart_shift = 4\r
+    };\r
+};\r
+// Transform traits for AddWeighted<ushort, ushort, short>: everything is inherited\r
+// from DefaultTransformFunctorTraits for the same functor, except smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<ushort, ushort, short> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, short> >\r
+{\r
+    enum\r
+    {\r
+        smart_shift = 4\r
+    };\r
+};\r
+// Transform traits for AddWeighted<ushort, short, ushort>: everything is inherited\r
+// from DefaultTransformFunctorTraits for the same functor, except smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<ushort, short, ushort> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<ushort, short, ushort> >\r
+{\r
+    enum\r
+    {\r
+        smart_shift = 4\r
+    };\r
+};\r
+// Transform traits for AddWeighted<ushort, short, short>: everything is inherited\r
+// from DefaultTransformFunctorTraits for the same functor, except smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<ushort, short, short> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<ushort, short, short> >\r
+{\r
+    enum\r
+    {\r
+        smart_shift = 4\r
+    };\r
+};\r
+// Transform traits for AddWeighted<short, short, ushort>: everything is inherited\r
+// from DefaultTransformFunctorTraits for the same functor, except smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<short, short, ushort> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<short, short, ushort> >\r
+{\r
+    enum\r
+    {\r
+        smart_shift = 4\r
+    };\r
+};\r
+// Transform traits for AddWeighted<short, short, short>: everything is inherited\r
+// from DefaultTransformFunctorTraits for the same functor, except smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<short, short, short> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<short, short, short> >\r
+{\r
+    enum\r
+    {\r
+        smart_shift = 4\r
+    };\r
+};\r
+\r
+// Transform traits for AddWeighted<int, int, int>: starts from the defaults for the\r
+// same functor and replaces smart_block_dim_y (8) and smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<int, int, int> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<int, int, int> >\r
+{\r
+    enum { smart_block_dim_y = 8, smart_shift = 4 };\r
+};\r
+// Transform traits for AddWeighted<int, int, float>: starts from the defaults for the\r
+// same functor and replaces smart_block_dim_y (8) and smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<int, int, float> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<int, int, float> >\r
+{\r
+    enum { smart_block_dim_y = 8, smart_shift = 4 };\r
+};\r
+// Transform traits for AddWeighted<int, float, int>: starts from the defaults for the\r
+// same functor and replaces smart_block_dim_y (8) and smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<int, float, int> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<int, float, int> >\r
+{\r
+    enum { smart_block_dim_y = 8, smart_shift = 4 };\r
+};\r
+// Transform traits for AddWeighted<int, float, float>: starts from the defaults for the\r
+// same functor and replaces smart_block_dim_y (8) and smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<int, float, float> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<int, float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8, smart_shift = 4 };\r
+};\r
+// Transform traits for AddWeighted<float, float, int>: starts from the defaults for the\r
+// same functor and replaces smart_block_dim_y (8) and smart_shift (4).\r
+// The base must name the <float, float, int> functor so that any inherited default\r
+// derived from the functor's types matches this specialization.\r
+template <> struct TransformFunctorTraits< AddWeighted<float, float, int> > : DefaultTransformFunctorTraits< AddWeighted<float, float, int> >\r
+{\r
+    enum { smart_block_dim_y = 8 };\r
+    enum { smart_shift = 4 };\r
+};\r
+// Transform traits for AddWeighted<float, float, float>: starts from the defaults for\r
+// the same functor and replaces smart_block_dim_y (8) and smart_shift (4).\r
+template <>\r
+struct TransformFunctorTraits< AddWeighted<float, float, float> >\r
+    : public DefaultTransformFunctorTraits< AddWeighted<float, float, float> >\r
+{\r
+    enum { smart_block_dim_y = 8, smart_shift = 4 };\r
+};\r
+\r
+// Builds an AddWeighted<T1, T2, D> functor from alpha, beta and gamma and passes it to\r
+// the device transform() together with src1, src2 and dst converted to DevMem2D_<T1>,\r
+// DevMem2D_<T2> and DevMem2D_<D>. The stream argument is forwarded unchanged.\r
+//\r
+// T1, T2 - element types used to view src1 and src2\r
+// D      - element type used to view dst\r
+template <typename T1, typename T2, typename D>\r
+void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    typedef DevMem2D_<T1> FirstSrc;\r
+    typedef DevMem2D_<T2> SecondSrc;\r
+    typedef DevMem2D_<D>  Dst;\r
+\r
+    // Each scalar goes through cudaSetDoubleForDevice before it reaches the functor;\r
+    // the calls run in parameter order (alpha, beta, gamma), each wrapped in cudaSafeCall.\r
+    cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
+    cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
+    cudaSafeCall( cudaSetDoubleForDevice(&gamma) );\r
+\r
+    AddWeighted<T1, T2, D> functor(alpha, beta, gamma);\r
+\r
+    OPENCV_DEVICE_NAMESPACE_ transform(static_cast<FirstSrc>(src1),\r
+                                       static_cast<SecondSrc>(src2),\r
+                                       static_cast<Dst>(dst),\r
+                                       functor, stream);\r
+}\r
+\r
+template void addWeighted_gpu<uchar, uchar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, uchar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, uchar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, uchar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, uchar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, uchar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, uchar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<uchar, schar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, schar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, schar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, schar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, schar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, schar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, schar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<uchar, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<uchar, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<uchar, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<uchar, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<uchar, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<uchar, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+template void addWeighted_gpu<schar, schar, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, schar, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, schar, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, schar, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, schar, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, schar, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, schar, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<schar, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<schar, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<schar, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<schar, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<schar, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<schar, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+template void addWeighted_gpu<ushort, ushort, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, ushort, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, ushort, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, ushort, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, ushort, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, ushort, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, ushort, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<ushort, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<ushort, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<ushort, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<ushort, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<ushort, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+template void addWeighted_gpu<short, short, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, short, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, short, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, short, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, short, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, short, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, short, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<short, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<short, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<short, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<short, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+template void addWeighted_gpu<int, int, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, int, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, int, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, int, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, int, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, int, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, int, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<int, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<int, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<int, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+template void addWeighted_gpu<float, float, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, float, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, float, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, float, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, float, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, float, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, float, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template void addWeighted_gpu<float, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<float, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+\r
+\r
+template void addWeighted_gpu<double, double, uchar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<double, double, schar>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<double, double, ushort>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<double, double, short>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<double, double, int>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<double, double, float>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+template void addWeighted_gpu<double, double, double>(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu

index c420c97..52d6154 100644 (file)
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -45,9 +45,7 @@
  #include "opencv2/gpu/device/utility.hpp"\r
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  \r
-using namespace cv::gpu;\r
-\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
  #define UINT_BITS 32U\r
  \r
@@ -67,154 +65,157 @@ using namespace cv::gpu::device;
  \r
  #define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)\r
  \r
-namespace cv { namespace gpu { namespace histograms\r
-{\r
-    #if (!USE_SMEM_ATOMICS)\r
-\r
-        #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )\r
-\r
-        __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)\r
-        {\r
-            uint count;\r
-            do\r
-            {\r
-                count = s_WarpHist[data] & TAG_MASK;\r
-                count = threadTag | (count + 1);\r
-                s_WarpHist[data] = count;\r
-            } while (s_WarpHist[data] != count);\r
-        }\r
+namespace hist {\r
  \r
-    #else\r
+#if (!USE_SMEM_ATOMICS)\r
  \r
-        #define TAG_MASK 0xFFFFFFFFU\r
+    #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )\r
  \r
-        __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)\r
+    __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)\r
+    {\r
+        uint count;\r
+        do\r
          {\r
-            atomicAdd(s_WarpHist + data, 1);\r
-        }\r
+            count = s_WarpHist[data] & TAG_MASK;\r
+            count = threadTag | (count + 1);\r
+            s_WarpHist[data] = count;\r
+        } while (s_WarpHist[data] != count);\r
+    }\r
  \r
-    #endif\r
+#else\r
  \r
-    __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)\r
-    {\r
-        uint x = pos_x << 2;\r
+    #define TAG_MASK 0xFFFFFFFFU\r
  \r
-        if (x + 0 < cols) addByte(s_WarpHist, (data >>  0) & 0xFFU, tag);\r
-        if (x + 1 < cols) addByte(s_WarpHist, (data >>  8) & 0xFFU, tag);\r
-        if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);\r
-        if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);\r
+    __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)\r
+    {\r
+        atomicAdd(s_WarpHist + data, 1);\r
      }\r
  \r
-    __global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)\r
-    {\r
-        //Per-warp subhistogram storage\r
-        __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];\r
-        uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;\r
+#endif\r
  \r
-        //Clear shared memory storage for current threadblock before processing\r
-        #pragma unroll\r
-        for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)\r
-           s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;\r
+__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)\r
+{\r
+    uint x = pos_x << 2;\r
  \r
-        //Cycle through the entire data set, update subhistograms for each warp\r
-        const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);\r
+    if (x + 0 < cols) addByte(s_WarpHist, (data >>  0) & 0xFFU, tag);\r
+    if (x + 1 < cols) addByte(s_WarpHist, (data >>  8) & 0xFFU, tag);\r
+    if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);\r
+    if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);\r
+}\r
  \r
-        __syncthreads();\r
-        const uint colsui = d_Data.step / sizeof(uint);\r
-        for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)\r
-        {\r
-            uint pos_y = pos / colsui;\r
-            uint pos_x = pos % colsui;\r
-            uint data = d_Data.ptr(pos_y)[pos_x];\r
-            addWord(s_WarpHist, data, tag, pos_x, cols);\r
-        }\r
+__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)\r
+{\r
+    //Per-warp subhistogram storage\r
+    __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];\r
+    uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;\r
  \r
-        //Merge per-warp histograms into per-block and write to global memory\r
-        __syncthreads();\r
-        for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)\r
-        {\r
-            uint sum = 0;\r
+    //Clear shared memory storage for current threadblock before processing\r
+    #pragma unroll\r
+    for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)\r
+       s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;\r
  \r
-            for (uint i = 0; i < WARP_COUNT; i++)\r
-                sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;\r
+    //Cycle through the entire data set, update subhistograms for each warp\r
+    const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);\r
  \r
-            d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;\r
-        }\r
+    __syncthreads();\r
+    const uint colsui = d_Data.step / sizeof(uint);\r
+    for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)\r
+    {\r
+        uint pos_y = pos / colsui;\r
+        uint pos_x = pos % colsui;\r
+        uint data = d_Data.ptr(pos_y)[pos_x];\r
+        addWord(s_WarpHist, data, tag, pos_x, cols);\r
      }\r
  \r
-    ////////////////////////////////////////////////////////////////////////////////\r
-    // Merge histogram256() output\r
-    // Run one threadblock per bin; each threadblock adds up the same bin counter\r
-    // from every partial histogram. Reads are uncoalesced, but mergeHistogram256\r
-    // takes only a fraction of total processing time\r
-    ////////////////////////////////////////////////////////////////////////////////\r
-\r
-    __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)\r
+    //Merge per-warp histograms into per-block and write to global memory\r
+    __syncthreads();\r
+    for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)\r
      {\r
          uint sum = 0;\r
  \r
-        #pragma unroll\r
-        for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)\r
-            sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];\r
+        for (uint i = 0; i < WARP_COUNT; i++)\r
+            sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;\r
  \r
-        __shared__ uint data[MERGE_THREADBLOCK_SIZE];\r
-        data[threadIdx.x] = sum;\r
+        d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;\r
+    }\r
+}\r
  \r
-        for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)\r
-        {\r
-            __syncthreads();\r
-            if(threadIdx.x < stride)\r
-                data[threadIdx.x] += data[threadIdx.x + stride];\r
-        }\r
+////////////////////////////////////////////////////////////////////////////////\r
+// Merge histogram256() output\r
+// Run one threadblock per bin; each threadblock adds up the same bin counter\r
+// from every partial histogram. Reads are uncoalesced, but mergeHistogram256\r
+// takes only a fraction of total processing time\r
+////////////////////////////////////////////////////////////////////////////////\r
  \r
-        if(threadIdx.x == 0)\r
-            d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);\r
-    }\r
+__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)\r
+{\r
+    uint sum = 0;\r
+\r
+    #pragma unroll\r
+    for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)\r
+        sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];\r
  \r
-    void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)\r
+    __shared__ uint data[MERGE_THREADBLOCK_SIZE];\r
+    data[threadIdx.x] = sum;\r
+\r
+    for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)\r
      {\r
-        histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(\r
-            DevMem2D_<uint>(src),\r
-            buf, \r
-            static_cast<uint>(src.rows * src.step / sizeof(uint)),\r
-            src.cols);\r
+        __syncthreads();\r
+        if(threadIdx.x < stride)\r
+            data[threadIdx.x] += data[threadIdx.x + stride];\r
+    }\r
  \r
-        cudaSafeCall( cudaGetLastError() );\r
+    if(threadIdx.x == 0)\r
+        d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);\r
+}\r
  \r
-        mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);\r
+void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)\r
+{\r
+    histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(\r
+        DevMem2D_<uint>(src),\r
+        buf, \r
+        static_cast<uint>(src.rows * src.step / sizeof(uint)),\r
+        src.cols);\r
  \r
-        cudaSafeCall( cudaGetLastError() );\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);\r
  \r
-    __constant__ int c_lut[256];\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-    __global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-        if (x < src.cols && y < src.rows)\r
-        {\r
-            const uchar val = src.ptr(y)[x];\r
-            const int lut = c_lut[val];\r
-            dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);\r
-        }\r
-    }\r
+__constant__ int c_lut[256];\r
+\r
+__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-    void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)\r
+    if (x < src.cols && y < src.rows)\r
      {\r
-        dim3 block(16, 16);\r
-        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
+        const uchar val = src.ptr(y)[x];\r
+        const int lut = c_lut[val];\r
+        dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);\r
+    }\r
+}\r
  \r
-        cudaSafeCall( cudaMemcpyToSymbol(cv::gpu::histograms::c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );\r
+void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)\r
+{\r
+    dim3 block(16, 16);\r
+    dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
  \r
-        equalizeHist<<<grid, block, 0, stream>>>(src, dst);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-}}}\r
+    equalizeHist<<<grid, block, 0, stream>>>(src, dst);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+} // namespace hist\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu

index 8ac6196..db43d74 100644 (file)
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
@@ -42,13 +42,15 @@
  \r
  #include "internal_shared.hpp"\r
  \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
  // Other values are not supported\r
  #define CELL_WIDTH 8\r
  #define CELL_HEIGHT 8\r
  #define CELLS_PER_BLOCK_X 2\r
  #define CELLS_PER_BLOCK_Y 2\r
  \r
-namespace cv { namespace gpu { namespace hog {\r
+namespace hog {\r
  \r
  __constant__ int cnbins;\r
  __constant__ int cblock_stride_x;\r
@@ -83,23 +85,23 @@ int power_2up(unsigned int n)
  void set_up_constants(int nbins, int block_stride_x, int block_stride_y, \r
                        int nblocks_win_x, int nblocks_win_y)\r
  {\r
-    uploadConstant("cv::gpu::hog::cnbins", nbins);\r
-    uploadConstant("cv::gpu::hog::cblock_stride_x", block_stride_x);\r
-    uploadConstant("cv::gpu::hog::cblock_stride_y", block_stride_y);\r
-    uploadConstant("cv::gpu::hog::cnblocks_win_x", nblocks_win_x);\r
-    uploadConstant("cv::gpu::hog::cnblocks_win_y", nblocks_win_y);\r
+    cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) ); \r
+    cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) ); \r
+    cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) ); \r
+    cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );  \r
+    cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) ); \r
  \r
-    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;\r
-    uploadConstant("cv::gpu::hog::cblock_hist_size", block_hist_size);\r
+    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; \r
+    cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) ); \r
  \r
-    int block_hist_size_2up = power_2up(block_hist_size);    \r
-    uploadConstant("cv::gpu::hog::cblock_hist_size_2up", block_hist_size_2up);\r
+    int block_hist_size_2up = power_2up(block_hist_size);  \r
+    cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );\r
  \r
      int descr_width = nblocks_win_x * block_hist_size;\r
-    uploadConstant("cv::gpu::hog::cdescr_width", descr_width);\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );\r
  \r
      int descr_size = descr_width * nblocks_win_y;\r
-    uploadConstant("cv::gpu::hog::cdescr_size", descr_size);\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );\r
  }\r
  \r
  \r
@@ -153,10 +155,10 @@ __global__ void compute_hists_kernel_many_blocks(const int img_block_width, cons
              int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);\r
              int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);\r
  \r
-            float gaussian = expf(-(dist_center_y * dist_center_y + \r
-                                    dist_center_x * dist_center_x) * scale);\r
-            float interp_weight = (8.f - fabs(dist_y + 0.5f)) * \r
-                                  (8.f - fabs(dist_x + 0.5f)) / 64.f;\r
+            float gaussian = ::expf(-(dist_center_y * dist_center_y + \r
+                                      dist_center_x * dist_center_x) * scale);\r
+            float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) * \r
+                                  (8.f - ::fabs(dist_x + 0.5f)) / 64.f;\r
  \r
              hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;\r
              hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;\r
@@ -273,15 +275,15 @@ __global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
      __syncthreads();\r
      float sum = reduce_smem<nthreads>(squares);\r
      \r
-    float scale = 1.0f / (sqrtf(sum) + 0.1f * block_hist_size);        \r
-    elem = min(elem * scale, threshold);\r
+    float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);        \r
+    elem = ::min(elem * scale, threshold);\r
      \r
      __syncthreads();\r
      squares[threadIdx.x] = elem * elem;\r
  \r
      __syncthreads();\r
      sum = reduce_smem<nthreads>(squares);\r
-    scale = 1.0f / (sqrtf(sum) + 1e-3f);\r
+    scale = 1.0f / (::sqrtf(sum) + 1e-3f);\r
      \r
      if (threadIdx.x < block_hist_size)\r
          hist[0] = elem * scale;\r
@@ -533,7 +535,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
  \r
      if (threadIdx.x == 0)\r
      {\r
-        val = row[max(x - 1, 1)];\r
+        val = row[::max(x - 1, 1)];\r
          sh_row[0] = val.x;\r
          sh_row[(nthreads + 2)] = val.y;\r
          sh_row[2 * (nthreads + 2)] = val.z;\r
@@ -541,7 +543,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
  \r
      if (threadIdx.x == blockDim.x - 1)\r
      {\r
-        val = row[min(x + 1, width - 2)];\r
+        val = row[::min(x + 1, width - 2)];\r
          sh_row[blockDim.x + 1] = val.x;\r
          sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;\r
          sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;\r
@@ -561,7 +563,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
  \r
          float3 dx;\r
          if (correct_gamma)\r
-            dx = make_float3(sqrtf(b.x) - sqrtf(a.x), sqrtf(b.y) - sqrtf(a.y), sqrtf(b.z) - sqrtf(a.z));    \r
+            dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));    \r
          else\r
              dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);    \r
  \r
@@ -576,7 +578,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
              b = make_float3(val.x, val.y, val.z);\r
  \r
              if (correct_gamma)\r
-                dy = make_float3(sqrtf(b.x) - sqrtf(a.x), sqrtf(b.y) - sqrtf(a.y), sqrtf(b.z) - sqrtf(a.z));\r
+                dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));\r
              else\r
                  dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);\r
          }\r
@@ -601,10 +603,10 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
              mag0 = mag1;\r
          }\r
  \r
-        mag0 = sqrtf(mag0);\r
+        mag0 = ::sqrtf(mag0);\r
  \r
-        float ang = (atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;\r
-        int hidx = (int)floorf(ang);\r
+        float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;\r
+        int hidx = (int)::floorf(ang);\r
          ang -= hidx;\r
          hidx = (hidx + cnbins) % cnbins;\r
  \r
@@ -648,10 +650,10 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
          sh_row[threadIdx.x + 1] = row[width - 2];\r
  \r
      if (threadIdx.x == 0)\r
-        sh_row[0] = row[max(x - 1, 1)];\r
+        sh_row[0] = row[::max(x - 1, 1)];\r
  \r
      if (threadIdx.x == blockDim.x - 1)\r
-        sh_row[blockDim.x + 1] = row[min(x + 1, width - 2)];\r
+        sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];\r
  \r
      __syncthreads();\r
      if (x < width)\r
@@ -659,7 +661,7 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
          float dx;\r
  \r
          if (correct_gamma)\r
-            dx = sqrtf(sh_row[threadIdx.x + 2]) - sqrtf(sh_row[threadIdx.x]);\r
+            dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);\r
          else\r
              dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];\r
  \r
@@ -669,14 +671,14 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
              float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];\r
              float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];\r
              if (correct_gamma)\r
-                dy = sqrtf(a) - sqrtf(b);\r
+                dy = ::sqrtf(a) - ::sqrtf(b);\r
              else\r
                  dy = a - b;\r
          }\r
-        float mag = sqrtf(dx * dx + dy * dy);\r
+        float mag = ::sqrtf(dx * dx + dy * dy);\r
  \r
-        float ang = (atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;\r
-        int hidx = (int)floorf(ang);\r
+        float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;\r
+        int hidx = (int)::floorf(ang);\r
          ang -= hidx;\r
          hidx = (hidx + cnbins) % cnbins;\r
  \r
@@ -768,4 +770,6 @@ static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)
  void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }\r
  void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }\r
  \r
-}}}\r
+} // namespace hog \r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu

index 7b83362..0169566 100644 (file)
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -46,992 +46,990 @@
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  #include "opencv2/gpu/device/border_interpolate.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc {\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
-{\r
  /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////\r
  \r
-    texture<uchar4, 2> tex_meanshift;\r
+texture<uchar4, 2> tex_meanshift;\r
+\r
+__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, \r
+                                size_t out_step, int cols, int rows, \r
+                                int sp, int sr, int maxIter, float eps)\r
+{\r
+    int isr2 = sr*sr;\r
+    uchar4 c = tex2D(tex_meanshift, x0, y0 );\r
  \r
-    __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, \r
-                                    size_t out_step, int cols, int rows, \r
-                                    int sp, int sr, int maxIter, float eps)\r
+    // iterate meanshift procedure\r
+    for( int iter = 0; iter < maxIter; iter++ )\r
      {\r
-        int isr2 = sr*sr;\r
-        uchar4 c = tex2D(tex_meanshift, x0, y0 );\r
+        int count = 0;\r
+        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;\r
+        float icount;\r
  \r
-        // iterate meanshift procedure\r
-        for( int iter = 0; iter < maxIter; iter++ )\r
-        {\r
-            int count = 0;\r
-            int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;\r
-            float icount;\r
+        //mean shift: process pixels in the window [x0-sp, x0+sp] x [y0-sp, y0+sp]\r
+        int minx = x0-sp;\r
+        int miny = y0-sp;\r
+        int maxx = x0+sp;\r
+        int maxy = y0+sp;\r
  \r
-            //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)\r
-            int minx = x0-sp;\r
-            int miny = y0-sp;\r
-            int maxx = x0+sp;\r
-            int maxy = y0+sp;\r
+        for( int y = miny; y <= maxy; y++)\r
+        {\r
+            int rowCount = 0;\r
+            for( int x = minx; x <= maxx; x++ )\r
+            {                    \r
+                uchar4 t = tex2D( tex_meanshift, x, y );\r
  \r
-            for( int y = miny; y <= maxy; y++)\r
-            {\r
-                int rowCount = 0;\r
-                for( int x = minx; x <= maxx; x++ )\r
-                {                    \r
-                    uchar4 t = tex2D( tex_meanshift, x, y );\r
-\r
-                    int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);\r
-                    if( norm2 <= isr2 )\r
-                    {\r
-                        s0 += t.x; s1 += t.y; s2 += t.z;\r
-                        sx += x; rowCount++;\r
-                    }\r
+                int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);\r
+                if( norm2 <= isr2 )\r
+                {\r
+                    s0 += t.x; s1 += t.y; s2 += t.z;\r
+                    sx += x; rowCount++;\r
                  }\r
-                count += rowCount;\r
-                sy += y*rowCount;\r
              }\r
+            count += rowCount;\r
+            sy += y*rowCount;\r
+        }\r
  \r
-            if( count == 0 )\r
-                break;\r
+        if( count == 0 )\r
+            break;\r
  \r
-            icount = 1.f/count;\r
-            int x1 = __float2int_rz(sx*icount);\r
-            int y1 = __float2int_rz(sy*icount);\r
-            s0 = __float2int_rz(s0*icount);\r
-            s1 = __float2int_rz(s1*icount);\r
-            s2 = __float2int_rz(s2*icount);\r
+        icount = 1.f/count;\r
+        int x1 = __float2int_rz(sx*icount);\r
+        int y1 = __float2int_rz(sy*icount);\r
+        s0 = __float2int_rz(s0*icount);\r
+        s1 = __float2int_rz(s1*icount);\r
+        s2 = __float2int_rz(s2*icount);\r
  \r
-            int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);\r
+        int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);\r
  \r
-            bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1-x0) + abs(y1-y0) + norm2 <= eps);\r
+        bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);\r
  \r
-            x0 = x1; y0 = y1;\r
-            c.x = s0; c.y = s1; c.z = s2;\r
+        x0 = x1; y0 = y1;\r
+        c.x = s0; c.y = s1; c.z = s2;\r
  \r
-            if( stopFlag )\r
-                break;\r
-        }\r
+        if( stopFlag )\r
+            break;\r
+    }\r
  \r
-        int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);\r
-        *(uchar4*)(out + base) = c;\r
+    int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);\r
+    *(uchar4*)(out + base) = c;\r
  \r
-        return make_short2((short)x0, (short)y0);\r
-    }\r
+    return make_short2((short)x0, (short)y0);\r
+}\r
  \r
-    __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )\r
-    {\r
-        int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
+__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )\r
+{\r
+    int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if( x0 < cols && y0 < rows )\r
-            do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);\r
-    }\r
+    if( x0 < cols && y0 < rows )\r
+        do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);\r
+}\r
  \r
-    __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, \r
-                                         unsigned char* outsp, size_t outspstep, \r
-                                         int cols, int rows, \r
-                                         int sp, int sr, int maxIter, float eps)\r
-    {\r
-        int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
+__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, \r
+                                     unsigned char* outsp, size_t outspstep, \r
+                                     int cols, int rows, \r
+                                     int sp, int sr, int maxIter, float eps)\r
+{\r
+    int x0 = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y0 = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if( x0 < cols && y0 < rows )\r
-        {            \r
-            int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);\r
-            *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);\r
-        }\r
+    if( x0 < cols && y0 < rows )\r
+    {            \r
+        int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);\r
+        *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);\r
      }\r
+}\r
  \r
-    void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)\r
-    {\r
-        dim3 grid(1, 1, 1);\r
-        dim3 threads(32, 8, 1);\r
-        grid.x = divUp(src.cols, threads.x);\r
-        grid.y = divUp(src.rows, threads.y);\r
+void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)\r
+{\r
+    dim3 grid(1, 1, 1);\r
+    dim3 threads(32, 8, 1);\r
+    grid.x = divUp(src.cols, threads.x);\r
+    grid.y = divUp(src.rows, threads.y);\r
  \r
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
-        cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
+    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
  \r
-        meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );\r
-        cudaSafeCall( cudaGetLastError() );\r
+    meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
  \r
-        //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        \r
-    }\r
+    //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        \r
+}\r
  \r
-    void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) \r
-    {\r
-        dim3 grid(1, 1, 1);\r
-        dim3 threads(32, 8, 1);\r
-        grid.x = divUp(src.cols, threads.x);\r
-        grid.y = divUp(src.rows, threads.y);\r
+void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) \r
+{\r
+    dim3 grid(1, 1, 1);\r
+    dim3 threads(32, 8, 1);\r
+    grid.x = divUp(src.cols, threads.x);\r
+    grid.y = divUp(src.rows, threads.y);\r
  \r
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
-        cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();\r
+    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );\r
  \r
-        meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );\r
-        cudaSafeCall( cudaGetLastError() );\r
+    meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
  \r
-        //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        \r
-    }\r
+    //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        \r
+}\r
  \r
  /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////\r
  \r
-    template <typename T>\r
-    __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)\r
-    {        \r
-        unsigned int H = ((ndisp-d) * 240)/ndisp;\r
+template <typename T>\r
+__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)\r
+{        \r
+    unsigned int H = ((ndisp-d) * 240)/ndisp;\r
  \r
-        unsigned int hi = (H/60) % 6;\r
-        float f = H/60.f - H/60;\r
-        float p = V * (1 - S);\r
-        float q = V * (1 - f * S);\r
-        float t = V * (1 - (1 - f) * S);\r
+    unsigned int hi = (H/60) % 6;\r
+    float f = H/60.f - H/60;\r
+    float p = V * (1 - S);\r
+    float q = V * (1 - f * S);\r
+    float t = V * (1 - (1 - f) * S);\r
  \r
-        float3 res;\r
-        \r
-        if (hi == 0) //R = V,  G = t,  B = p\r
-        {\r
-            res.x = p;\r
-            res.y = t;\r
-            res.z = V;\r
-        }\r
+    float3 res;\r
+    \r
+    if (hi == 0) //R = V,      G = t,  B = p\r
+    {\r
+        res.x = p;\r
+        res.y = t;\r
+        res.z = V;\r
+    }\r
  \r
-        if (hi == 1) // R = q, G = V,  B = p\r
-        {\r
-            res.x = p;\r
-            res.y = V;\r
-            res.z = q;\r
-        }        \r
+    if (hi == 1) // R = q,     G = V,  B = p\r
+    {\r
+        res.x = p;\r
+        res.y = V;\r
+        res.z = q;\r
+    }        \r
+    \r
+    if (hi == 2) // R = p,     G = V,  B = t\r
+    {\r
+        res.x = t;\r
+        res.y = V;\r
+        res.z = p;\r
+    }\r
          \r
-        if (hi == 2) // R = p, G = V,  B = t\r
-        {\r
-            res.x = t;\r
-            res.y = V;\r
-            res.z = p;\r
-        }\r
-            \r
-        if (hi == 3) // R = p, G = q,  B = V\r
-        {\r
-            res.x = V;\r
-            res.y = q;\r
-            res.z = p;\r
-        }\r
-\r
-        if (hi == 4) // R = t, G = p,  B = V\r
-        {\r
-            res.x = V;\r
-            res.y = p;\r
-            res.z = t;\r
-        }\r
-\r
-        if (hi == 5) // R = V, G = p,  B = q\r
-        {\r
-            res.x = q;\r
-            res.y = p;\r
-            res.z = V;\r
-        }\r
-        const unsigned int b = (unsigned int)(max(0.f, min (res.x, 1.f)) * 255.f);\r
-        const unsigned int g = (unsigned int)(max(0.f, min (res.y, 1.f)) * 255.f);\r
-        const unsigned int r = (unsigned int)(max(0.f, min (res.z, 1.f)) * 255.f);\r
-        const unsigned int a = 255U;\r
-\r
-        return (a << 24) + (r << 16) + (g << 8) + b;    \r
-    } \r
-\r
-    __global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)\r
+    if (hi == 3) // R = p,     G = q,  B = V\r
      {\r
-        const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+        res.x = V;\r
+        res.y = q;\r
+        res.z = p;\r
+    }\r
  \r
-        if(x < width && y < height) \r
-        {\r
-            uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);\r
-\r
-            uint4 res;\r
-            res.x = cvtPixel(d4.x, ndisp);\r
-            res.y = cvtPixel(d4.y, ndisp);\r
-            res.z = cvtPixel(d4.z, ndisp);\r
-            res.w = cvtPixel(d4.w, ndisp);\r
-                    \r
-            uint4* line = (uint4*)(out_image + y * out_step);\r
-            line[x >> 2] = res;\r
-        }\r
+    if (hi == 4) // R = t,     G = p,  B = V\r
+    {\r
+        res.x = V;\r
+        res.y = p;\r
+        res.z = t;\r
      }\r
  \r
-    __global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)\r
+    if (hi == 5) // R = V,     G = p,  B = q\r
      {\r
-        const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+        res.x = q;\r
+        res.y = p;\r
+        res.z = V;\r
+    }\r
+    const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);\r
+    const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);\r
+    const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);\r
+    const unsigned int a = 255U;\r
  \r
-        if(x < width && y < height) \r
-        {\r
-            short2 d2 = *(short2*)(disp + y * disp_step + x);\r
+    return (a << 24) + (r << 16) + (g << 8) + b;    \r
+} \r
  \r
-            uint2 res;\r
-            res.x = cvtPixel(d2.x, ndisp);            \r
-            res.y = cvtPixel(d2.y, ndisp);\r
+__global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)\r
+{\r
+    const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-            uint2* line = (uint2*)(out_image + y * out_step);\r
-            line[x >> 1] = res;\r
-        }\r
+    if(x < width && y < height) \r
+    {\r
+        uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);\r
+\r
+        uint4 res;\r
+        res.x = cvtPixel(d4.x, ndisp);\r
+        res.y = cvtPixel(d4.y, ndisp);\r
+        res.z = cvtPixel(d4.z, ndisp);\r
+        res.w = cvtPixel(d4.w, ndisp);\r
+                \r
+        uint4* line = (uint4*)(out_image + y * out_step);\r
+        line[x >> 2] = res;\r
      }\r
+}\r
  \r
+__global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)\r
+{\r
+    const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-    void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)\r
+    if(x < width && y < height) \r
      {\r
-        dim3 threads(16, 16, 1);\r
-        dim3 grid(1, 1, 1);\r
-        grid.x = divUp(src.cols, threads.x << 2);\r
-        grid.y = divUp(src.rows, threads.y);\r
-         \r
-        drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() ); \r
-    }\r
+        short2 d2 = *(short2*)(disp + y * disp_step + x);\r
  \r
-    void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
-        grid.x = divUp(src.cols, threads.x << 1);\r
-        grid.y = divUp(src.rows, threads.y);\r
-         \r
-        drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);\r
-        cudaSafeCall( cudaGetLastError() );\r
-        \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+        uint2 res;\r
+        res.x = cvtPixel(d2.x, ndisp);            \r
+        res.y = cvtPixel(d2.y, ndisp);\r
+\r
+        uint2* line = (uint2*)(out_image + y * out_step);\r
+        line[x >> 1] = res;\r
      }\r
+}\r
+\r
+\r
+void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)\r
+{\r
+    dim3 threads(16, 16, 1);\r
+    dim3 grid(1, 1, 1);\r
+    grid.x = divUp(src.cols, threads.x << 2);\r
+    grid.y = divUp(src.rows, threads.y);\r
+     \r
+    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() ); \r
+}\r
+\r
+void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
+    grid.x = divUp(src.cols, threads.x << 1);\r
+    grid.y = divUp(src.rows, threads.y);\r
+     \r
+    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);\r
+    cudaSafeCall( cudaGetLastError() );\r
+    \r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////\r
  \r
-    __constant__ float cq[16];\r
+__constant__ float cq[16];\r
  \r
-    template <typename T>\r
-    __global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)\r
-    {        \r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename T>\r
+__global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)\r
+{        \r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (y < rows && x < cols)\r
-        {\r
+    if (y < rows && x < cols)\r
+    {\r
  \r
-            float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];\r
-            float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];\r
+        float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];\r
+        float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];\r
  \r
-            qx += x * cq[0]; \r
-            qy += x * cq[4];\r
-            qz += x * cq[8];\r
-            qw += x * cq[12];\r
+        qx += x * cq[0]; \r
+        qy += x * cq[4];\r
+        qz += x * cq[8];\r
+        qw += x * cq[12];\r
  \r
-            T d = *(disp + disp_step * y + x);\r
+        T d = *(disp + disp_step * y + x);\r
  \r
-            float iW = 1.f / (qw + cq[14] * d);\r
-            float4 v;\r
-            v.x = (qx + cq[2] * d) * iW;\r
-            v.y = (qy + cq[6] * d) * iW;\r
-            v.z = (qz + cq[10] * d) * iW;\r
-            v.w = 1.f;\r
+        float iW = 1.f / (qw + cq[14] * d);\r
+        float4 v;\r
+        v.x = (qx + cq[2] * d) * iW;\r
+        v.y = (qy + cq[6] * d) * iW;\r
+        v.z = (qz + cq[10] * d) * iW;\r
+        v.w = 1.f;\r
  \r
-            *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;\r
-        }\r
+        *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;\r
      }\r
+}\r
  \r
-    template <typename T>\r
-    inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
-        grid.x = divUp(disp.cols, threads.x);\r
-        grid.y = divUp(disp.rows, threads.y);\r
+template <typename T>\r
+inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
+    grid.x = divUp(disp.cols, threads.x);\r
+    grid.y = divUp(disp.rows, threads.y);\r
  \r
-        cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );\r
  \r
-        reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
-    {\r
-        reprojectImageTo3D_caller(disp, xyzw, q, stream);\r
-    }\r
+void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
+{\r
+    reprojectImageTo3D_caller(disp, xyzw, q, stream);\r
+}\r
  \r
-    void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
-    {\r
-        reprojectImageTo3D_caller(disp, xyzw, q, stream);\r
-    }\r
+void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)\r
+{\r
+    reprojectImageTo3D_caller(disp, xyzw, q, stream);\r
+}\r
  \r
  //////////////////////////////////////// Extract Cov Data ////////////////////////////////////////////////\r
  \r
-    __global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx, \r
-                                          const PtrStepf Dy, PtrStepf dst)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+__global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx, \r
+                                      const PtrStepf Dy, PtrStepf dst)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < cols && y < rows)\r
-        {            \r
-            float dx = Dx.ptr(y)[x];\r
-            float dy = Dy.ptr(y)[x];\r
+    if (x < cols && y < rows)\r
+    {            \r
+        float dx = Dx.ptr(y)[x];\r
+        float dy = Dy.ptr(y)[x];\r
  \r
-            dst.ptr(y)[x] = dx * dx;\r
-            dst.ptr(y + rows)[x] = dx * dy;\r
-            dst.ptr(y + (rows << 1))[x] = dy * dy;\r
-        }\r
+        dst.ptr(y)[x] = dx * dx;\r
+        dst.ptr(y + rows)[x] = dx * dy;\r
+        dst.ptr(y + (rows << 1))[x] = dy * dy;\r
      }\r
+}\r
  \r
-    void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8);\r
-        dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));\r
+void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8);\r
+    dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));\r
  \r
-        extractCovData_kernel<<<grid, threads, 0, stream>>>(Dx.cols, Dx.rows, Dx, Dy, dst);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    extractCovData_kernel<<<grid, threads, 0, stream>>>(Dx.cols, Dx.rows, Dx, Dy, dst);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////\r
  \r
-    texture<float, 2> harrisDxTex;\r
-    texture<float, 2> harrisDyTex;\r
+texture<float, 2> harrisDxTex;\r
+texture<float, 2> harrisDyTex;\r
  \r
-    __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,\r
-                                        PtrStepb dst)\r
-    {\r
-        const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,\r
+                                    PtrStepb dst)\r
+{\r
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < cols && y < rows)\r
-        {\r
-            float a = 0.f;\r
-            float b = 0.f;\r
-            float c = 0.f;\r
+    if (x < cols && y < rows)\r
+    {\r
+        float a = 0.f;\r
+        float b = 0.f;\r
+        float c = 0.f;\r
  \r
-            const int ibegin = y - (block_size / 2);\r
-            const int jbegin = x - (block_size / 2);\r
-            const int iend = ibegin + block_size;\r
-            const int jend = jbegin + block_size;\r
+        const int ibegin = y - (block_size / 2);\r
+        const int jbegin = x - (block_size / 2);\r
+        const int iend = ibegin + block_size;\r
+        const int jend = jbegin + block_size;\r
  \r
-            for (int i = ibegin; i < iend; ++i)\r
+        for (int i = ibegin; i < iend; ++i)\r
+        {\r
+            for (int j = jbegin; j < jend; ++j)\r
              {\r
-                for (int j = jbegin; j < jend; ++j)\r
-                {\r
-                    float dx = tex2D(harrisDxTex, j, i);\r
-                    float dy = tex2D(harrisDyTex, j, i);\r
-                    a += dx * dx;\r
-                    b += dx * dy;\r
-                    c += dy * dy;\r
-                }\r
+                float dx = tex2D(harrisDxTex, j, i);\r
+                float dy = tex2D(harrisDyTex, j, i);\r
+                a += dx * dx;\r
+                b += dx * dy;\r
+                c += dy * dy;\r
              }\r
-\r
-            ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);\r
          }\r
+\r
+        ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);\r
      }\r
+}\r
  \r
-    template <typename BR, typename BC>\r
-    __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,\r
-                                        PtrStepb dst, BR border_row, BC border_col)\r
-    {\r
-        const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename BR, typename BC>\r
+__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,\r
+                                    PtrStepb dst, BR border_row, BC border_col)\r
+{\r
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < cols && y < rows)\r
-        {\r
-            float a = 0.f;\r
-            float b = 0.f;\r
-            float c = 0.f;\r
+    if (x < cols && y < rows)\r
+    {\r
+        float a = 0.f;\r
+        float b = 0.f;\r
+        float c = 0.f;\r
  \r
-            const int ibegin = y - (block_size / 2);\r
-            const int jbegin = x - (block_size / 2);\r
-            const int iend = ibegin + block_size;\r
-            const int jend = jbegin + block_size;\r
+        const int ibegin = y - (block_size / 2);\r
+        const int jbegin = x - (block_size / 2);\r
+        const int iend = ibegin + block_size;\r
+        const int jend = jbegin + block_size;\r
  \r
-            for (int i = ibegin; i < iend; ++i)\r
+        for (int i = ibegin; i < iend; ++i)\r
+        {\r
+            int y = border_col.idx_row(i);\r
+            for (int j = jbegin; j < jend; ++j)\r
              {\r
-                int y = border_col.idx_row(i);\r
-                for (int j = jbegin; j < jend; ++j)\r
-                {\r
-                    int x = border_row.idx_col(j);\r
-                    float dx = tex2D(harrisDxTex, x, y);\r
-                    float dy = tex2D(harrisDyTex, x, y);\r
-                    a += dx * dx;\r
-                    b += dx * dy;\r
-                    c += dy * dy;\r
-                }\r
+                int x = border_row.idx_col(j);\r
+                float dx = tex2D(harrisDxTex, x, y);\r
+                float dy = tex2D(harrisDyTex, x, y);\r
+                a += dx * dx;\r
+                b += dx * dy;\r
+                c += dy * dy;\r
              }\r
-\r
-            ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);\r
          }\r
-    }\r
  \r
-    void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, \r
-                             int border_type, cudaStream_t stream)\r
-    {\r
-        const int rows = Dx.rows;\r
-        const int cols = Dx.cols;\r
+        ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);\r
+    }\r
+}\r
  \r
-        dim3 threads(32, 8);\r
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, \r
+                         int border_type, cudaStream_t stream)\r
+{\r
+    const int rows = Dx.rows;\r
+    const int cols = Dx.cols;\r
  \r
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();\r
-        cudaBindTexture2D(0, harrisDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step);\r
-        cudaBindTexture2D(0, harrisDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step);\r
-        harrisDxTex.filterMode = cudaFilterModePoint;\r
-        harrisDyTex.filterMode = cudaFilterModePoint;\r
+    dim3 threads(32, 8);\r
+    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
  \r
-        switch (border_type) \r
-        {\r
-        case BORDER_REFLECT101_GPU:\r
-            cornerHarris_kernel<<<grid, threads, 0, stream>>>(\r
-                    cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
-            break;\r
-        case BORDER_REPLICATE_GPU:\r
-            harrisDxTex.addressMode[0] = cudaAddressModeClamp;\r
-            harrisDxTex.addressMode[1] = cudaAddressModeClamp;\r
-            harrisDyTex.addressMode[0] = cudaAddressModeClamp;\r
-            harrisDyTex.addressMode[1] = cudaAddressModeClamp;\r
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();\r
+    cudaBindTexture2D(0, harrisDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step);\r
+    cudaBindTexture2D(0, harrisDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step);\r
+    harrisDxTex.filterMode = cudaFilterModePoint;\r
+    harrisDyTex.filterMode = cudaFilterModePoint;\r
  \r
-            cornerHarris_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, k, dst);\r
-            break;\r
-        }\r
+    switch (border_type) \r
+    {\r
+    case BORDER_REFLECT101_GPU:\r
+        cornerHarris_kernel<<<grid, threads, 0, stream>>>(\r
+                cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
+        break;\r
+    case BORDER_REPLICATE_GPU:\r
+        harrisDxTex.addressMode[0] = cudaAddressModeClamp;\r
+        harrisDxTex.addressMode[1] = cudaAddressModeClamp;\r
+        harrisDyTex.addressMode[0] = cudaAddressModeClamp;\r
+        harrisDyTex.addressMode[1] = cudaAddressModeClamp;\r
+\r
+        cornerHarris_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, k, dst);\r
+        break;\r
+    }\r
  \r
-        cudaSafeCall( cudaGetLastError() );\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
  \r
-        //cudaSafeCall(cudaUnbindTexture(harrisDxTex));\r
-        //cudaSafeCall(cudaUnbindTexture(harrisDyTex));\r
-    }\r
+    //cudaSafeCall(cudaUnbindTexture(harrisDxTex));\r
+    //cudaSafeCall(cudaUnbindTexture(harrisDyTex));\r
+}\r
  \r
  /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////\r
  \r
-    texture<float, 2> minEigenValDxTex;\r
-    texture<float, 2> minEigenValDyTex;\r
+texture<float, 2> minEigenValDxTex;\r
+texture<float, 2> minEigenValDyTex;\r
  \r
-    __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, \r
-                                             PtrStepb dst)\r
-    {\r
-        const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, \r
+                                         PtrStepb dst)\r
+{\r
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < cols && y < rows)\r
-        {\r
-            float a = 0.f;\r
-            float b = 0.f;\r
-            float c = 0.f;\r
+    if (x < cols && y < rows)\r
+    {\r
+        float a = 0.f;\r
+        float b = 0.f;\r
+        float c = 0.f;\r
  \r
-            const int ibegin = y - (block_size / 2);\r
-            const int jbegin = x - (block_size / 2);\r
-            const int iend = ibegin + block_size;\r
-            const int jend = jbegin + block_size;\r
+        const int ibegin = y - (block_size / 2);\r
+        const int jbegin = x - (block_size / 2);\r
+        const int iend = ibegin + block_size;\r
+        const int jend = jbegin + block_size;\r
  \r
-            for (int i = ibegin; i < iend; ++i)\r
+        for (int i = ibegin; i < iend; ++i)\r
+        {\r
+            for (int j = jbegin; j < jend; ++j)\r
              {\r
-                for (int j = jbegin; j < jend; ++j)\r
-                {\r
-                    float dx = tex2D(minEigenValDxTex, j, i);\r
-                    float dy = tex2D(minEigenValDyTex, j, i);\r
-                    a += dx * dx;\r
-                    b += dx * dy;\r
-                    c += dy * dy;\r
-                }\r
+                float dx = tex2D(minEigenValDxTex, j, i);\r
+                float dy = tex2D(minEigenValDyTex, j, i);\r
+                a += dx * dx;\r
+                b += dx * dy;\r
+                c += dy * dy;\r
              }\r
-\r
-            a *= 0.5f;\r
-            c *= 0.5f;\r
-            ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);\r
          }\r
+\r
+        a *= 0.5f;\r
+        c *= 0.5f;\r
+        ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);\r
      }\r
+}\r
  \r
  \r
-    template <typename BR, typename BC>\r
-    __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, \r
-                                             PtrStepb dst, BR border_row, BC border_col)\r
-    {\r
-        const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename BR, typename BC>\r
+__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, \r
+                                         PtrStepb dst, BR border_row, BC border_col)\r
+{\r
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < cols && y < rows)\r
-        {\r
-            float a = 0.f;\r
-            float b = 0.f;\r
-            float c = 0.f;\r
+    if (x < cols && y < rows)\r
+    {\r
+        float a = 0.f;\r
+        float b = 0.f;\r
+        float c = 0.f;\r
  \r
-            const int ibegin = y - (block_size / 2);\r
-            const int jbegin = x - (block_size / 2);\r
-            const int iend = ibegin + block_size;\r
-            const int jend = jbegin + block_size;\r
+        const int ibegin = y - (block_size / 2);\r
+        const int jbegin = x - (block_size / 2);\r
+        const int iend = ibegin + block_size;\r
+        const int jend = jbegin + block_size;\r
  \r
-            for (int i = ibegin; i < iend; ++i)\r
+        for (int i = ibegin; i < iend; ++i)\r
+        {\r
+            int y = border_col.idx_row(i);\r
+            for (int j = jbegin; j < jend; ++j)\r
              {\r
-                int y = border_col.idx_row(i);\r
-                for (int j = jbegin; j < jend; ++j)\r
-                {\r
-                    int x = border_row.idx_col(j);\r
-                    float dx = tex2D(minEigenValDxTex, x, y);\r
-                    float dy = tex2D(minEigenValDyTex, x, y);\r
-                    a += dx * dx;\r
-                    b += dx * dy;\r
-                    c += dy * dy;\r
-                }\r
+                int x = border_row.idx_col(j);\r
+                float dx = tex2D(minEigenValDxTex, x, y);\r
+                float dy = tex2D(minEigenValDyTex, x, y);\r
+                a += dx * dx;\r
+                b += dx * dy;\r
+                c += dy * dy;\r
              }\r
-\r
-            a *= 0.5f;\r
-            c *= 0.5f;\r
-            ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);\r
          }\r
-    }\r
  \r
-    void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,\r
-                                  int border_type, cudaStream_t stream)\r
-    {\r
-        const int rows = Dx.rows;\r
-        const int cols = Dx.cols;\r
+        a *= 0.5f;\r
+        c *= 0.5f;\r
+        ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);\r
+    }\r
+}\r
  \r
-        dim3 threads(32, 8);\r
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,\r
+                              int border_type, cudaStream_t stream)\r
+{\r
+    const int rows = Dx.rows;\r
+    const int cols = Dx.cols;\r
  \r
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();\r
-        cudaBindTexture2D(0, minEigenValDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step);\r
-        cudaBindTexture2D(0, minEigenValDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step);\r
-        minEigenValDxTex.filterMode = cudaFilterModePoint;\r
-        minEigenValDyTex.filterMode = cudaFilterModePoint;\r
+    dim3 threads(32, 8);\r
+    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
  \r
-        switch (border_type)\r
-        {\r
-        case BORDER_REFLECT101_GPU:\r
-            cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(\r
-                    cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
-            break;\r
-        case BORDER_REPLICATE_GPU:\r
-            minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;\r
-            minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;\r
-            minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;\r
-            minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;\r
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();\r
+    cudaBindTexture2D(0, minEigenValDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step);\r
+    cudaBindTexture2D(0, minEigenValDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step);\r
+    minEigenValDxTex.filterMode = cudaFilterModePoint;\r
+    minEigenValDyTex.filterMode = cudaFilterModePoint;\r
  \r
-            cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, dst);\r
-            break;\r
-        }\r
+    switch (border_type)\r
+    {\r
+    case BORDER_REFLECT101_GPU:\r
+        cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(\r
+                cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));\r
+        break;\r
+    case BORDER_REPLICATE_GPU:\r
+        minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;\r
+        minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;\r
+        minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;\r
+        minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;\r
+\r
+        cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, dst);\r
+        break;\r
+    }\r
  \r
-        cudaSafeCall( cudaGetLastError() );\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
  \r
-        //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));\r
-        //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));\r
-    }\r
+    //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));\r
+    //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));\r
+}\r
  \r
  ////////////////////////////// Column Sum //////////////////////////////////////\r
  \r
-    __global__ void column_sumKernel_32F(int cols, int rows, const PtrStepb src, const PtrStepb dst)\r
+__global__ void column_sumKernel_32F(int cols, int rows, const PtrStepb src, const PtrStepb dst)\r
+{\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+\r
+    if (x < cols)\r
      {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+        const unsigned char* src_data = src.data + x * sizeof(float);\r
+        unsigned char* dst_data = dst.data + x * sizeof(float);\r
  \r
-        if (x < cols)\r
+        float sum = 0.f;\r
+        for (int y = 0; y < rows; ++y)\r
          {\r
-            const unsigned char* src_data = src.data + x * sizeof(float);\r
-            unsigned char* dst_data = dst.data + x * sizeof(float);\r
-\r
-            float sum = 0.f;\r
-            for (int y = 0; y < rows; ++y)\r
-            {\r
-                sum += *(const float*)src_data;\r
-                *(float*)dst_data = sum;\r
-                src_data += src.step;\r
-                dst_data += dst.step;\r
-            }\r
+            sum += *(const float*)src_data;\r
+            *(float*)dst_data = sum;\r
+            src_data += src.step;\r
+            dst_data += dst.step;\r
          }\r
      }\r
+}\r
  \r
  \r
-    void columnSum_32F(const DevMem2Db src, const DevMem2Db dst)\r
-    {\r
-        dim3 threads(256);\r
-        dim3 grid(divUp(src.cols, threads.x));\r
+void columnSum_32F(const DevMem2Db src, const DevMem2Db dst)\r
+{\r
+    dim3 threads(256);\r
+    dim3 grid(divUp(src.cols, threads.x));\r
  \r
-        column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // mulSpectrums\r
+//////////////////////////////////////////////////////////////////////////\r
+// mulSpectrums\r
  \r
-    __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;    \r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;    \r
+__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;    \r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;    \r
  \r
-        if (x < c.cols && y < c.rows) \r
-        {\r
-            c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);\r
-        }\r
+    if (x < c.cols && y < c.rows) \r
+    {\r
+        c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);\r
      }\r
+}\r
  \r
  \r
-    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
-    {\r
-        dim3 threads(256);\r
-        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
+void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
+{\r
+    dim3 threads(256);\r
+    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
  \r
-        mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // mulSpectrums_CONJ\r
+//////////////////////////////////////////////////////////////////////////\r
+// mulSpectrums_CONJ\r
  \r
-    __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;    \r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;    \r
+__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;    \r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;    \r
  \r
-        if (x < c.cols && y < c.rows) \r
-        {\r
-            c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));\r
-        }\r
+    if (x < c.cols && y < c.rows) \r
+    {\r
+        c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));\r
      }\r
+}\r
  \r
  \r
-    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
-    {\r
-        dim3 threads(256);\r
-        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
+void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
+{\r
+    dim3 threads(256);\r
+    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
  \r
-        mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // mulAndScaleSpectrums\r
+//////////////////////////////////////////////////////////////////////////\r
+// mulAndScaleSpectrums\r
  \r
-    __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < c.cols && y < c.rows) \r
-        {\r
-            cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);\r
-            c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);\r
-        }\r
+    if (x < c.cols && y < c.rows) \r
+    {\r
+        cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);\r
+        c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);\r
      }\r
+}\r
  \r
  \r
-    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
-    {\r
-        dim3 threads(256);\r
-        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
+void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
+{\r
+    dim3 threads(256);\r
+    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
  \r
-        mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0) // block only for the default stream, as the sibling mulSpectrums* wrappers do (was inverted)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // mulAndScaleSpectrums_CONJ\r
+//////////////////////////////////////////////////////////////////////////\r
+// mulAndScaleSpectrums_CONJ\r
  \r
-    __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < c.cols && y < c.rows) \r
-        {\r
-            cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));\r
-            c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);\r
-        }\r
+    if (x < c.cols && y < c.rows) \r
+    {\r
+        cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));\r
+        c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);\r
      }\r
+}\r
  \r
  \r
-    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
-    {\r
-        dim3 threads(256);\r
-        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
+void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)\r
+{\r
+    dim3 threads(256);\r
+    dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));\r
  \r
-        mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }    \r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}    \r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // buildWarpMaps\r
+//////////////////////////////////////////////////////////////////////////\r
+// buildWarpMaps\r
  \r
-    // TODO use intrinsics like __sinf and so on\r
+// TODO use intrinsics like __sinf and so on\r
  \r
-    namespace build_warp_maps\r
-    {\r
+namespace build_warp_maps\r
+{\r
  \r
-        __constant__ float ck_rinv[9];\r
-        __constant__ float cr_kinv[9];\r
-        __constant__ float ct[3];\r
-        __constant__ float cscale;\r
-    }\r
+    __constant__ float ck_rinv[9];\r
+    __constant__ float cr_kinv[9];\r
+    __constant__ float ct[3];\r
+    __constant__ float cscale;\r
+}\r
  \r
  \r
-    class PlaneMapper\r
+class PlaneMapper\r
+{\r
+public:\r
+    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
      {\r
-    public:\r
-        static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
-        {\r
-            using namespace build_warp_maps;\r
+        using namespace build_warp_maps;\r
  \r
-            float x_ = u / cscale - ct[0];\r
-            float y_ = v / cscale - ct[1];\r
+        float x_ = u / cscale - ct[0];\r
+        float y_ = v / cscale - ct[1];\r
  \r
-            float z;\r
-            x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);\r
-            y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);\r
-            z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);\r
+        float z;\r
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);\r
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);\r
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);\r
  \r
-            x /= z;\r
-            y /= z;\r
-        }\r
-    };\r
+        x /= z;\r
+        y /= z;\r
+    }\r
+};\r
  \r
  \r
-    class CylindricalMapper\r
+class CylindricalMapper\r
+{\r
+public:\r
+    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
      {\r
-    public:\r
-        static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
-        {\r
-            using namespace build_warp_maps;\r
+        using namespace build_warp_maps;\r
  \r
-            u /= cscale;\r
-            float x_ = sinf(u);\r
-            float y_ = v / cscale;\r
-            float z_ = cosf(u);\r
+        u /= cscale;\r
+        float x_ = ::sinf(u);\r
+        float y_ = v / cscale;\r
+        float z_ = ::cosf(u);\r
  \r
-            float z;\r
-            x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;\r
-            y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;\r
-            z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;\r
+        float z;\r
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;\r
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;\r
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;\r
  \r
-            if (z > 0) { x /= z; y /= z; }\r
-            else x = y = -1;\r
-        }\r
-    };\r
+        if (z > 0) { x /= z; y /= z; }\r
+        else x = y = -1;\r
+    }\r
+};\r
  \r
  \r
-    class SphericalMapper\r
+class SphericalMapper\r
+{\r
+public:\r
+    static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
      {\r
-    public:\r
-        static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)\r
-        {\r
-            using namespace build_warp_maps;\r
+        using namespace build_warp_maps;\r
  \r
-            v /= cscale;\r
-            u /= cscale;\r
+        v /= cscale;\r
+        u /= cscale;\r
  \r
-            float sinv = sinf(v);\r
-            float x_ = sinv * sinf(u);\r
-            float y_ = -cosf(v);\r
-            float z_ = sinv * cosf(u);\r
+        float sinv = ::sinf(v);\r
+        float x_ = sinv * ::sinf(u);\r
+        float y_ = -::cosf(v);\r
+        float z_ = sinv * ::cosf(u);\r
  \r
-            float z;\r
-            x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;\r
-            y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;\r
-            z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;\r
+        float z;\r
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;\r
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;\r
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;\r
  \r
-            if (z > 0) { x /= z; y /= z; }\r
-            else x = y = -1;\r
-        }\r
-    };\r
+        if (z > 0) { x /= z; y /= z; }\r
+        else x = y = -1;\r
+    }\r
+};\r
  \r
  \r
-    template <typename Mapper>\r
-    __global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,\r
-                                        PtrStepf map_x, PtrStepf map_y)\r
+template <typename Mapper>\r
+__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,\r
+                                    PtrStepf map_x, PtrStepf map_y)\r
+{\r
+    int du = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int dv = blockIdx.y * blockDim.y + threadIdx.y;\r
+    if (du < cols && dv < rows)\r
      {\r
-        int du = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int dv = blockIdx.y * blockDim.y + threadIdx.y;\r
-        if (du < cols && dv < rows)\r
-        {\r
-            float u = tl_u + du;\r
-            float v = tl_v + dv;\r
-            float x, y;\r
-            Mapper::mapBackward(u, v, x, y);\r
-            map_x.ptr(dv)[du] = x;\r
-            map_y.ptr(dv)[du] = y;\r
-        }\r
+        float u = tl_u + du;\r
+        float v = tl_v + dv;\r
+        float x, y;\r
+        Mapper::mapBackward(u, v, x, y);\r
+        map_x.ptr(dv)[du] = x;\r
+        map_y.ptr(dv)[du] = y;\r
      }\r
+}\r
  \r
  \r
-    void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
-                            const float k_rinv[9], const float r_kinv[9], const float t[3], \r
-                            float scale, cudaStream_t stream)\r
-    {\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
+void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+                        const float k_rinv[9], const float r_kinv[9], const float t[3], \r
+                        float scale, cudaStream_t stream)\r
+{\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
  \r
-        int cols = map_x.cols;\r
-        int rows = map_x.rows;\r
+    int cols = map_x.cols;\r
+    int rows = map_x.rows;\r
  \r
-        dim3 threads(32, 8);\r
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+    dim3 threads(32, 8);\r
+    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
  \r
-        buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
-        cudaSafeCall(cudaGetLastError());\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+    buildWarpMapsKernel<PlaneMapper><<<grid, threads, 0, stream>>>(tl_u, tl_v, cols, rows, map_x, map_y); // honor the caller's stream (previously ignored)\r
+    cudaSafeCall(cudaGetLastError());\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
  \r
  \r
-    void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
-                                  const float k_rinv[9], const float r_kinv[9], float scale,\r
-                                  cudaStream_t stream)\r
-    {\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
+void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+                              const float k_rinv[9], const float r_kinv[9], float scale,\r
+                              cudaStream_t stream)\r
+{\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
  \r
-        int cols = map_x.cols;\r
-        int rows = map_x.rows;\r
+    int cols = map_x.cols;\r
+    int rows = map_x.rows;\r
  \r
-        dim3 threads(32, 8);\r
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+    dim3 threads(32, 8);\r
+    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
  \r
-        buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
-        cudaSafeCall(cudaGetLastError());\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+    buildWarpMapsKernel<CylindricalMapper><<<grid, threads, 0, stream>>>(tl_u, tl_v, cols, rows, map_x, map_y); // honor the caller's stream (previously ignored)\r
+    cudaSafeCall(cudaGetLastError());\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
  \r
  \r
-    void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
-                                const float k_rinv[9], const float r_kinv[9], float scale,\r
-                                cudaStream_t stream)\r
-    {\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
-        cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
+void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
+                            const float k_rinv[9], const float r_kinv[9], float scale,\r
+                            cudaStream_t stream)\r
+{\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));\r
+    cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));\r
  \r
-        int cols = map_x.cols;\r
-        int rows = map_x.rows;\r
+    int cols = map_x.cols;\r
+    int rows = map_x.rows;\r
  \r
-        dim3 threads(32, 8);\r
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
+    dim3 threads(32, 8);\r
+    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));\r
  \r
-        buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);\r
-        cudaSafeCall(cudaGetLastError());\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+    buildWarpMapsKernel<SphericalMapper><<<grid, threads, 0, stream>>>(tl_u, tl_v, cols, rows, map_x, map_y); // honor the caller's stream (previously ignored)\r
+    cudaSafeCall(cudaGetLastError());\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
  \r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // convolve\r
+//////////////////////////////////////////////////////////////////////////\r
+// convolve\r
  \r
-    #define CONVOLVE_MAX_KERNEL_SIZE 17\r
+#define CONVOLVE_MAX_KERNEL_SIZE 17\r
  \r
-    __constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];\r
+__constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];\r
  \r
-    __global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)\r
+__global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)\r
+{\r
+    __shared__ float smem[16 + 2 * 8][16 + 2 * 8];\r
+\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    // x | x 0 | 0\r
+    // -----------\r
+    // x | x 0 | 0\r
+    // 0 | 0 0 | 0\r
+    // -----------\r
+    // 0 | 0 0 | 0\r
+    smem[threadIdx.y][threadIdx.x] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];\r
+\r
+    // 0 | 0 x | x\r
+    // -----------\r
+    // 0 | 0 x | x\r
+    // 0 | 0 0 | 0\r
+    // -----------\r
+    // 0 | 0 0 | 0\r
+    smem[threadIdx.y][threadIdx.x + 16] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(x + 8, src.cols - 1)];\r
+\r
+    // 0 | 0 0 | 0\r
+    // -----------\r
+    // 0 | 0 0 | 0\r
+    // x | x 0 | 0\r
+    // -----------\r
+    // x | x 0 | 0\r
+    smem[threadIdx.y + 16][threadIdx.x] = src.ptr(::min(y + 8, src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];\r
+\r
+    // 0 | 0 0 | 0\r
+    // -----------\r
+    // 0 | 0 0 | 0\r
+    // 0 | 0 x | x\r
+    // -----------\r
+    // 0 | 0 x | x\r
+    smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(::min(y + 8, src.rows - 1))[::min(x + 8, src.cols - 1)];\r
+\r
+    __syncthreads();\r
+\r
+    if (x < src.cols && y < src.rows)\r
      {\r
-        __shared__ float smem[16 + 2 * 8][16 + 2 * 8];\r
-\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        // x | x 0 | 0\r
-        // -----------\r
-        // x | x 0 | 0\r
-        // 0 | 0 0 | 0\r
-        // -----------\r
-        // 0 | 0 0 | 0\r
-        smem[threadIdx.y][threadIdx.x] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(max(x - 8, 0), src.cols - 1)];\r
-\r
-        // 0 | 0 x | x\r
-        // -----------\r
-        // 0 | 0 x | x\r
-        // 0 | 0 0 | 0\r
-        // -----------\r
-        // 0 | 0 0 | 0\r
-        smem[threadIdx.y][threadIdx.x + 16] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(x + 8, src.cols - 1)];\r
-\r
-        // 0 | 0 0 | 0\r
-        // -----------\r
-        // 0 | 0 0 | 0\r
-        // x | x 0 | 0\r
-        // -----------\r
-        // x | x 0 | 0\r
-        smem[threadIdx.y + 16][threadIdx.x] = src.ptr(min(y + 8, src.rows - 1))[min(max(x - 8, 0), src.cols - 1)];\r
-\r
-        // 0 | 0 0 | 0\r
-        // -----------\r
-        // 0 | 0 0 | 0\r
-        // 0 | 0 x | x\r
-        // -----------\r
-        // 0 | 0 x | x\r
-        smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(min(y + 8, src.rows - 1))[min(x + 8, src.cols - 1)];\r
-\r
-        __syncthreads();\r
-\r
-        if (x < src.cols && y < src.rows)\r
-        {\r
-            float res = 0;\r
+        float res = 0;\r
  \r
-            for (int i = 0; i < kHeight; ++i)\r
+        for (int i = 0; i < kHeight; ++i)\r
+        {\r
+            for (int j = 0; j < kWidth; ++j)\r
              {\r
-                for (int j = 0; j < kWidth; ++j)\r
-                {\r
-                    res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j];\r
-                }\r
+                res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j];\r
              }\r
-\r
-            dst.ptr(y)[x] = res;\r
          }\r
-    }\r
-\r
-    void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)\r
-    {\r
-        cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
  \r
-        const dim3 block(16, 16);\r
-        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
+        dst.ptr(y)[x] = res;\r
+    }\r
+}\r
  \r
-        convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);\r
-        cudaSafeCall(cudaGetLastError());\r
+void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)\r
+{\r
+    cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+    const dim3 block(16, 16);\r
+    const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));\r
  \r
+    convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);\r
+    cudaSafeCall(cudaGetLastError());\r
  \r
-}}}\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
  \r
+} // namespace imgproc\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/internal_shared.hpp b/modules/gpu/src/cuda/internal_shared.hpp

index 1d13735..7c27675 100644 (file)
--- a/modules/gpu/src/cuda/internal_shared.hpp
+++ b/modules/gpu/src/cuda/internal_shared.hpp
@@ -43,11 +43,15 @@
  #ifndef __OPENCV_internal_shared_HPP__\r
  #define __OPENCV_internal_shared_HPP__\r
  \r
+#include <cuda_runtime.h>\r
+#include <npp.h>\r
+#include "NPP_staging.hpp"\r
  #include "opencv2/gpu/devmem2d.hpp"\r
  #include "safe_call.hpp"\r
-#include "cuda_runtime.h"\r
-#include "npp.h"\r
-#include "NPP_staging.hpp"\r
+\r
+#ifndef CV_PI\r
+#define CV_PI   3.1415926535897932384626433832795f\r
+#endif\r
  \r
  #ifndef CV_PI_F\r
    #ifndef CV_PI\r
@@ -57,146 +61,158 @@
    #endif\r
  #endif\r
  \r
-namespace cv\r
+#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device { \r
+#define END_OPENCV_DEVICE_NAMESPACE   }}}\r
+#define OPENCV_DEVICE_NAMESPACE       ::cv::gpu::device\r
+#define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device:: \r
+\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+typedef unsigned char uchar;\r
+typedef unsigned short ushort;\r
+typedef signed char schar;\r
+typedef unsigned int uint;\r
+\r
+template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)\r
  {\r
-    namespace gpu\r
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
+    cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace cv { namespace gpu \r
+{\r
+    enum \r
      {\r
-        typedef unsigned char uchar;\r
-        typedef signed char schar;\r
-        typedef unsigned short ushort;\r
-        typedef unsigned int uint;       \r
+        BORDER_REFLECT101_GPU = 0,\r
+        BORDER_REPLICATE_GPU,\r
+        BORDER_CONSTANT_GPU,\r
+        BORDER_REFLECT_GPU,\r
+        BORDER_WRAP_GPU\r
+    };\r
+            \r
+    // Converts CPU border extrapolation mode into GPU internal analogue.\r
+    // Returns true if the GPU analogue exists, false otherwise.\r
+    bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);\r
+\r
+    static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }\r
  \r
-        enum \r
+    /*template<class T> static inline void uploadConstant(const char* name, const T& value) \r
+    { \r
+        cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); \r
+    }\r
+\r
+    template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream) \r
+    {\r
+        cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) ); \r
+    }   */     \r
+\r
+    //template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img)\r
+    //{            \r
+    //    //!!!! const_cast is disabled!\r
+    //    //!!!! Please use constructor of 'class texture'  instead.\r
+    //\r
+    //    //textureReference* tex; \r
+    //    //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) ); \r
+    //    //tex->normalized = normalized;\r
+    //    //tex->filterMode = filterMode;\r
+    //    //tex->addressMode[0] = addrMode;\r
+    //    //tex->addressMode[1] = addrMode;\r
+    //    \r
+    //    const textureReference* tex; \r
+    //    cudaSafeCall( cudaGetTextureReference(&tex, name) ); \r
+    //\r
+    //    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
+    //    cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
+    //}\r
+\r
+    //static inline void unbindTexture(const char *name)\r
+    //{\r
+    //    const textureReference* tex; \r
+    //    cudaSafeCall( cudaGetTextureReference(&tex, name) ); \r
+    //    cudaSafeCall( cudaUnbindTexture(tex) );\r
+    //}\r
+\r
+    \r
+\r
+    //class TextureBinder\r
+    //{\r
+    //public:\r
+    //    TextureBinder() : tex_(0) {}\r
+    //    template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)\r
+    //    {\r
+    //        bind(tex, img);\r
+    //    }\r
+    //    template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)\r
+    //    {\r
+    //        bind(tex_name, img);\r
+    //    }\r
+    //    ~TextureBinder() { unbind(); }\r
+    //\r
+    //    template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)\r
+    //    {\r
+    //        unbind();\r
+    //\r
+    //        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
+    //        cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
+    //\r
+    //        tex_ = tex;\r
+    //    }\r
+    //    template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)\r
+    //    {\r
+    //        const textureReference* tex; \r
+    //        cudaSafeCall( cudaGetTextureReference(&tex, tex_name) ); \r
+    //        bind(tex, img);\r
+    //    }\r
+    //\r
+    //    void unbind()\r
+    //    {\r
+    //        if (tex_)\r
+    //        {\r
+    //            cudaUnbindTexture(tex_);\r
+    //            tex_ = 0;\r
+    //        }\r
+    //    }\r
+    //\r
+    //private:\r
+    //    const textureReference* tex_;\r
+    //};\r
+\r
+    class NppStreamHandler\r
+    {\r
+    public:\r
+        inline explicit NppStreamHandler(cudaStream_t newStream = 0)\r
          {\r
-            BORDER_REFLECT101_GPU = 0,\r
-            BORDER_REPLICATE_GPU,\r
-            BORDER_CONSTANT_GPU,\r
-            BORDER_REFLECT_GPU,\r
-            BORDER_WRAP_GPU\r
-        };\r
-                \r
-        // Converts CPU border extrapolation mode into GPU internal analogue.\r
-        // Returns true if the GPU analogue exists, false otherwise.\r
-        bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);\r
-\r
-        static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }\r
-\r
-        template<class T> static inline void uploadConstant(const char* name, const T& value) \r
-        { \r
-            cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); \r
+            oldStream = nppGetStream();\r
+            nppSetStream(newStream);\r
          }\r
  \r
-        template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream) \r
+        inline ~NppStreamHandler()\r
          {\r
-            cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) ); \r
-        }        \r
-\r
-        template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img/*, bool normalized = false,\r
-            enum cudaTextureFilterMode filterMode = cudaFilterModePoint, enum cudaTextureAddressMode addrMode = cudaAddressModeClamp*/)\r
-        {            \r
-            //!!!! const_cast is disabled!\r
-            //!!!! Please use constructor of 'class texture'  instead.\r
-\r
-            //textureReference* tex; \r
-            //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) ); \r
-            //tex->normalized = normalized;\r
-            //tex->filterMode = filterMode;\r
-            //tex->addressMode[0] = addrMode;\r
-            //tex->addressMode[1] = addrMode;\r
-            \r
-            const textureReference* tex; \r
-            cudaSafeCall( cudaGetTextureReference(&tex, name) ); \r
-\r
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
-            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
+            nppSetStream(oldStream);\r
          }\r
  \r
-        static inline void unbindTexture(const char *name)\r
+    private:\r
+        cudaStream_t oldStream;\r
+    };\r
+\r
+    class NppStStreamHandler\r
+    {\r
+    public:\r
+        inline explicit NppStStreamHandler(cudaStream_t newStream = 0)\r
          {\r
-            const textureReference* tex; \r
-            cudaSafeCall( cudaGetTextureReference(&tex, name) ); \r
-            cudaSafeCall( cudaUnbindTexture(tex) );\r
+            oldStream = nppStSetActiveCUDAstream(newStream);\r
          }\r
  \r
-        class TextureBinder\r
-        {\r
-        public:\r
-            TextureBinder() : tex_(0) {}\r
-            template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)\r
-            {\r
-                bind(tex, img);\r
-            }\r
-            template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)\r
-            {\r
-                bind(tex_name, img);\r
-            }\r
-            ~TextureBinder() { unbind(); }\r
-\r
-            template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)\r
-            {\r
-                unbind();\r
-\r
-                cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\r
-                cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );\r
-\r
-                tex_ = tex;\r
-            }\r
-            template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)\r
-            {\r
-                const textureReference* tex; \r
-                cudaSafeCall( cudaGetTextureReference(&tex, tex_name) ); \r
-                bind(tex, img);\r
-            }\r
-\r
-            void unbind()\r
-            {\r
-                if (tex_)\r
-                {\r
-                    cudaUnbindTexture(tex_);\r
-                    tex_ = 0;\r
-                }\r
-            }\r
-\r
-        private:\r
-            const textureReference* tex_;\r
-        };\r
-\r
-        class NppStreamHandler\r
-        {\r
-        public:\r
-            inline explicit NppStreamHandler(cudaStream_t newStream = 0)\r
-            {\r
-                oldStream = nppGetStream();\r
-                nppSetStream(newStream);\r
-            }\r
-\r
-            inline ~NppStreamHandler()\r
-            {\r
-                nppSetStream(oldStream);\r
-            }\r
-\r
-        private:\r
-            cudaStream_t oldStream;\r
-        };\r
-\r
-        class NppStStreamHandler\r
+        inline ~NppStStreamHandler()\r
          {\r
-        public:\r
-            inline explicit NppStStreamHandler(cudaStream_t newStream = 0)\r
-            {\r
-                oldStream = nppStSetActiveCUDAstream(newStream);\r
-            }\r
-\r
-            inline ~NppStStreamHandler()\r
-            {\r
-                nppStSetActiveCUDAstream(oldStream);\r
-            }\r
-\r
-        private:\r
-            cudaStream_t oldStream;\r
-        };\r
-    }\r
-}\r
+            nppStSetActiveCUDAstream(oldStream);\r
+        }\r
  \r
+    private:\r
+        cudaStream_t oldStream;\r
+    };\r
+}}\r
  \r
  #endif /* __OPENCV_internal_shared_HPP__ */\r
diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu

index e954a26..1fa571a 100644 (file)
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -43,10 +43,9 @@
  #include "internal_shared.hpp"\r
  #include "opencv2/gpu/device/vec_math.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace imgproc {\r
+namespace match_template {\r
  \r
  __device__ __forceinline__ float sum(float v) { return v; }\r
  __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }\r
@@ -266,9 +265,9 @@ void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long
  \r
  __device__ float normAcc(float num, float denum)\r
  {\r
-    if (fabs(num) < denum)\r
+    if (::fabs(num) < denum)\r
          return num / denum;\r
-    if (fabs(num) < denum * 1.125f)\r
+    if (::fabs(num) < denum * 1.125f)\r
          return num > 0 ? 1 : -1;\r
      return 0;\r
  }\r
@@ -276,9 +275,9 @@ __device__ float normAcc(float num, float denum)
  \r
  __device__ float normAcc_SQDIFF(float num, float denum)\r
  {\r
-    if (fabs(num) < denum)\r
+    if (::fabs(num) < denum)\r
          return num / denum;\r
-    if (fabs(num) < denum * 1.125f)\r
+    if (::fabs(num) < denum * 1.125f)\r
          return num > 0 ? 1 : -1;\r
      return 1;\r
  }\r
@@ -906,4 +905,7 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cu
      if (stream == 0)\r
          cudaSafeCall( cudaDeviceSynchronize() );\r
  }\r
-}}}\r
+\r
+} //namespace match_template\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu

index 0a644d4..3b427fc 100644 (file)
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -42,178 +42,174 @@
  \r
  #include "internal_shared.hpp"\r
  \r
-using namespace cv::gpu;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-#ifndef CV_PI\r
-#define CV_PI   3.1415926535897932384626433832795f\r
-#endif\r
+namespace mathfunc {\r
  \r
  //////////////////////////////////////////////////////////////////////////////////////\r
  // Cart <-> Polar\r
  \r
-namespace cv { namespace gpu { namespace mathfunc\r
+struct Nothing\r
  {\r
-    struct Nothing\r
+    static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)\r
      {\r
-        static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)\r
-        {\r
-        }\r
-    };\r
-    struct Magnitude\r
-    {\r
-        static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)\r
-        {\r
-            dst[y * dst_step + x] = sqrtf(x_data * x_data + y_data * y_data);\r
-        }\r
-    };\r
-    struct MagnitudeSqr\r
+    }\r
+};\r
+struct Magnitude\r
+{\r
+    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)\r
      {\r
-        static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)\r
-        {\r
-            dst[y * dst_step + x] = x_data * x_data + y_data * y_data;\r
-        }\r
-    };\r
-    struct Atan2\r
+        dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);\r
+    }\r
+};\r
+struct MagnitudeSqr\r
+{\r
+    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)\r
      {\r
-        static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)\r
-        {\r
-            float angle = atan2f(y_data, x_data);\r
-            angle += (angle < 0) * 2.0 * CV_PI;\r
-            dst[y * dst_step + x] = scale * angle;\r
-        }\r
-    };\r
-    template <typename Mag, typename Angle>\r
-    __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step, \r
-                                float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)\r
+        dst[y * dst_step + x] = x_data * x_data + y_data * y_data;\r
+    }\r
+};\r
+struct Atan2\r
+{\r
+    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)\r
      {\r
-               const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-               const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+        float angle = ::atan2f(y_data, x_data);\r
+        angle += (angle < 0) * 2.0 * CV_PI;\r
+        dst[y * dst_step + x] = scale * angle;\r
+    }\r
+};\r
+template <typename Mag, typename Angle>\r
+__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step, \r
+                            float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)\r
+{\r
+       const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+       const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
  \r
-        if (x < width && y < height)\r
-        {\r
-            float x_data = xptr[y * x_step + x];\r
-            float y_data = yptr[y * y_step + x];\r
+    if (x < width && y < height)\r
+    {\r
+        float x_data = xptr[y * x_step + x];\r
+        float y_data = yptr[y * y_step + x];\r
  \r
-            Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);\r
-            Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);\r
-        }\r
+        Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);\r
+        Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);\r
      }\r
+}\r
  \r
-    struct NonEmptyMag\r
-    {\r
-        static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)\r
-        {\r
-            return mag[y * mag_step + x];\r
-        }\r
-    };\r
-    struct EmptyMag\r
+struct NonEmptyMag\r
+{\r
+    static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)\r
      {\r
-        static __device__ __forceinline__ float get(const float*, size_t, int, int)\r
-        {\r
-            return 1.0f;\r
-        }\r
-    };\r
-    template <typename Mag>\r
-    __global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,\r
-        float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)\r
+        return mag[y * mag_step + x];\r
+    }\r
+};\r
+struct EmptyMag\r
+{\r
+    static __device__ __forceinline__ float get(const float*, size_t, int, int)\r
      {\r
-               const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-               const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+        return 1.0f;\r
+    }\r
+};\r
+template <typename Mag>\r
+__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,\r
+    float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)\r
+{\r
+       const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+       const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
  \r
-        if (x < width && y < height)\r
-        {\r
-            float mag_data = Mag::get(mag, mag_step, x, y);\r
-            float angle_data = angle[y * angle_step + x];\r
-            float sin_a, cos_a;\r
+    if (x < width && y < height)\r
+    {\r
+        float mag_data = Mag::get(mag, mag_step, x, y);\r
+        float angle_data = angle[y * angle_step + x];\r
+        float sin_a, cos_a;\r
  \r
-            sincosf(scale * angle_data, &sin_a, &cos_a);\r
+        ::sincosf(scale * angle_data, &sin_a, &cos_a);\r
  \r
-            xptr[y * x_step + x] = mag_data * cos_a;\r
-            yptr[y * y_step + x] = mag_data * sin_a;\r
-        }\r
+        xptr[y * x_step + x] = mag_data * cos_a;\r
+        yptr[y * y_step + x] = mag_data * sin_a;\r
      }\r
+}\r
  \r
-    template <typename Mag, typename Angle>\r
-    void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <typename Mag, typename Angle>\r
+void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(x.cols, threads.x);\r
-        grid.y = divUp(x.rows, threads.y);\r
-        \r
-        const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;\r
+    grid.x = divUp(x.cols, threads.x);\r
+    grid.y = divUp(x.rows, threads.y);\r
+    \r
+    const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;\r
  \r
-        cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(\r
-            x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), \r
-            mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(\r
+        x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), \r
+        mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)\r
+void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);\r
+    static const caller_t callers[2][2][2] = \r
      {\r
-        typedef void (*caller_t)(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream);\r
-        static const caller_t callers[2][2][2] = \r
          {\r
              {\r
-                {\r
-                    cartToPolar_caller<Magnitude, Atan2>,\r
-                    cartToPolar_caller<Magnitude, Nothing>\r
-                },\r
-                {\r
-                    cartToPolar_caller<MagnitudeSqr, Atan2>,\r
-                    cartToPolar_caller<MagnitudeSqr, Nothing>,\r
-                }\r
+                cartToPolar_caller<Magnitude, Atan2>,\r
+                cartToPolar_caller<Magnitude, Nothing>\r
              },\r
              {\r
-                {\r
-                    cartToPolar_caller<Nothing, Atan2>,\r
-                    cartToPolar_caller<Nothing, Nothing>\r
-                },\r
-                {\r
-                    cartToPolar_caller<Nothing, Atan2>,\r
-                    cartToPolar_caller<Nothing, Nothing>,\r
-                }\r
+                cartToPolar_caller<MagnitudeSqr, Atan2>,\r
+                cartToPolar_caller<MagnitudeSqr, Nothing>,\r
              }\r
-        };\r
+        },\r
+        {\r
+            {\r
+                cartToPolar_caller<Nothing, Atan2>,\r
+                cartToPolar_caller<Nothing, Nothing>\r
+            },\r
+            {\r
+                cartToPolar_caller<Nothing, Atan2>,\r
+                cartToPolar_caller<Nothing, Nothing>,\r
+            }\r
+        }\r
+    };\r
  \r
-        callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);\r
-    }\r
+    callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);\r
+}\r
  \r
-    template <typename Mag>\r
-    void polarToCart_caller(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <typename Mag>\r
+void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(mag.cols, threads.x);\r
-        grid.y = divUp(mag.rows, threads.y);\r
-        \r
-        const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;\r
+    grid.x = divUp(mag.cols, threads.x);\r
+    grid.y = divUp(mag.rows, threads.y);\r
+    \r
+    const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;\r
  \r
-        polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(), \r
-            angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(), \r
+        angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)\r
+void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);\r
+    static const caller_t callers[2] = \r
      {\r
-        typedef void (*caller_t)(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);\r
-        static const caller_t callers[2] = \r
-        {\r
-            polarToCart_caller<NonEmptyMag>,\r
-            polarToCart_caller<EmptyMag>\r
-        };\r
-\r
-        callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);\r
-    }\r
-}}}\r
+        polarToCart_caller<NonEmptyMag>,\r
+        polarToCart_caller<EmptyMag>\r
+    };\r
  \r
+    callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);\r
+}\r
  \r
+} // namespace mathfunc\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu

index b2def00..df200ed 100644 (file)
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -45,302 +45,304 @@
  #include "opencv2/gpu/device/transform.hpp"\r
  #include "opencv2/gpu/device/functional.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device {\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    template <typename T> struct shift_and_sizeof;\r
-    template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };\r
-    template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };\r
-    template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };\r
-    template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };\r
-    template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };\r
-    template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };\r
-    template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };\r
+template <typename T> struct shift_and_sizeof;\r
+template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };\r
+template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };\r
+template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };\r
+template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };\r
+template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };\r
+template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };\r
+template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };\r
  \r
  ///////////////////////////////////////////////////////////////////////////\r
  ////////////////////////////////// CopyTo /////////////////////////////////\r
  ///////////////////////////////////////////////////////////////////////////\r
  \r
-    template<typename T>\r
-    __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)\r
-    {\r
-        size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        if ((x < cols * channels ) && (y < rows))\r
-            if (mask[y * step_mask + x / channels] != 0)\r
-            {\r
-                size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;\r
-                mat_dst[idx] = mat_src[idx];\r
-            }\r
-    }\r
-    typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);\r
+template<typename T>\r
+__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)\r
+{\r
+    size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-    template<typename T>\r
-    void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)\r
-    {\r
-        dim3 threadsPerBlock(16,16, 1);\r
-        dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);\r
+    if ((x < cols * channels ) && (y < rows))\r
+        if (mask[y * step_mask + x / channels] != 0)\r
+        {\r
+            size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;\r
+            mat_dst[idx] = mat_src[idx];\r
+        }\r
+}\r
  \r
-        copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>\r
-                ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);\r
-        cudaSafeCall( cudaGetLastError() );\r
+template<typename T>\r
+void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)\r
+{\r
+    dim3 threadsPerBlock(16,16, 1);\r
+    dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall ( cudaDeviceSynchronize() );\r
-    }\r
+    copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>\r
+            ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall ( cudaDeviceSynchronize() );\r
+}\r
+\r
+void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)\r
+{\r
+    typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);\r
  \r
-    void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)\r
+    static CopyToFunc tab[8] =\r
      {\r
-        static CopyToFunc tab[8] =\r
-        {\r
-            copy_to_with_mask_run<unsigned char>,\r
-            copy_to_with_mask_run<signed char>,\r
-            copy_to_with_mask_run<unsigned short>,\r
-            copy_to_with_mask_run<short>,\r
-            copy_to_with_mask_run<int>,\r
-            copy_to_with_mask_run<float>,\r
-            copy_to_with_mask_run<double>,\r
-            0\r
-        };\r
+        copy_to_with_mask_run<unsigned char>,\r
+        copy_to_with_mask_run<signed char>,\r
+        copy_to_with_mask_run<unsigned short>,\r
+        copy_to_with_mask_run<short>,\r
+        copy_to_with_mask_run<int>,\r
+        copy_to_with_mask_run<float>,\r
+        copy_to_with_mask_run<double>,\r
+        0\r
+    };\r
  \r
-        CopyToFunc func = tab[depth];\r
+    CopyToFunc func = tab[depth];\r
  \r
-        if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);\r
+    if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);\r
  \r
-        func(mat_src, mat_dst, mask, channels, stream);\r
-    }\r
+    func(mat_src, mat_dst, mask, channels, stream);\r
+}\r
  \r
  ///////////////////////////////////////////////////////////////////////////\r
  ////////////////////////////////// SetTo //////////////////////////////////\r
  ///////////////////////////////////////////////////////////////////////////\r
  \r
-    __constant__ uchar scalar_8u[4];\r
-    __constant__ schar scalar_8s[4];\r
-    __constant__ ushort scalar_16u[4];\r
-    __constant__ short scalar_16s[4];\r
-    __constant__ int scalar_32s[4];\r
-    __constant__ float scalar_32f[4]; \r
-    __constant__ double scalar_64f[4];\r
-\r
-    template <typename T> __device__ __forceinline__ T readScalar(int i);\r
-    template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}\r
-    template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}\r
-    template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}\r
-    template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}\r
-    template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}\r
-    template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}\r
-    template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}\r
-\r
-    void writeScalar(const uchar* vals)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );\r
-    }\r
-    void writeScalar(const schar* vals)\r
+__constant__ uchar scalar_8u[4];\r
+__constant__ schar scalar_8s[4];\r
+__constant__ ushort scalar_16u[4];\r
+__constant__ short scalar_16s[4];\r
+__constant__ int scalar_32s[4];\r
+__constant__ float scalar_32f[4]; \r
+__constant__ double scalar_64f[4];\r
+\r
+template <typename T> __device__ __forceinline__ T readScalar(int i);\r
+template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}\r
+template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}\r
+template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}\r
+template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}\r
+template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}\r
+template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}\r
+template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}\r
+\r
+void writeScalar(const uchar* vals)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );\r
+}\r
+void writeScalar(const schar* vals)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );\r
+}\r
+void writeScalar(const ushort* vals)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );\r
+}\r
+void writeScalar(const short* vals)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );\r
+}\r
+void writeScalar(const int* vals)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );\r
+}\r
+void writeScalar(const float* vals)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );\r
+}\r
+void writeScalar(const double* vals)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );\r
+}\r
+\r
+template<typename T>\r
+__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)\r
+{\r
+    size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    if ((x < cols * channels ) && (y < rows))\r
      {\r
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );\r
-    }\r
-    void writeScalar(const ushort* vals)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );\r
-    }\r
-    void writeScalar(const short* vals)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );\r
-    }\r
-    void writeScalar(const int* vals)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );\r
-    }\r
-    void writeScalar(const float* vals)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );\r
-    }\r
-    void writeScalar(const double* vals)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );\r
+        size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;\r
+        mat[idx] = readScalar<T>(x % channels);\r
      }\r
+}\r
  \r
-    template<typename T>\r
-    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)\r
-    {\r
-        size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template<typename T>\r
+__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)\r
+{\r
+    size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if ((x < cols * channels ) && (y < rows))\r
+    if ((x < cols * channels ) && (y < rows))\r
+        if (mask[y * step_mask + x / channels] != 0)\r
          {\r
              size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;\r
              mat[idx] = readScalar<T>(x % channels);\r
          }\r
-    }\r
-\r
-    template<typename T>\r
-    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)\r
-    {\r
-        size_t x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        size_t y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        if ((x < cols * channels ) && (y < rows))\r
-            if (mask[y * step_mask + x / channels] != 0)\r
-            {\r
-                size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;\r
-                mat[idx] = readScalar<T>(x % channels);\r
-            }\r
-    }\r
-    template <typename T>\r
-    void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)\r
-    {\r
-        writeScalar(scalar);\r
-\r
-        dim3 threadsPerBlock(32, 8, 1);\r
-        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
-\r
-        set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall ( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-\r
-    template <typename T>\r
-    void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)\r
-    {\r
-        writeScalar(scalar);\r
-\r
-        dim3 threadsPerBlock(32, 8, 1);\r
-        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
-\r
-        set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall ( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);\r
-    template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);\r
+}\r
+template <typename T>\r
+void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)\r
+{\r
+    writeScalar(scalar);\r
+\r
+    dim3 threadsPerBlock(32, 8, 1);\r
+    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
+\r
+    set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall ( cudaDeviceSynchronize() );\r
+}\r
+\r
+template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+\r
+template <typename T>\r
+void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)\r
+{\r
+    writeScalar(scalar);\r
+\r
+    dim3 threadsPerBlock(32, 8, 1);\r
+    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);\r
+\r
+    set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall ( cudaDeviceSynchronize() );\r
+}\r
+\r
+template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);\r
+template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);\r
+template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);\r
+template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);\r
+template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);\r
+template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);\r
+template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);\r
  \r
  ///////////////////////////////////////////////////////////////////////////\r
  //////////////////////////////// ConvertTo ////////////////////////////////\r
  ///////////////////////////////////////////////////////////////////////////\r
  \r
-    template <typename T, typename D> struct Convertor : unary_function<T, D>\r
+template <typename T, typename D> struct Convertor : unary_function<T, D>\r
+{\r
+    Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}\r
+\r
+    __device__ __forceinline__ D operator()(const T& src) const\r
      {\r
-        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}\r
+        return saturate_cast<D>(alpha * src + beta);\r
+    }\r
  \r
-        __device__ __forceinline__ D operator()(const T& src) const\r
-        {\r
-            return saturate_cast<D>(alpha * src + beta);\r
-        }\r
+    const double alpha, beta;\r
+};\r
  \r
-        const double alpha, beta;\r
+namespace detail\r
+{\r
+    template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>\r
+    {\r
      };\r
-\r
-    namespace detail\r
+    template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>\r
      {\r
-        template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>\r
-        {\r
-        };\r
-        template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_shift = 8 };\r
-        };\r
-        template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_shift = 4 };\r
-        };\r
-        template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_block_dim_y = 8 };\r
-            enum { smart_shift = 4 };\r
-        };\r
-\r
-        template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_shift = 4 };\r
-        };\r
-        template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_shift = 2 };\r
-        };\r
-\r
-        template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_block_dim_y = 8 };\r
-            enum { smart_shift = 4 };\r
-        };\r
-        template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>\r
-        {\r
-            enum { smart_block_dim_y = 8 };\r
-            enum { smart_shift = 2 };\r
-        };\r
+        enum { smart_shift = 8 };\r
+    };\r
+    template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>\r
+    {\r
+        enum { smart_shift = 4 };\r
+    };\r
+    template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>\r
+    {\r
+        enum { smart_block_dim_y = 8 };\r
+        enum { smart_shift = 4 };\r
+    };\r
  \r
-        template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>\r
-        {\r
-        };\r
-    }\r
+    template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>\r
+    {\r
+        enum { smart_shift = 4 };\r
+    };\r
+    template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>\r
+    {\r
+        enum { smart_shift = 2 };\r
+    };\r
  \r
-    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >\r
+    template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>\r
      {\r
+        enum { smart_block_dim_y = 8 };\r
+        enum { smart_shift = 4 };\r
      };\r
-        \r
-    template<typename T, typename D>\r
-    void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)\r
+    template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>\r
      {\r
-        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
-        cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
-        Convertor<T, D> op(alpha, beta);\r
-        transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);\r
-    }\r
+        enum { smart_block_dim_y = 8 };\r
+        enum { smart_shift = 2 };\r
+    };\r
  \r
-    void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, \r
-        cudaStream_t stream = 0)\r
+    template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>\r
      {\r
-        typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, \r
-            cudaStream_t stream);\r
+    };\r
+}\r
+\r
+template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >\r
+{\r
+};\r
+    \r
+template<typename T, typename D>\r
+void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)\r
+{\r
+    cudaSafeCall( cudaSetDoubleForDevice(&alpha) );\r
+    cudaSafeCall( cudaSetDoubleForDevice(&beta) );\r
+    Convertor<T, D> op(alpha, beta);\r
+    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);\r
+}\r
+\r
+void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, \r
+    cudaStream_t stream = 0)\r
+{\r
+    typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, \r
+        cudaStream_t stream);\r
+\r
+    static const caller_t tab[8][8] =\r
+    {\r
+        {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,\r
+        cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},\r
  \r
-        static const caller_t tab[8][8] =\r
-        {\r
-            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,\r
-            cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},\r
+        {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,\r
+        cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},\r
  \r
-            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,\r
-            cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},\r
+        {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,\r
+        cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},\r
  \r
-            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,\r
-            cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},\r
+        {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,\r
+        cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},\r
  \r
-            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,\r
-            cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},\r
+        {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,\r
+        cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},\r
  \r
-            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,\r
-            cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},\r
+        {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,\r
+        cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},\r
  \r
-            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,\r
-            cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},\r
+        {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,\r
+        cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},\r
  \r
-            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,\r
-            cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},\r
+        {0,0,0,0,0,0,0,0}\r
+    };\r
  \r
-            {0,0,0,0,0,0,0,0}\r
-        };\r
+    caller_t func = tab[sdepth][ddepth];\r
+    if (!func)\r
+        cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);\r
  \r
-        caller_t func = tab[sdepth][ddepth];\r
-        if (!func)\r
-            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);\r
+    func(src, dst, alpha, beta, stream);\r
+}\r
  \r
-        func(src, dst, alpha, beta, stream);\r
-    }\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu

index c0ca6c8..618e94a 100644 (file)
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -40,79 +40,73 @@
  //\r
  //M*/\r
  \r
+#include "internal_shared.hpp"\r
  #include "opencv2/gpu/device/limits.hpp"\r
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  #include "opencv2/gpu/device/vec_math.hpp"\r
  #include "opencv2/gpu/device/transform.hpp"\r
-#include "internal_shared.hpp"\r
-\r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
  \r
-namespace cv { namespace gpu { namespace mathfunc\r
-{\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    // Performs reduction in shared memory\r
-    template <int size, typename T>\r
-    __device__ void sumInSmem(volatile T* data, const uint tid)\r
-    {\r
-        T sum = data[tid];\r
+namespace matrix_reductions {\r
  \r
-        if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }\r
-        if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }\r
-        if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }\r
-\r
-        if (tid < 32)\r
-        {\r
-            if (size >= 64) data[tid] = sum = sum + data[tid + 32];\r
-            if (size >= 32) data[tid] = sum = sum + data[tid + 16];\r
-            if (size >= 16) data[tid] = sum = sum + data[tid + 8];\r
-            if (size >= 8) data[tid] = sum = sum + data[tid + 4];\r
-            if (size >= 4) data[tid] = sum = sum + data[tid + 2];\r
-            if (size >= 2) data[tid] = sum = sum + data[tid + 1];\r
-        }\r
-    }\r
+// Performs reduction in shared memory\r
+template <int size, typename T>\r
+__device__ void sumInSmem(volatile T* data, const uint tid)\r
+{\r
+    T sum = data[tid];\r
  \r
+    if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }\r
+    if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }\r
+    if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }\r
  \r
-    struct Mask8U\r
+    if (tid < 32)\r
      {\r
-        explicit Mask8U(PtrStepb mask): mask(mask) {}\r
-\r
-        __device__ __forceinline__ bool operator()(int y, int x) const \r
-        { \r
-            return mask.ptr(y)[x]; \r
-        }\r
-\r
-        PtrStepb mask;\r
-    };\r
+        if (size >= 64) data[tid] = sum = sum + data[tid + 32];\r
+        if (size >= 32) data[tid] = sum = sum + data[tid + 16];\r
+        if (size >= 16) data[tid] = sum = sum + data[tid + 8];\r
+        if (size >= 8) data[tid] = sum = sum + data[tid + 4];\r
+        if (size >= 4) data[tid] = sum = sum + data[tid + 2];\r
+        if (size >= 2) data[tid] = sum = sum + data[tid + 1];\r
+    }\r
+}\r
  \r
+struct Mask8U\r
+{\r
+    explicit Mask8U(PtrStepb mask): mask(mask) {}\r
  \r
-    struct MaskTrue \r
+    __device__ __forceinline__ bool operator()(int y, int x) const \r
      { \r
-        __device__ __forceinline__ bool operator()(int y, int x) const \r
-        { \r
-            return true; \r
-        } \r
-    };\r
-\r
-    //////////////////////////////////////////////////////////////////////////////\r
-    // Min max\r
-\r
-    // To avoid shared bank conflicts we convert each value into value of \r
-    // appropriate type (32 bits minimum)\r
-    template <typename T> struct MinMaxTypeTraits {};\r
-    template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };\r
-    template <> struct MinMaxTypeTraits<char> { typedef int best_type; };\r
-    template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };\r
-    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };\r
-    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };\r
-    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };\r
-    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };\r
-\r
+        return mask.ptr(y)[x]; \r
+    }\r
  \r
-    namespace minmax \r
-    {\r
+    PtrStepb mask;\r
+};\r
  \r
+struct MaskTrue \r
+{ \r
+    __device__ __forceinline__ bool operator()(int y, int x) const \r
+    { \r
+        return true; \r
+    } \r
+};\r
+\r
+//////////////////////////////////////////////////////////////////////////////\r
+// Min max\r
+\r
+// To avoid shared bank conflicts we convert each value into value of \r
+// appropriate type (32 bits minimum)\r
+template <typename T> struct MinMaxTypeTraits {};\r
+template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };\r
+template <> struct MinMaxTypeTraits<char> { typedef int best_type; };\r
+template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };\r
+template <> struct MinMaxTypeTraits<short> { typedef int best_type; };\r
+template <> struct MinMaxTypeTraits<int> { typedef int best_type; };\r
+template <> struct MinMaxTypeTraits<float> { typedef float best_type; };\r
+template <> struct MinMaxTypeTraits<double> { typedef double best_type; };\r
+\r
+namespace minmax \r
+{\r
      __constant__ int ctwidth;\r
      __constant__ int ctheight;\r
  \r
@@ -126,8 +120,8 @@ namespace cv { namespace gpu { namespace mathfunc
      {\r
          threads = dim3(32, 8);\r
          grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
-        grid.x = min(grid.x, threads.x);\r
-        grid.y = min(grid.y, threads.y);\r
+        grid.x = std::min(grid.x, threads.x);\r
+        grid.y = std::min(grid.y, threads.y);\r
      }\r
  \r
  \r
@@ -155,8 +149,8 @@ namespace cv { namespace gpu { namespace mathfunc
      template <typename T>\r
      __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)\r
      {\r
-        minval[tid] = min(minval[tid], minval[tid + offset]);\r
-        maxval[tid] = max(maxval[tid], maxval[tid + offset]);\r
+        minval[tid] = ::min(minval[tid], minval[tid + offset]);\r
+        maxval[tid] = ::max(maxval[tid], maxval[tid + offset]);\r
      }\r
  \r
  \r
@@ -192,8 +186,8 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
          T mymin = numeric_limits<T>::max();\r
          T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();\r
-        uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
-        uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);\r
+        uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
+        uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);\r
          for (uint y = y0; y < y_end; y += blockDim.y)\r
          {\r
              const T* src_row = (const T*)src.ptr(y);\r
@@ -202,8 +196,8 @@ namespace cv { namespace gpu { namespace mathfunc
                  T val = src_row[x];\r
                  if (mask(y, x)) \r
                  { \r
-                    mymin = min(mymin, val); \r
-                    mymax = max(mymax, val); \r
+                    mymin = ::min(mymin, val); \r
+                    mymax = ::max(mymax, val); \r
                  }\r
              }\r
          }\r
@@ -220,7 +214,7 @@ namespace cv { namespace gpu { namespace mathfunc
              maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
          }\r
  \r
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+#if __CUDA_ARCH__ >= 110\r
                 __shared__ bool is_last;\r
  \r
                 if (tid == 0)\r
@@ -237,7 +231,7 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
                 if (is_last)\r
                 {\r
-            uint idx = min(tid, gridDim.x * gridDim.y - 1);\r
+            uint idx = ::min(tid, gridDim.x * gridDim.y - 1);\r
  \r
              sminval[tid] = minval[idx];\r
              smaxval[tid] = maxval[idx];\r
@@ -332,7 +326,7 @@ namespace cv { namespace gpu { namespace mathfunc
          __shared__ best_type smaxval[nthreads];\r
          \r
          uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-        uint idx = min(tid, size - 1);\r
+        uint idx = ::min(tid, size - 1);\r
  \r
          sminval[tid] = minval[idx];\r
          smaxval[tid] = maxval[idx];\r
@@ -410,14 +404,13 @@ namespace cv { namespace gpu { namespace mathfunc
      template void minMaxMultipassCaller<short>(const DevMem2Db, double*, double*, PtrStepb);\r
      template void minMaxMultipassCaller<int>(const DevMem2Db, double*, double*, PtrStepb);\r
      template void minMaxMultipassCaller<float>(const DevMem2Db, double*, double*, PtrStepb);\r
-\r
-    } // namespace minmax\r
+} // namespace minmax\r
  \r
  ///////////////////////////////////////////////////////////////////////////////\r
  // minMaxLoc\r
  \r
-    namespace minmaxloc {\r
-\r
+namespace minmaxloc \r
+{\r
      __constant__ int ctwidth;\r
      __constant__ int ctheight;\r
  \r
@@ -431,8 +424,8 @@ namespace cv { namespace gpu { namespace mathfunc
      {\r
          threads = dim3(32, 8);\r
          grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
-        grid.x = min(grid.x, threads.x);\r
-        grid.y = min(grid.y, threads.y);\r
+        grid.x = std::min(grid.x, threads.x);\r
+        grid.y = std::min(grid.y, threads.y);\r
      }\r
  \r
  \r
@@ -513,12 +506,11 @@ namespace cv { namespace gpu { namespace mathfunc
          uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
  \r
          T mymin = numeric_limits<T>::max();\r
-        T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : \r
-                                                     numeric_limits<T>::min(); \r
+        T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min(); \r
          uint myminloc = 0;\r
          uint mymaxloc = 0;\r
-        uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
-        uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);\r
+        uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);\r
+        uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);\r
  \r
          for (uint y = y0; y < y_end; y += blockDim.y)\r
          {\r
@@ -542,7 +534,7 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
          findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
  \r
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+#if __CUDA_ARCH__ >= 110\r
                 __shared__ bool is_last;\r
  \r
                 if (tid == 0)\r
@@ -561,7 +553,7 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
                 if (is_last)\r
                 {\r
-            uint idx = min(tid, gridDim.x * gridDim.y - 1);\r
+            uint idx = ::min(tid, gridDim.x * gridDim.y - 1);\r
  \r
              sminval[tid] = minval[idx];\r
              smaxval[tid] = maxval[idx];\r
@@ -685,7 +677,7 @@ namespace cv { namespace gpu { namespace mathfunc
          __shared__ uint smaxloc[nthreads];\r
  \r
          uint tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-        uint idx = min(tid, size - 1);\r
+        uint idx = ::min(tid, size - 1);\r
  \r
          sminval[tid] = minval[idx];\r
          smaxval[tid] = maxval[idx];\r
@@ -787,15 +779,13 @@ namespace cv { namespace gpu { namespace mathfunc
      template void minMaxLocMultipassCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
      template void minMaxLocMultipassCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
      template void minMaxLocMultipassCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
-\r
-    } // namespace minmaxloc\r
+} // namespace minmaxloc\r
  \r
  //////////////////////////////////////////////////////////////////////////////////////////////////////////\r
  // countNonZero\r
  \r
-    namespace countnonzero \r
-    {\r
-\r
+namespace countnonzero \r
+{\r
      __constant__ int ctwidth;\r
      __constant__ int ctheight;\r
  \r
@@ -805,8 +795,8 @@ namespace cv { namespace gpu { namespace mathfunc
      {\r
          threads = dim3(32, 8);\r
          grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));\r
-        grid.x = min(grid.x, threads.x);\r
-        grid.y = min(grid.y, threads.y);\r
+        grid.x = std::min(grid.x, threads.x);\r
+        grid.y = std::min(grid.y, threads.y);\r
      }\r
  \r
  \r
@@ -850,7 +840,7 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
          sumInSmem<nthreads, uint>(scount, tid);\r
  \r
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+#if __CUDA_ARCH__ >= 110\r
                 __shared__ bool is_last;\r
  \r
                 if (tid == 0)\r
@@ -957,15 +947,14 @@ namespace cv { namespace gpu { namespace mathfunc
      template int countNonZeroMultipassCaller<int>(const DevMem2Db, PtrStepb);\r
      template int countNonZeroMultipassCaller<float>(const DevMem2Db, PtrStepb);\r
  \r
-    } // namespace countnonzero\r
+} // namespace countnonzero\r
  \r
  \r
-    //////////////////////////////////////////////////////////////////////////\r
-    // Sum\r
-\r
-    namespace sums \r
-    {\r
+//////////////////////////////////////////////////////////////////////////\r
+// Sum\r
  \r
+namespace sum\r
+{\r
      template <typename T> struct SumType {};\r
      template <> struct SumType<uchar> { typedef uint R; };\r
      template <> struct SumType<char> { typedef int R; };\r
@@ -979,7 +968,7 @@ namespace cv { namespace gpu { namespace mathfunc
      struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };\r
  \r
      template <typename R> \r
-    struct AbsOp { static __device__ __forceinline__ R call(R x) { return abs(x); } };\r
+    struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };\r
  \r
      template <>\r
      struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };\r
@@ -999,8 +988,8 @@ namespace cv { namespace gpu { namespace mathfunc
          threads = dim3(threads_x, threads_y);\r
          grid = dim3(divUp(cols, threads.x * threads.y), \r
                      divUp(rows, threads.y * threads.x));\r
-        grid.x = min(grid.x, threads.x);\r
-        grid.y = min(grid.y, threads.y);\r
+        grid.x = std::min(grid.x, threads.x);\r
+        grid.y = std::min(grid.y, threads.y);\r
      }\r
  \r
  \r
@@ -1044,7 +1033,7 @@ namespace cv { namespace gpu { namespace mathfunc
  \r
          sumInSmem<nthreads, R>(smem, tid);\r
  \r
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+#if __CUDA_ARCH__ >= 110\r
          __shared__ bool is_last;\r
  \r
          if (tid == 0)\r
@@ -1125,7 +1114,7 @@ namespace cv { namespace gpu { namespace mathfunc
          sumInSmem<nthreads, R>(smem, tid);\r
          sumInSmem<nthreads, R>(smem + nthreads, tid);\r
  \r
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+#if __CUDA_ARCH__ >= 110\r
          __shared__ bool is_last;\r
  \r
          if (tid == 0)\r
@@ -1232,7 +1221,7 @@ namespace cv { namespace gpu { namespace mathfunc
          sumInSmem<nthreads, R>(smem + nthreads, tid);\r
          sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
  \r
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+#if __CUDA_ARCH__ >= 110\r
          __shared__ bool is_last;\r
  \r
          if (tid == 0)\r
@@ -1349,7 +1338,7 @@ namespace cv { namespace gpu { namespace mathfunc
          sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);\r
          sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);\r
  \r
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+#if __CUDA_ARCH__ >= 110\r
          __shared__ bool is_last;\r
  \r
          if (tid == 0)\r
@@ -1437,13 +1426,9 @@ namespace cv { namespace gpu { namespace mathfunc
          }\r
      }\r
  \r
-    } // namespace sum\r
-\r
-\r
      template <typename T>\r
      void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
      {\r
-        using namespace sums;\r
          typedef typename SumType<T>::R R;\r
  \r
          dim3 threads, grid;\r
@@ -1515,7 +1500,6 @@ namespace cv { namespace gpu { namespace mathfunc
      template <typename T>\r
      void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
      {\r
-        using namespace sums;\r
          typedef typename SumType<T>::R R;\r
  \r
          dim3 threads, grid;\r
@@ -1565,7 +1549,6 @@ namespace cv { namespace gpu { namespace mathfunc
      template <typename T>\r
      void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
      {\r
-        using namespace sums;\r
          typedef typename SumType<T>::R R;\r
  \r
          dim3 threads, grid;\r
@@ -1637,7 +1620,6 @@ namespace cv { namespace gpu { namespace mathfunc
      template <typename T>\r
      void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
      {\r
-        using namespace sums;\r
          typedef typename SumType<T>::R R;\r
  \r
          dim3 threads, grid;\r
@@ -1687,7 +1669,6 @@ namespace cv { namespace gpu { namespace mathfunc
      template <typename T>\r
      void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
      {\r
-        using namespace sums;\r
          typedef typename SumType<T>::R R;\r
  \r
          dim3 threads, grid;\r
@@ -1759,7 +1740,6 @@ namespace cv { namespace gpu { namespace mathfunc
      template <typename T>\r
      void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)\r
      {\r
-        using namespace sums;\r
          typedef typename SumType<T>::R R;\r
  \r
          dim3 threads, grid;\r
@@ -1804,301 +1784,305 @@ namespace cv { namespace gpu { namespace mathfunc
      template void sqrSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);\r
      template void sqrSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);\r
      template void sqrSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);\r
+} // namespace sum\r
  \r
-    //////////////////////////////////////////////////////////////////////////////\r
-    // reduce\r
+//////////////////////////////////////////////////////////////////////////////\r
+// reduce\r
  \r
-    template <typename S> struct SumReductor\r
+template <typename S> struct SumReductor\r
+{\r
+    __device__ __forceinline__ S startValue() const\r
      {\r
-        __device__ __forceinline__ S startValue() const\r
-        {\r
-            return 0;\r
-        }\r
-\r
-        __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const\r
-        {\r
-            return a + b;\r
-        }\r
+        return 0;\r
+    }\r
  \r
-        __device__ __forceinline__ S result(S r, double) const\r
-        {\r
-            return r;\r
-        }\r
-    };\r
+    __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const\r
+    {\r
+        return a + b;\r
+    }\r
  \r
-    template <typename S> struct AvgReductor\r
+    __device__ __forceinline__ S result(S r, double) const\r
      {\r
-        __device__ __forceinline__ S startValue() const\r
-        {\r
-            return 0;\r
-        }\r
+        return r;\r
+    }\r
+};\r
  \r
-        __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const\r
-        {\r
-            return a + b;\r
-        }\r
+template <typename S> struct AvgReductor\r
+{\r
+    __device__ __forceinline__ S startValue() const\r
+    {\r
+        return 0;\r
+    }\r
  \r
-        __device__ __forceinline__ double result(S r, double sz) const\r
-        {\r
-            return r / sz;\r
-        }\r
-    };\r
+    __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const\r
+    {\r
+        return a + b;\r
+    }\r
  \r
-    template <typename S> struct MinReductor\r
+    __device__ __forceinline__ double result(S r, double sz) const\r
      {\r
-        __device__ __forceinline__ S startValue() const\r
-        {\r
-            return numeric_limits<S>::max();\r
-        }\r
+        return r / sz;\r
+    }\r
+};\r
  \r
-        template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const\r
-        {\r
-            return saturate_cast<T>(::min(a, b));\r
-        }\r
-        __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const\r
-        {\r
-            return ::fmin(a, b);\r
-        }\r
+template <typename S> struct MinReductor\r
+{\r
+    __device__ __forceinline__ S startValue() const\r
+    {\r
+        return numeric_limits<S>::max();\r
+    }\r
  \r
-        __device__ __forceinline__ S result(S r, double) const\r
-        {\r
-            return r;\r
-        }\r
-    };\r
+    template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const\r
+    {\r
+        return saturate_cast<T>(::min(a, b));\r
+    }\r
+    __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const\r
+    {\r
+        return ::fmin(a, b);\r
+    }\r
  \r
-    template <typename S> struct MaxReductor\r
+    __device__ __forceinline__ S result(S r, double) const\r
      {\r
-        __device__ __forceinline__ S startValue() const\r
-        {\r
-            return numeric_limits<S>::min();\r
-        }\r
+        return r;\r
+    }\r
+};\r
  \r
-        template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const\r
-        {\r
-            return ::max(a, b);\r
-        }\r
-        __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const\r
-        {\r
-            return ::fmax(a, b);\r
-        }\r
+template <typename S> struct MaxReductor\r
+{\r
+    __device__ __forceinline__ S startValue() const\r
+    {\r
+        return numeric_limits<S>::min();\r
+    }\r
  \r
-        __device__ __forceinline__ S result(S r, double) const\r
-        {\r
-            return r;\r
-        }\r
-    };\r
+    template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const\r
+    {\r
+        return ::max(a, b);\r
+    }\r
+    __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const\r
+    {\r
+        return ::fmax(a, b);\r
+    }\r
  \r
-    template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)\r
+    __device__ __forceinline__ S result(S r, double) const\r
      {\r
-        __shared__ S smem[16 * 16];\r
+        return r;\r
+    }\r
+};\r
  \r
-        const int x = blockIdx.x * 16 + threadIdx.x;\r
+template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)\r
+{\r
+    __shared__ S smem[16 * 16];\r
  \r
-        S myVal = op.startValue();\r
+    const int x = blockIdx.x * 16 + threadIdx.x;\r
  \r
-        if (x < src.cols)\r
-        {\r
-            for (int y = threadIdx.y; y < src.rows; y += 16)\r
-                myVal = op(myVal, src.ptr(y)[x]);\r
-        }        \r
+    S myVal = op.startValue();\r
  \r
-        smem[threadIdx.x * 16 + threadIdx.y] = myVal;\r
-        __syncthreads();\r
+    if (x < src.cols)\r
+    {\r
+        for (int y = threadIdx.y; y < src.rows; y += 16)\r
+            myVal = op(myVal, src.ptr(y)[x]);\r
+    }        \r
  \r
-        if (threadIdx.x < 8)\r
-        {\r
-            volatile S* srow = smem + threadIdx.y * 16;\r
-            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]);\r
-            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);\r
-            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);\r
-            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);\r
-        }\r
-        __syncthreads();\r
+    smem[threadIdx.x * 16 + threadIdx.y] = myVal;\r
+    __syncthreads();\r
  \r
-        if (threadIdx.y == 0 && x < src.cols)\r
-            dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));\r
+    if (threadIdx.x < 8)\r
+    {\r
+        volatile S* srow = smem + threadIdx.y * 16;\r
+        srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]);\r
+        srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);\r
+        srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);\r
+        srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);\r
      }\r
+    __syncthreads();\r
  \r
-    template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)\r
-    {\r
-        const dim3 block(16, 16);\r
-        const dim3 grid(divUp(src.cols, block.x));\r
+    if (threadIdx.y == 0 && x < src.cols)\r
+        dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));\r
+}\r
  \r
-        Op<S> op;\r
-        reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);\r
-        cudaSafeCall( cudaGetLastError() );\r
+template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)\r
+{\r
+    const dim3 block(16, 16);\r
+    const dim3 grid(divUp(src.cols, block.x));\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+    Op<S> op;\r
+    reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+}\r
  \r
-    template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)\r
+template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
+\r
+    static const caller_t callers[] = \r
      {\r
-        typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
+        reduceRows_caller<SumReductor, T, S, D>, \r
+        reduceRows_caller<AvgReductor, T, S, D>, \r
+        reduceRows_caller<MaxReductor, T, S, D>, \r
+        reduceRows_caller<MinReductor, T, S, D>\r
+    };\r
  \r
-        static const caller_t callers[] = \r
-        {\r
-            reduceRows_caller<SumReductor, T, S, D>, \r
-            reduceRows_caller<AvgReductor, T, S, D>, \r
-            reduceRows_caller<MaxReductor, T, S, D>, \r
-            reduceRows_caller<MinReductor, T, S, D>\r
-        };\r
+    callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);\r
+}\r
  \r
-        callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);\r
-    }\r
+template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);  \r
  \r
-    template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);  \r
+template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
  \r
-    template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
  \r
-    template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
  \r
-    template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceRows_gpu<float, float, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
  \r
-    template void reduceRows_gpu<float, float, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
  \r
  \r
+template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)\r
+{\r
+    __shared__ S smem[256 * cn];\r
  \r
-    template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)\r
-    {\r
-        __shared__ S smem[256 * cn];\r
+    const int y = blockIdx.x;\r
+\r
+    const T* src_row = src.ptr(y);\r
  \r
-        const int y = blockIdx.x;\r
+    S myVal[cn];\r
  \r
-        const T* src_row = src.ptr(y);\r
+    #pragma unroll\r
+    for (int c = 0; c < cn; ++c)\r
+        myVal[c] = op.startValue();\r
  \r
-        S myVal[cn];\r
+#if __CUDA_ARCH__ >= 200\r
  \r
+    // For cc >= 2.0 prefer L1 cache\r
+    for (int x = threadIdx.x; x < src.cols; x += 256)\r
+    {\r
          #pragma unroll\r
          for (int c = 0; c < cn; ++c)\r
-            myVal[c] = op.startValue();\r
+            myVal[c] = op(myVal[c], src_row[x * cn + c]);\r
+    }\r
  \r
-#if __CUDA_ARCH__ >= 200\r
+#else // __CUDA_ARCH__ >= 200\r
  \r
-        // For cc >= 2.0 prefer L1 cache\r
-        for (int x = threadIdx.x; x < src.cols; x += 256)\r
+    // For older arch use shared memory for cache\r
+    for (int x = 0; x < src.cols; x += 256)\r
+    {\r
+        #pragma unroll\r
+        for (int c = 0; c < cn; ++c)\r
          {\r
-            #pragma unroll\r
-            for (int c = 0; c < cn; ++c)\r
-                myVal[c] = op(myVal[c], src_row[x * cn + c]);\r
+            smem[c * 256 + threadIdx.x] = op.startValue();\r
+            const int load_x = x * cn + c * 256 + threadIdx.x;\r
+            if (load_x < src.cols * cn)\r
+                smem[c * 256 + threadIdx.x] = src_row[load_x];\r
          }\r
+        __syncthreads();\r
  \r
-#else // __CUDA_ARCH__ >= 200\r
+        #pragma unroll\r
+        for (int c = 0; c < cn; ++c)\r
+            myVal[c] = op(myVal[c], smem[threadIdx.x * cn + c]);\r
+        __syncthreads();\r
+    }\r
  \r
-        // For older arch use shared memory for cache\r
-        for (int x = 0; x < src.cols; x += 256)\r
-        {\r
-            #pragma unroll\r
-            for (int c = 0; c < cn; ++c)\r
-            {\r
-                smem[c * 256 + threadIdx.x] = op.startValue();\r
-                const int load_x = x * cn + c * 256 + threadIdx.x;\r
-                if (load_x < src.cols * cn)\r
-                    smem[c * 256 + threadIdx.x] = src_row[load_x];\r
-            }\r
-            __syncthreads();\r
+#endif // __CUDA_ARCH__ >= 200\r
  \r
-            #pragma unroll\r
-            for (int c = 0; c < cn; ++c)\r
-                myVal[c] = op(myVal[c], smem[threadIdx.x * cn + c]);\r
-            __syncthreads();\r
-        }\r
+    #pragma unroll\r
+    for (int c = 0; c < cn; ++c)\r
+        smem[c * 256 + threadIdx.x] = myVal[c];\r
+    __syncthreads();\r
  \r
-#endif // __CUDA_ARCH__ >= 200\r
+    if (threadIdx.x < 128)\r
+    {\r
+        #pragma unroll\r
+        for (int c = 0; c < cn; ++c)\r
+            smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);\r
+    }\r
+    __syncthreads();\r
  \r
+    if (threadIdx.x < 64)\r
+    {\r
          #pragma unroll\r
          for (int c = 0; c < cn; ++c)\r
-            smem[c * 256 + threadIdx.x] = myVal[c];\r
-        __syncthreads();\r
+            smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);\r
+    }\r
+    __syncthreads();\r
  \r
-        if (threadIdx.x < 128)\r
-        {\r
-            #pragma unroll\r
-            for (int c = 0; c < cn; ++c)\r
-                smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);\r
-        }\r
-        __syncthreads();\r
+    volatile S* sdata = smem;\r
  \r
-        if (threadIdx.x < 64)\r
+    if (threadIdx.x < 32)\r
+    {\r
+        #pragma unroll\r
+        for (int c = 0; c < cn; ++c)\r
          {\r
-            #pragma unroll\r
-            for (int c = 0; c < cn; ++c)\r
-                smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);\r
+            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);\r
+            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);\r
+            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);\r
+            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);\r
+            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);\r
+            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);\r
          }\r
-        __syncthreads();\r
+    }\r
+    __syncthreads();\r
  \r
-        volatile S* sdata = smem;\r
+    if (threadIdx.x < cn)\r
+        dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));\r
+}\r
  \r
-        if (threadIdx.x < 32)\r
-        {\r
-            #pragma unroll\r
-            for (int c = 0; c < cn; ++c)\r
-            {\r
-                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);\r
-                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);\r
-                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);\r
-                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);\r
-                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);\r
-                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);\r
-            }\r
-        }\r
-        __syncthreads();\r
-\r
-        if (threadIdx.x < cn)\r
-            dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));\r
-    }\r
+template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)\r
+{\r
+    const dim3 block(256);\r
+    const dim3 grid(src.rows);\r
  \r
-    template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)\r
-    {\r
-        const dim3 block(256);\r
-        const dim3 grid(src.rows);\r
+    Op<S> op;\r
+    reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        Op<S> op;\r
-        reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    }\r
+template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
  \r
-    template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)\r
+    static const caller_t callers[4][4] = \r
      {\r
-        typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);\r
+        {reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},\r
+        {reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},\r
+        {reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},\r
+        {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},\r
+    };\r
  \r
-        static const caller_t callers[4][4] = \r
-        {\r
-            {reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},\r
-            {reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},\r
-            {reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},\r
-            {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},\r
-        };\r
+    callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);\r
+}\r
  \r
-        callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);\r
-    }\r
+template void reduceCols_gpu<uchar, int, uchar>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+\r
+template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
+template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);                  \r
+template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
  \r
-    template void reduceCols_gpu<uchar, int, uchar>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-    template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);  \r
+template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);                  \r
+template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);  \r
  \r
-    template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); \r
-    template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);                  \r
-    template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);                  \r
+template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
  \r
-    template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);  \r
-    template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);                  \r
-    template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);  \r
+template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
  \r
-    template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);                  \r
-    template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
+} // namespace matrix_reductions\r
  \r
-    template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
- }}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/pyr_down.cu b/modules/gpu/src/cuda/pyr_down.cu

index 385df7a..7c5077e 100644 (file)
--- a/modules/gpu/src/cuda/pyr_down.cu
+++ b/modules/gpu/src/cuda/pyr_down.cu
@@ -46,140 +46,142 @@
  #include "opencv2/gpu/device/vec_math.hpp"\r
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+namespace pyr_down {\r
+\r
+template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)\r
  {\r
-    template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)\r
-    {\r
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
+    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
+\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y;\r
+\r
+    __shared__ value_type smem[256 + 4];\r
+\r
+    value_type sum;\r
+    \r
+    const int src_y = 2*y;\r
+\r
+    sum = VecTraits<value_type>::all(0);\r
+    \r
+    sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);\r
+    sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);\r
+    sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);\r
+    sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);\r
+    sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);\r
+\r
+    smem[2 + threadIdx.x] = sum;\r
  \r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y;\r
+    if (threadIdx.x < 2)\r
+    {\r
+        const int left_x = x - 2 + threadIdx.x;\r
  \r
-        __shared__ value_type smem[256 + 4];\r
+        sum = VecTraits<value_type>::all(0);\r
+    \r
+        sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);\r
+        sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);\r
+        sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);\r
+        sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);\r
+        sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);\r
+\r
+        smem[threadIdx.x] = sum;\r
+    }\r
  \r
-        value_type sum;\r
-        \r
-        const int src_y = 2*y;\r
+    if (threadIdx.x > 253)\r
+    {\r
+        const int right_x = x + threadIdx.x + 2;\r
  \r
          sum = VecTraits<value_type>::all(0);\r
-        \r
-        sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);\r
-        sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);\r
-        sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);\r
-        sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);\r
-        sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);\r
-\r
-        smem[2 + threadIdx.x] = sum;\r
-\r
-        if (threadIdx.x < 2)\r
-        {\r
-            const int left_x = x - 2 + threadIdx.x;\r
-\r
-            sum = VecTraits<value_type>::all(0);\r
-        \r
-            sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);\r
-            sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);\r
-            sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);\r
-            sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);\r
-            sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);\r
-\r
-            smem[threadIdx.x] = sum;\r
-        }\r
-\r
-        if (threadIdx.x > 253)\r
-        {\r
-            const int right_x = x + threadIdx.x + 2;\r
-\r
-            sum = VecTraits<value_type>::all(0);\r
-        \r
-            sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);\r
-            sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);\r
-            sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);\r
-            sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);\r
-            sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);\r
-\r
-            smem[4 + threadIdx.x] = sum;\r
-        }\r
-\r
-        __syncthreads();\r
-\r
-        if (threadIdx.x < 128)\r
-        {\r
-            const int tid2 = threadIdx.x * 2;\r
-\r
-            sum = VecTraits<value_type>::all(0);\r
-\r
-            sum = sum + 0.0625f * smem[2 + tid2 - 2];\r
-            sum = sum + 0.25f   * smem[2 + tid2 - 1];\r
-            sum = sum + 0.375f  * smem[2 + tid2    ];\r
-            sum = sum + 0.25f   * smem[2 + tid2 + 1];\r
-            sum = sum + 0.0625f * smem[2 + tid2 + 2];\r
-\r
-            const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;\r
-\r
-            if (dst_x < dst_cols)\r
-                dst.ptr(y)[dst_x] = saturate_cast<T>(sum);\r
-        }\r
+    \r
+        sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);\r
+        sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);\r
+        sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);\r
+        sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);\r
+        sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);\r
+\r
+        smem[4 + threadIdx.x] = sum;\r
      }\r
  \r
-    template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+    __syncthreads();\r
+\r
+    if (threadIdx.x < 128)\r
      {\r
-        const dim3 block(256);\r
-        const dim3 grid(divUp(src.cols, block.x), dst.rows);\r
+        const int tid2 = threadIdx.x * 2;\r
+\r
+        sum = VecTraits<value_type>::all(0);\r
  \r
-        B<T> b(src.rows, src.cols);\r
+        sum = sum + 0.0625f * smem[2 + tid2 - 2];\r
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];\r
+        sum = sum + 0.375f  * smem[2 + tid2    ];\r
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];\r
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];\r
  \r
-        pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);\r
-        cudaSafeCall( cudaGetLastError() );\r
+        const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+        if (dst_x < dst_cols)\r
+            dst.ptr(y)[dst_x] = saturate_cast<T>(sum);\r
      }\r
+}\r
  \r
-    template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)\r
+template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+{\r
+    const dim3 block(256);\r
+    const dim3 grid(divUp(src.cols, block.x), dst.rows);\r
+\r
+    B<T> b(src.rows, src.cols);\r
+\r
+    pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)\r
+{\r
+    typedef typename TypeVec<T, cn>::vec_type type;\r
+\r
+    typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);\r
+\r
+    static const caller_t callers[] = \r
      {\r
-        typedef typename TypeVec<T, cn>::vec_type type;\r
+        pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>\r
+    };\r
  \r
-        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);\r
+    callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);\r
+}\r
  \r
-        static const caller_t callers[] = \r
-        {\r
-            pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>\r
-        };\r
+template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
  \r
-        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);\r
-    }\r
+template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+} // namespace pyr_down\r
  \r
-    template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/pyr_up.cu b/modules/gpu/src/cuda/pyr_up.cu

index 3555b89..35ddb3a 100644 (file)
--- a/modules/gpu/src/cuda/pyr_up.cu
+++ b/modules/gpu/src/cuda/pyr_up.cu
@@ -46,135 +46,137 @@
  #include "opencv2/gpu/device/vec_math.hpp"\r
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+namespace pyr_up {\r
+\r
+template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)\r
  {\r
-    template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)\r
-    {\r
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
+    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;\r
+\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+    __shared__ T smem1[10][10];\r
+    __shared__ value_type smem2[20][16];\r
  \r
-        __shared__ T smem1[10][10];\r
-        __shared__ value_type smem2[20][16];\r
+    value_type sum;\r
  \r
-        value_type sum;\r
+    if (threadIdx.x < 10 && threadIdx.y < 10)\r
+        smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);\r
  \r
-        if (threadIdx.x < 10 && threadIdx.y < 10)\r
-            smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);\r
+    __syncthreads();\r
  \r
-        __syncthreads();\r
+    const int tidx = threadIdx.x;\r
  \r
-        const int tidx = threadIdx.x;\r
+    sum = VecTraits<value_type>::all(0);\r
  \r
+    sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];\r
+    sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];\r
+    sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[1 + threadIdx.y / 2][1 + ((tidx    ) >> 1)];\r
+    sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];\r
+    sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];\r
+\r
+    smem2[2 + threadIdx.y][tidx] = sum;\r
+\r
+    if (threadIdx.y < 2)\r
+    {\r
          sum = VecTraits<value_type>::all(0);\r
  \r
-        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];\r
-        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];\r
-        sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[1 + threadIdx.y / 2][1 + ((tidx    ) >> 1)];\r
-        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];\r
-        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];\r
+        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];\r
+        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx - 1) >> 1)];\r
+        sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[0][1 + ((tidx    ) >> 1)];\r
+        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx + 1) >> 1)];\r
+        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];\r
  \r
-        smem2[2 + threadIdx.y][tidx] = sum;\r
+        smem2[threadIdx.y][tidx] = sum;\r
+    }\r
  \r
-        if (threadIdx.y < 2)\r
-        {\r
-            sum = VecTraits<value_type>::all(0);\r
+    if (threadIdx.y > 13)\r
+    {\r
+        sum = VecTraits<value_type>::all(0);\r
  \r
-            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];\r
-            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx - 1) >> 1)];\r
-            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[0][1 + ((tidx    ) >> 1)];\r
-            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx + 1) >> 1)];\r
-            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];\r
+        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];\r
+        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx - 1) >> 1)];\r
+        sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[9][1 + ((tidx    ) >> 1)];\r
+        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx + 1) >> 1)];\r
+        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];\r
  \r
-            smem2[threadIdx.y][tidx] = sum;\r
-        }\r
+        smem2[4 + threadIdx.y][tidx] = sum;\r
+    }\r
  \r
-        if (threadIdx.y > 13)\r
-        {\r
-            sum = VecTraits<value_type>::all(0);\r
+    __syncthreads();\r
  \r
-            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];\r
-            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx - 1) >> 1)];\r
-            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[9][1 + ((tidx    ) >> 1)];\r
-            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx + 1) >> 1)];\r
-            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];\r
+    sum = VecTraits<value_type>::all(0);\r
  \r
-            smem2[4 + threadIdx.y][tidx] = sum;\r
-        }\r
+    sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];\r
+    sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y - 1][tidx];\r
+    sum = sum + (tidx % 2 == 0) * 0.375f  * smem2[2 + threadIdx.y    ][tidx];\r
+    sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y + 1][tidx];\r
+    sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];\r
  \r
-        __syncthreads();\r
+    if (x < dst.cols && y < dst.rows)\r
+        dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);\r
+}\r
  \r
-        sum = VecTraits<value_type>::all(0);\r
+template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+{\r
+    const dim3 block(16, 16);\r
+    const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
  \r
-        sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];\r
-        sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y - 1][tidx];\r
-        sum = sum + (tidx % 2 == 0) * 0.375f  * smem2[2 + threadIdx.y    ][tidx];\r
-        sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y + 1][tidx];\r
-        sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];\r
+    B<T> b(src.rows, src.cols);\r
  \r
-        if (x < dst.cols && y < dst.rows)\r
-            dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);\r
-    }\r
+    pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)\r
+{\r
+    typedef typename TypeVec<T, cn>::vec_type type;\r
  \r
-    template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+    typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);\r
+\r
+    static const caller_t callers[] = \r
      {\r
-        const dim3 block(16, 16);\r
-        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+        pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>\r
+    };\r
  \r
-        B<T> b(src.rows, src.cols);\r
+    callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);\r
+}\r
  \r
-        pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);\r
-        cudaSafeCall( cudaGetLastError() );\r
+template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
  \r
-    template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)\r
-    {\r
-        typedef typename TypeVec<T, cn>::vec_type type;\r
+template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
  \r
-        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);\r
+template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
  \r
-        static const caller_t callers[] = \r
-        {\r
-            pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>\r
-        };\r
+template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
  \r
-        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);\r
-    }\r
+template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
+\r
+} // namespace pyr_up\r
  \r
-    template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-\r
-    template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-    template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/remap.cu b/modules/gpu/src/cuda/remap.cu

index 2f5f3cf..0fda25e 100644 (file)
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
@@ -47,64 +47,62 @@
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  #include "opencv2/gpu/device/filters.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
-{\r
+namespace remap {\r
      \r
-    template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)\r
-    {\r
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)\r
+{\r
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
  \r
-        if (x < dst.cols && y < dst.rows)\r
-        {\r
-            const float xcoo = mapx.ptr(y)[x];\r
-            const float ycoo = mapy.ptr(y)[x];\r
+    if (x < dst.cols && y < dst.rows)\r
+    {\r
+        const float xcoo = mapx.ptr(y)[x];\r
+        const float ycoo = mapy.ptr(y)[x];\r
  \r
-            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));\r
-        }\r
+        dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));\r
      }\r
+}\r
  \r
-    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream\r
+template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream\r
+{\r
+    static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, \r
+        const float* borderValue, cudaStream_t stream, int)\r
      {\r
-        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, \r
-            const float* borderValue, cudaStream_t stream, int)\r
-        {\r
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
-            \r
-            dim3 block(32, 8);\r
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
-            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));\r
-            BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);\r
-            Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);\r
-\r
-            remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);\r
-            cudaSafeCall( cudaGetLastError() );\r
-        }\r
-    };\r
-    \r
-    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream\r
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
+        \r
+        dim3 block(32, 8);\r
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+\r
+        B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));\r
+        BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);\r
+        Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);\r
+\r
+        remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);\r
+        cudaSafeCall( cudaGetLastError() );\r
+    }\r
+};\r
+\r
+template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream\r
+{\r
+    static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)\r
      {\r
-        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)\r
-        {\r
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
-            \r
-            dim3 block(32, 8);\r
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; \r
+        \r
+        dim3 block(32, 8);\r
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
  \r
-            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));\r
-            BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);\r
-            Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);\r
+        B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));\r
+        BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);\r
+        Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);\r
  \r
-            remap<<<grid, block>>>(filter_src, mapx, mapy, dst);\r
-            cudaSafeCall( cudaGetLastError() );\r
+        remap<<<grid, block>>>(filter_src, mapx, mapy, dst);\r
+        cudaSafeCall( cudaGetLastError() );\r
  \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+    }\r
+};\r
  \r
  #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \\r
      texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \\r
@@ -124,7 +122,7 @@ namespace cv { namespace gpu { namespace imgproc
              typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \\r
              dim3 block(32, cc >= 20 ? 8 : 4); \\r
              dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
-            TextureBinder texHandler(&tex_remap_ ## type , src); \\r
+            bindTexture(&tex_remap_ ## type , src); \\r
              tex_remap_ ## type ##_reader texSrc; \\r
              B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \\r
              BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \\r
@@ -140,7 +138,7 @@ namespace cv { namespace gpu { namespace imgproc
          { \\r
              dim3 block(32, 8); \\r
              dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
-            TextureBinder texHandler(&tex_remap_ ## type , src); \\r
+            bindTexture(&tex_remap_ ## type , src); \\r
              tex_remap_ ## type ##_reader texSrc; \\r
              Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \\r
              remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \\r
@@ -149,105 +147,108 @@ namespace cv { namespace gpu { namespace imgproc
          } \\r
      };\r
      \r
-    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)\r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)\r
-    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)\r
-    \r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)\r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)\r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)\r
-    \r
-    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)\r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)\r
-    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)\r
-    \r
-    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)\r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)\r
-    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)\r
-    \r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)\r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)\r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)\r
-    \r
-    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)\r
-    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)\r
-    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)\r
-    \r
+OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)\r
+OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)\r
+\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)\r
+\r
+OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)\r
+OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)\r
+\r
+OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)\r
+OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)\r
+\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)\r
+\r
+OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)\r
+//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)\r
+OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)\r
+\r
  #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX\r
  \r
-    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher\r
-    { \r
-        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, \r
-            const float* borderValue, cudaStream_t stream, int cc)\r
-        {\r
-            if (stream == 0)\r
-                RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);\r
-            else\r
-                RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);\r
+template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher\r
+{ \r
+    static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, \r
+        const float* borderValue, cudaStream_t stream, int cc)\r
+    {\r
+        if (stream == 0)\r
+            RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);\r
+        else\r
+            RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);\r
+    }\r
+};\r
+\r
+template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, \r
+    int borderMode, const float* borderValue, cudaStream_t stream, int cc)\r
+{\r
+    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, \r
+        const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+    static const caller_t callers[3][5] = \r
+    {\r
+        { \r
+            RemapDispatcher<PointFilter, BrdReflect101, T>::call, \r
+            RemapDispatcher<PointFilter, BrdReplicate, T>::call, \r
+            RemapDispatcher<PointFilter, BrdConstant, T>::call, \r
+            RemapDispatcher<PointFilter, BrdReflect, T>::call, \r
+            RemapDispatcher<PointFilter, BrdWrap, T>::call \r
+        },\r
+        { \r
+            RemapDispatcher<LinearFilter, BrdReflect101, T>::call, \r
+            RemapDispatcher<LinearFilter, BrdReplicate, T>::call, \r
+            RemapDispatcher<LinearFilter, BrdConstant, T>::call, \r
+            RemapDispatcher<LinearFilter, BrdReflect, T>::call, \r
+            RemapDispatcher<LinearFilter, BrdWrap, T>::call \r
+        },\r
+        { \r
+            RemapDispatcher<CubicFilter, BrdReflect101, T>::call, \r
+            RemapDispatcher<CubicFilter, BrdReplicate, T>::call, \r
+            RemapDispatcher<CubicFilter, BrdConstant, T>::call, \r
+            RemapDispatcher<CubicFilter, BrdReflect, T>::call, \r
+            RemapDispatcher<CubicFilter, BrdWrap, T>::call \r
          }\r
      };\r
  \r
-    template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, \r
-        int borderMode, const float* borderValue, cudaStream_t stream, int cc)\r
-    {\r
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, \r
-            const float* borderValue, cudaStream_t stream, int cc);\r
-\r
-        static const caller_t callers[3][5] = \r
-        {\r
-            { \r
-                RemapDispatcher<PointFilter, BrdReflect101, T>::call, \r
-                RemapDispatcher<PointFilter, BrdReplicate, T>::call, \r
-                RemapDispatcher<PointFilter, BrdConstant, T>::call, \r
-                RemapDispatcher<PointFilter, BrdReflect, T>::call, \r
-                RemapDispatcher<PointFilter, BrdWrap, T>::call \r
-            },\r
-            { \r
-                RemapDispatcher<LinearFilter, BrdReflect101, T>::call, \r
-                RemapDispatcher<LinearFilter, BrdReplicate, T>::call, \r
-                RemapDispatcher<LinearFilter, BrdConstant, T>::call, \r
-                RemapDispatcher<LinearFilter, BrdReflect, T>::call, \r
-                RemapDispatcher<LinearFilter, BrdWrap, T>::call \r
-            },\r
-            { \r
-                RemapDispatcher<CubicFilter, BrdReflect101, T>::call, \r
-                RemapDispatcher<CubicFilter, BrdReplicate, T>::call, \r
-                RemapDispatcher<CubicFilter, BrdConstant, T>::call, \r
-                RemapDispatcher<CubicFilter, BrdReflect, T>::call, \r
-                RemapDispatcher<CubicFilter, BrdWrap, T>::call \r
-            }\r
-        };\r
-\r
-        callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);\r
-    }\r
+    callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);\r
+}\r
  \r
-    template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    \r
-    //template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    \r
-    template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    \r
-    template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    \r
-    //template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    \r
-    template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    //template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-    template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-}}}\r
+template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+//template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+//template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+//template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+\r
+} // namespace remap\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu

index 78871f7..b797162 100644 (file)
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -47,102 +47,100 @@
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  #include "opencv2/gpu/device/filters.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
-{\r
+namespace resize {\r
      \r
-    template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)\r
-    {\r
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)\r
+{\r
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
  \r
-        if (x < dst.cols && y < dst.rows)\r
-        {\r
-            const float xcoo = x / fx;\r
-            const float ycoo = y / fy;\r
+    if (x < dst.cols && y < dst.rows)\r
+    {\r
+        const float xcoo = x / fx;\r
+        const float ycoo = y / fy;\r
  \r
-            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));\r
-        }\r
+        dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));\r
      }\r
-    template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)\r
+}\r
+template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)\r
+{\r
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+\r
+    if (x < dst.cols && y < dst.rows)\r
      {\r
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
+        const float xcoo = x / fx;\r
+        const float ycoo = y / fy;\r
+\r
+        dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));\r
+    }\r
+}\r
+\r
+template <template <typename> class Filter, typename T> struct ResizeDispatcherStream\r
+{\r
+    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+    {            \r
+        dim3 block(32, 8);\r
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
  \r
-        if (x < dst.cols && y < dst.rows)\r
-        {\r
-            const float xcoo = x / fx;\r
-            const float ycoo = y / fy;\r
+        BrdReplicate<T> brd(src.rows, src.cols);\r
+        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
+        Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);\r
  \r
-            dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));\r
-        }\r
+        resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);\r
+        cudaSafeCall( cudaGetLastError() );\r
      }\r
+};\r
+template <typename T> struct ResizeDispatcherStream<PointFilter, T>\r
+{\r
+    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+    {            \r
+        dim3 block(32, 8);\r
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
  \r
-    template <template <typename> class Filter, typename T> struct ResizeDispatcherStream\r
-    {\r
-        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-        {            \r
-            dim3 block(32, 8);\r
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
-\r
-            BrdReplicate<T> brd(src.rows, src.cols);\r
-            BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
-            Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);\r
-\r
-            resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);\r
-            cudaSafeCall( cudaGetLastError() );\r
-        }\r
-    };\r
-    template <typename T> struct ResizeDispatcherStream<PointFilter, T>\r
-    {\r
-        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-        {            \r
-            dim3 block(32, 8);\r
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+        BrdReplicate<T> brd(src.rows, src.cols);\r
+        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
  \r
-            BrdReplicate<T> brd(src.rows, src.cols);\r
-            BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
+        resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);\r
+        cudaSafeCall( cudaGetLastError() );\r
+    }\r
+};\r
  \r
-            resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);\r
-            cudaSafeCall( cudaGetLastError() );\r
-        }\r
-    };\r
-    \r
-    template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream\r
-    {\r
-        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)\r
-        {            \r
-            dim3 block(32, 8);\r
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream\r
+{\r
+    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)\r
+    {            \r
+        dim3 block(32, 8);\r
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
  \r
-            BrdReplicate<T> brd(src.rows, src.cols);\r
-            BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
-            Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);\r
+        BrdReplicate<T> brd(src.rows, src.cols);\r
+        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
+        Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);\r
  \r
-            resize<<<grid, block>>>(filter_src, fx, fy, dst);\r
-            cudaSafeCall( cudaGetLastError() );\r
+        resize<<<grid, block>>>(filter_src, fx, fy, dst);\r
+        cudaSafeCall( cudaGetLastError() );\r
  \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
-    template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>\r
-    {\r
-        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)\r
-        {            \r
-            dim3 block(32, 8);\r
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+    }\r
+};\r
+template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>\r
+{\r
+    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)\r
+    {            \r
+        dim3 block(32, 8);\r
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));\r
  \r
-            BrdReplicate<T> brd(src.rows, src.cols);\r
-            BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
+        BrdReplicate<T> brd(src.rows, src.cols);\r
+        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);\r
  \r
-            resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);\r
-            cudaSafeCall( cudaGetLastError() );\r
+        resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);\r
+        cudaSafeCall( cudaGetLastError() );\r
  \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+    }\r
+};\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \\r
      texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \\r
@@ -161,7 +159,7 @@ namespace cv { namespace gpu { namespace imgproc
          { \\r
              dim3 block(32, 8); \\r
              dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
-            TextureBinder texHandler(&tex_resize_ ## type , src); \\r
+            bindTexture(&tex_resize_ ## type , src); \\r
              tex_resize_ ## type ##_reader texSrc; \\r
              Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \\r
              resize<<<grid, block>>>(filter_src, fx, fy, dst); \\r
@@ -175,7 +173,7 @@ namespace cv { namespace gpu { namespace imgproc
          { \\r
              dim3 block(32, 8); \\r
              dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \\r
-            TextureBinder texHandler(&tex_resize_ ## type , src); \\r
+            bindTexture(&tex_resize_ ## type , src); \\r
              tex_resize_ ## type ##_reader texSrc; \\r
              resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \\r
              cudaSafeCall( cudaGetLastError() ); \\r
@@ -183,82 +181,85 @@ namespace cv { namespace gpu { namespace imgproc
          } \\r
      };\r
      \r
-    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)\r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)\r
-    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)\r
-    \r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)\r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)\r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)\r
-    \r
-    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)\r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)\r
-    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)\r
-    \r
-    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)\r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)\r
-    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)\r
-    \r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)\r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)\r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)\r
-    \r
-    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)\r
-    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)\r
-    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)\r
-    \r
+OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)\r
+OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)\r
+\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)\r
+\r
+OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)\r
+OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)\r
+\r
+OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)\r
+OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)\r
+\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)\r
+\r
+OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)\r
+//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)\r
+OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)\r
+\r
  #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX\r
  \r
-    template <template <typename> class Filter, typename T> struct ResizeDispatcher\r
-    { \r
-        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
-        {\r
-            if (stream == 0)\r
-                ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);\r
-            else\r
-                ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);\r
-        }\r
-    };\r
+template <template <typename> class Filter, typename T> struct ResizeDispatcher\r
+{ \r
+    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)\r
+    {\r
+        if (stream == 0)\r
+            ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);\r
+        else\r
+            ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);\r
+    }\r
+};\r
+\r
+template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);\r
  \r
-    template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)\r
+    static const caller_t callers[3] = \r
      {\r
-        typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+        ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call\r
+    };\r
  \r
-        static const caller_t callers[3] = \r
-        {\r
-            ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call\r
-        };\r
+    callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);\r
+}\r
  \r
-        callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);\r
-    }\r
+template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
  \r
-    template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    \r
-    //template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    \r
-    template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    \r
-    template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    \r
-    //template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    \r
-    template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    //template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-    template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-}}}\r
+//template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
+\r
+} // namespace resize\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu

index cdcee7a..e185669 100644 (file)
--- a/modules/gpu/src/cuda/row_filter.cu
+++ b/modules/gpu/src/cuda/row_filter.cu
@@ -47,8 +47,7 @@
  #include "opencv2/gpu/device/limits.hpp"\r
  #include "opencv2/gpu/device/border_interpolate.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
  #define MAX_KERNEL_SIZE 16\r
  #define BLOCK_DIM_X 16\r
@@ -56,218 +55,218 @@ using namespace cv::gpu::device;
  #define RESULT_STEPS 8\r
  #define HALO_STEPS 1\r
  \r
-namespace filter_row\r
+namespace row_filter {\r
+\r
+__constant__ float c_kernel[MAX_KERNEL_SIZE];\r
+\r
+void loadKernel(const float kernel[], int ksize)\r
  {\r
-    __constant__ float c_kernel[MAX_KERNEL_SIZE];\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
+}\r
  \r
-    void loadKernel(const float kernel[], int ksize)\r
+namespace detail\r
+{\r
+    template <typename T, size_t size> struct SmemType\r
      {\r
-        cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );\r
-    }\r
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;\r
+    };\r
  \r
-    namespace detail\r
+    template <typename T> struct SmemType<T, 4>\r
      {\r
-        template <typename T, size_t size> struct SmemType\r
-        {\r
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;\r
-        };\r
+        typedef T smem_t;\r
+    };\r
+}\r
  \r
-        template <typename T> struct SmemType<T, 4>\r
-        {\r
-            typedef T smem_t;\r
-        };\r
-    }\r
+template <typename T> struct SmemType\r
+{\r
+    typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;\r
+};\r
  \r
-    template <typename T> struct SmemType\r
-    {\r
-        typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;\r
-    };\r
+template <int KERNEL_SIZE, typename T, typename D, typename B>\r
+__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)\r
+{\r
+    typedef typename SmemType<T>::smem_t smem_t;\r
+    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
  \r
-    template <int KERNEL_SIZE, typename T, typename D, typename B>\r
-    __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)\r
-    {\r
-        typedef typename SmemType<T>::smem_t smem_t;\r
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;\r
+    __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];\r
  \r
-        __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];\r
+    //Offset to the left halo edge\r
+    const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;\r
+    const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
  \r
-        //Offset to the left halo edge\r
-        const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;\r
-        const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
+    if (y < src.rows)\r
+    {\r
+        const T* src_row = src.ptr(y);\r
  \r
-        if (y < src.rows)\r
-        {\r
-            const T* src_row = src.ptr(y);\r
+        //Load main data\r
+        #pragma unroll\r
+        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
+            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);\r
  \r
-            //Load main data\r
-            #pragma unroll\r
-            for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
-                smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);\r
+        //Load left halo\r
+        #pragma unroll\r
+        for(int i = 0; i < HALO_STEPS; ++i)\r
+            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);\r
  \r
-            //Load left halo\r
-            #pragma unroll\r
-            for(int i = 0; i < HALO_STEPS; ++i)\r
-                smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);\r
+        //Load right halo\r
+        #pragma unroll\r
+        for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)\r
+            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);\r
  \r
-            //Load right halo\r
-            #pragma unroll\r
-            for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)\r
-                smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);\r
+        __syncthreads();\r
  \r
-            __syncthreads();\r
+        D* dst_row = dst.ptr(y);\r
  \r
-            D* dst_row = dst.ptr(y);\r
+        #pragma unroll\r
+        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
+        {\r
+            sum_t sum = VecTraits<sum_t>::all(0);\r
  \r
              #pragma unroll\r
-            for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)\r
-            {\r
-                sum_t sum = VecTraits<sum_t>::all(0);\r
+            for (int j = 0; j < KERNEL_SIZE; ++j)\r
+                sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];\r
  \r
-                #pragma unroll\r
-                for (int j = 0; j < KERNEL_SIZE; ++j)\r
-                    sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];\r
+            int dstX = x + i * BLOCK_DIM_X;\r
  \r
-                int dstX = x + i * BLOCK_DIM_X;\r
-\r
-                if (dstX < src.cols)\r
-                    dst_row[dstX] = saturate_cast<D>(sum);\r
-            }\r
+            if (dstX < src.cols)\r
+                dst_row[dstX] = saturate_cast<D>(sum);\r
          }\r
      }\r
  }\r
  \r
-namespace cv { namespace gpu { namespace filters\r
+template <int ksize, typename T, typename D, template<typename> class B>\r
+void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)\r
  {\r
-    template <int ksize, typename T, typename D, template<typename> class B>\r
-    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)\r
-    {\r
-        typedef typename filter_row::SmemType<T>::smem_t smem_t;\r
+    typedef typename SmemType<T>::smem_t smem_t;\r
  \r
-        const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
-        const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));\r
+    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);\r
+    const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));\r
  \r
-        B<smem_t> b(src.cols);\r
+    B<smem_t> b(src.cols);\r
  \r
-        filter_row::linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <typename T, typename D>\r
-    void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)\r
+template <typename T, typename D>\r
+void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)\r
+{\r
+    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);\r
+    static const caller_t callers[5][17] = \r
      {\r
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);\r
-        static const caller_t callers[5][17] = \r
          {\r
-            {\r
-                0, \r
-                linearRowFilter_caller<1 , T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<2 , T, D, BrdRowReflect101>,\r
-                linearRowFilter_caller<3 , T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<4 , T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<5 , T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<6 , T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<7 , T, D, BrdRowReflect101>,\r
-                linearRowFilter_caller<8 , T, D, BrdRowReflect101>,\r
-                linearRowFilter_caller<9 , T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<10, T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<11, T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<12, T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<13, T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<14, T, D, BrdRowReflect101>,\r
-                linearRowFilter_caller<15, T, D, BrdRowReflect101>, \r
-                linearRowFilter_caller<16, T, D, BrdRowReflect101>\r
-            },\r
-            {\r
-                0, \r
-                linearRowFilter_caller<1 , T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<2 , T, D, BrdRowReplicate>,\r
-                linearRowFilter_caller<3 , T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<4 , T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<5 , T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<6 , T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<7 , T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<8 , T, D, BrdRowReplicate>,\r
-                linearRowFilter_caller<9 , T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<10, T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<11, T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<12, T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<13, T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<14, T, D, BrdRowReplicate>,\r
-                linearRowFilter_caller<15, T, D, BrdRowReplicate>, \r
-                linearRowFilter_caller<16, T, D, BrdRowReplicate>\r
-            },\r
-            {\r
-                0, \r
-                linearRowFilter_caller<1 , T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<2 , T, D, BrdRowConstant>,\r
-                linearRowFilter_caller<3 , T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<4 , T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<5 , T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<6 , T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<7 , T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<8 , T, D, BrdRowConstant>,\r
-                linearRowFilter_caller<9 , T, D, BrdRowConstant>,\r
-                linearRowFilter_caller<10, T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<11, T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<12, T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<13, T, D, BrdRowConstant>,\r
-                linearRowFilter_caller<14, T, D, BrdRowConstant>,\r
-                linearRowFilter_caller<15, T, D, BrdRowConstant>, \r
-                linearRowFilter_caller<16, T, D, BrdRowConstant>\r
-            },\r
-            {\r
-                0, \r
-                linearRowFilter_caller<1 , T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<2 , T, D, BrdRowReflect>,\r
-                linearRowFilter_caller<3 , T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<4 , T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<5 , T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<6 , T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<7 , T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<8 , T, D, BrdRowReflect>,\r
-                linearRowFilter_caller<9 , T, D, BrdRowReflect>,\r
-                linearRowFilter_caller<10, T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<11, T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<12, T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<13, T, D, BrdRowReflect>,\r
-                linearRowFilter_caller<14, T, D, BrdRowReflect>,\r
-                linearRowFilter_caller<15, T, D, BrdRowReflect>, \r
-                linearRowFilter_caller<16, T, D, BrdRowReflect>\r
-            },\r
-            {\r
-                0, \r
-                linearRowFilter_caller<1 , T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<2 , T, D, BrdRowWrap>,\r
-                linearRowFilter_caller<3 , T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<4 , T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<5 , T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<6 , T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<7 , T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<8 , T, D, BrdRowWrap>,\r
-                linearRowFilter_caller<9 , T, D, BrdRowWrap>,\r
-                linearRowFilter_caller<10, T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<11, T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<12, T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<13, T, D, BrdRowWrap>,\r
-                linearRowFilter_caller<14, T, D, BrdRowWrap>,\r
-                linearRowFilter_caller<15, T, D, BrdRowWrap>, \r
-                linearRowFilter_caller<16, T, D, BrdRowWrap>\r
-            }\r
-        };\r
-        \r
-        filter_row::loadKernel(kernel, ksize);\r
-\r
-        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);\r
-    }\r
+            0, \r
+            linearRowFilter_caller<1 , T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<2 , T, D, BrdRowReflect101>,\r
+            linearRowFilter_caller<3 , T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<4 , T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<5 , T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<6 , T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<7 , T, D, BrdRowReflect101>,\r
+            linearRowFilter_caller<8 , T, D, BrdRowReflect101>,\r
+            linearRowFilter_caller<9 , T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<10, T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<11, T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<12, T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<13, T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<14, T, D, BrdRowReflect101>,\r
+            linearRowFilter_caller<15, T, D, BrdRowReflect101>, \r
+            linearRowFilter_caller<16, T, D, BrdRowReflect101>\r
+        },\r
+        {\r
+            0, \r
+            linearRowFilter_caller<1 , T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<2 , T, D, BrdRowReplicate>,\r
+            linearRowFilter_caller<3 , T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<4 , T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<5 , T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<6 , T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<7 , T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<8 , T, D, BrdRowReplicate>,\r
+            linearRowFilter_caller<9 , T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<10, T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<11, T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<12, T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<13, T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<14, T, D, BrdRowReplicate>,\r
+            linearRowFilter_caller<15, T, D, BrdRowReplicate>, \r
+            linearRowFilter_caller<16, T, D, BrdRowReplicate>\r
+        },\r
+        {\r
+            0, \r
+            linearRowFilter_caller<1 , T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<2 , T, D, BrdRowConstant>,\r
+            linearRowFilter_caller<3 , T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<4 , T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<5 , T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<6 , T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<7 , T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<8 , T, D, BrdRowConstant>,\r
+            linearRowFilter_caller<9 , T, D, BrdRowConstant>,\r
+            linearRowFilter_caller<10, T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<11, T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<12, T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<13, T, D, BrdRowConstant>,\r
+            linearRowFilter_caller<14, T, D, BrdRowConstant>,\r
+            linearRowFilter_caller<15, T, D, BrdRowConstant>, \r
+            linearRowFilter_caller<16, T, D, BrdRowConstant>\r
+        },\r
+        {\r
+            0, \r
+            linearRowFilter_caller<1 , T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<2 , T, D, BrdRowReflect>,\r
+            linearRowFilter_caller<3 , T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<4 , T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<5 , T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<6 , T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<7 , T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<8 , T, D, BrdRowReflect>,\r
+            linearRowFilter_caller<9 , T, D, BrdRowReflect>,\r
+            linearRowFilter_caller<10, T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<11, T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<12, T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<13, T, D, BrdRowReflect>,\r
+            linearRowFilter_caller<14, T, D, BrdRowReflect>,\r
+            linearRowFilter_caller<15, T, D, BrdRowReflect>, \r
+            linearRowFilter_caller<16, T, D, BrdRowReflect>\r
+        },\r
+        {\r
+            0, \r
+            linearRowFilter_caller<1 , T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<2 , T, D, BrdRowWrap>,\r
+            linearRowFilter_caller<3 , T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<4 , T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<5 , T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<6 , T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<7 , T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<8 , T, D, BrdRowWrap>,\r
+            linearRowFilter_caller<9 , T, D, BrdRowWrap>,\r
+            linearRowFilter_caller<10, T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<11, T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<12, T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<13, T, D, BrdRowWrap>,\r
+            linearRowFilter_caller<14, T, D, BrdRowWrap>,\r
+            linearRowFilter_caller<15, T, D, BrdRowWrap>, \r
+            linearRowFilter_caller<16, T, D, BrdRowWrap>\r
+        }\r
+    };\r
+    \r
+    loadKernel(kernel, ksize);\r
+\r
+    callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);\r
+}\r
+\r
+template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+template void linearRowFilter_gpu<int   , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+\r
+} // namespace row_filter\r
  \r
-    template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    //template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    //template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    template void linearRowFilter_gpu<int   , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-    template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/safe_call.hpp b/modules/gpu/src/cuda/safe_call.hpp

index 6a7d577..a3dc2f4 100644 (file)
--- a/modules/gpu/src/cuda/safe_call.hpp
+++ b/modules/gpu/src/cuda/safe_call.hpp
@@ -43,9 +43,9 @@
  #ifndef __OPENCV_CUDA_SAFE_CALL_HPP__\r
  #define __OPENCV_CUDA_SAFE_CALL_HPP__\r
  \r
-#include "cuda_runtime_api.h"\r
-#include "cufft.h"\r
-#include "cublas.h"\r
+#include <cuda_runtime_api.h>\r
+#include <cufft.h>\r
+#include <cublas.h>\r
  #include "NCV.hpp"\r
  \r
  #if defined(__GNUC__)\r
@@ -62,46 +62,44 @@
      #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__)\r
  #endif\r
  \r
-namespace cv\r
-{\r
-    namespace gpu\r
-    {\r
-        void error(const char *error_string, const char *file, const int line, const char *func = "");\r
-        void nppError(int err, const char *file, const int line, const char *func = "");\r
-        void ncvError(int err, const char *file, const int line, const char *func = "");\r
-        void cufftError(int err, const char *file, const int line, const char *func = "");\r
-        void cublasError(int err, const char *file, const int line, const char *func = "");\r
+namespace cv { namespace gpu {\r
+\r
+void error(const char *error_string, const char *file, const int line, const char *func = "");\r
+void nppError(int err, const char *file, const int line, const char *func = "");\r
+void ncvError(int err, const char *file, const int line, const char *func = "");\r
+void cufftError(int err, const char *file, const int line, const char *func = "");\r
+void cublasError(int err, const char *file, const int line, const char *func = "");\r
  \r
-        static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")\r
-        {\r
-            if (cudaSuccess != err)\r
-                cv::gpu::error(cudaGetErrorString(err), file, line, func);\r
-        }\r
+static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")\r
+{\r
+    if (cudaSuccess != err)\r
+        cv::gpu::error(cudaGetErrorString(err), file, line, func);\r
+}\r
  \r
-        static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")\r
-        {\r
-            if (err < 0)\r
-                cv::gpu::nppError(err, file, line, func);\r
-        }\r
+static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")\r
+{\r
+    if (err < 0)\r
+        cv::gpu::nppError(err, file, line, func);\r
+}\r
  \r
-        static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")\r
-        {\r
-            if (NCV_SUCCESS != err)\r
-                cv::gpu::ncvError(err, file, line, func);\r
-        }\r
+static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")\r
+{\r
+    if (NCV_SUCCESS != err)\r
+        cv::gpu::ncvError(err, file, line, func);\r
+}\r
  \r
-        static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")\r
-        {\r
-            if (CUFFT_SUCCESS != err)\r
-                cv::gpu::cufftError(err, file, line, func);\r
-        }\r
+static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")\r
+{\r
+    if (CUFFT_SUCCESS != err)\r
+        cv::gpu::cufftError(err, file, line, func);\r
+}\r
  \r
-        static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")\r
-        {\r
-            if (CUBLAS_STATUS_SUCCESS != err)\r
-                cv::gpu::cublasError(err, file, line, func);\r
-        }\r
-    }\r
+static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")\r
+{\r
+    if (CUBLAS_STATUS_SUCCESS != err)\r
+        cv::gpu::cublasError(err, file, line, func);\r
  }\r
  \r
+}}\r
+\r
  #endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
 \ No newline at end of file
diff --git a/modules/gpu/src/cuda/split_merge.cu b/modules/gpu/src/cuda/split_merge.cu

index 25bc294..92700ed 100644 (file)
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@@ -42,465 +42,467 @@
  \r
  #include "internal_shared.hpp"\r
  \r
-namespace cv { namespace gpu { namespace split_merge {\r
-\r
-    template <typename T, size_t elem_size = sizeof(T)>\r
-    struct TypeTraits \r
-    {\r
-        typedef T type;\r
-        typedef T type2;\r
-        typedef T type3;\r
-        typedef T type4;\r
-    };\r
-\r
-    template <typename T>\r
-    struct TypeTraits<T, 1>\r
-    {\r
-        typedef char type;\r
-        typedef char2 type2;\r
-        typedef char3 type3;\r
-        typedef char4 type4;\r
-    };\r
-\r
-    template <typename T>\r
-    struct TypeTraits<T, 2>\r
-    {\r
-        typedef short type;\r
-        typedef short2 type2;\r
-        typedef short3 type3;\r
-        typedef short4 type4;\r
-    };\r
-\r
-    template <typename T>\r
-    struct TypeTraits<T, 4> \r
-    {\r
-        typedef int type;\r
-        typedef int2 type2;\r
-        typedef int3 type3;\r
-        typedef int4 type4;\r
-    };\r
-\r
-    template <typename T>\r
-    struct TypeTraits<T, 8> \r
-    {\r
-        typedef double type;\r
-        typedef double2 type2;\r
-        //typedef double3 type3;\r
-        //typedef double4 type3;\r
-    };\r
-\r
-    typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);\r
-    typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);\r
-\r
-    //------------------------------------------------------------\r
-    // Merge    \r
-\r
-    template <typename T>\r
-    __global__ void mergeC2_(const uchar* src0, size_t src0_step, \r
-                             const uchar* src1, size_t src1_step, \r
-                             int rows, int cols, uchar* dst, size_t dst_step)\r
-    {\r
-        typedef typename TypeTraits<T>::type2 dst_type;\r
-\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const T* src0_y = (const T*)(src0 + y * src0_step);\r
-        const T* src1_y = (const T*)(src1 + y * src1_step);\r
-        dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {                        \r
-            dst_type dst_elem;\r
-            dst_elem.x = src0_y[x];\r
-            dst_elem.y = src1_y[x];\r
-            dst_y[x] = dst_elem;\r
-        }\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace split_merge {\r
+\r
+template <typename T, size_t elem_size = sizeof(T)>\r
+struct TypeTraits \r
+{\r
+    typedef T type;\r
+    typedef T type2;\r
+    typedef T type3;\r
+    typedef T type4;\r
+};\r
+\r
+template <typename T>\r
+struct TypeTraits<T, 1>\r
+{\r
+    typedef char type;\r
+    typedef char2 type2;\r
+    typedef char3 type3;\r
+    typedef char4 type4;\r
+};\r
+\r
+template <typename T>\r
+struct TypeTraits<T, 2>\r
+{\r
+    typedef short type;\r
+    typedef short2 type2;\r
+    typedef short3 type3;\r
+    typedef short4 type4;\r
+};\r
+\r
+template <typename T>\r
+struct TypeTraits<T, 4> \r
+{\r
+    typedef int type;\r
+    typedef int2 type2;\r
+    typedef int3 type3;\r
+    typedef int4 type4;\r
+};\r
+\r
+template <typename T>\r
+struct TypeTraits<T, 8> \r
+{\r
+    typedef double type;\r
+    typedef double2 type2;\r
+    //typedef double3 type3;\r
+    //typedef double4 type3;\r
+};\r
+\r
+typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);\r
+typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);\r
+\r
+//------------------------------------------------------------\r
+// Merge    \r
+\r
+template <typename T>\r
+__global__ void mergeC2_(const uchar* src0, size_t src0_step, \r
+                         const uchar* src1, size_t src1_step, \r
+                         int rows, int cols, uchar* dst, size_t dst_step)\r
+{\r
+    typedef typename TypeTraits<T>::type2 dst_type;\r
+\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    const T* src0_y = (const T*)(src0 + y * src0_step);\r
+    const T* src1_y = (const T*)(src1 + y * src1_step);\r
+    dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+    if (x < cols && y < rows) \r
+    {                        \r
+        dst_type dst_elem;\r
+        dst_elem.x = src0_y[x];\r
+        dst_elem.y = src1_y[x];\r
+        dst_y[x] = dst_elem;\r
      }\r
-\r
-\r
-    template <typename T>\r
-    __global__ void mergeC3_(const uchar* src0, size_t src0_step, \r
-                             const uchar* src1, size_t src1_step, \r
-                             const uchar* src2, size_t src2_step, \r
-                             int rows, int cols, uchar* dst, size_t dst_step)\r
-    {\r
-        typedef typename TypeTraits<T>::type3 dst_type;\r
-\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const T* src0_y = (const T*)(src0 + y * src0_step);\r
-        const T* src1_y = (const T*)(src1 + y * src1_step);\r
-        const T* src2_y = (const T*)(src2 + y * src2_step);\r
-        dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {                        \r
-            dst_type dst_elem;\r
-            dst_elem.x = src0_y[x];\r
-            dst_elem.y = src1_y[x];\r
-            dst_elem.z = src2_y[x];\r
-            dst_y[x] = dst_elem;\r
-        }\r
+}\r
+\r
+\r
+template <typename T>\r
+__global__ void mergeC3_(const uchar* src0, size_t src0_step, \r
+                         const uchar* src1, size_t src1_step, \r
+                         const uchar* src2, size_t src2_step, \r
+                         int rows, int cols, uchar* dst, size_t dst_step)\r
+{\r
+    typedef typename TypeTraits<T>::type3 dst_type;\r
+\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    const T* src0_y = (const T*)(src0 + y * src0_step);\r
+    const T* src1_y = (const T*)(src1 + y * src1_step);\r
+    const T* src2_y = (const T*)(src2 + y * src2_step);\r
+    dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+    if (x < cols && y < rows) \r
+    {                        \r
+        dst_type dst_elem;\r
+        dst_elem.x = src0_y[x];\r
+        dst_elem.y = src1_y[x];\r
+        dst_elem.z = src2_y[x];\r
+        dst_y[x] = dst_elem;\r
      }\r
-\r
-\r
-    template <>\r
-    __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, \r
-                             const uchar* src1, size_t src1_step, \r
-                             const uchar* src2, size_t src2_step, \r
-                             int rows, int cols, uchar* dst, size_t dst_step)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const double* src0_y = (const double*)(src0 + y * src0_step);\r
-        const double* src1_y = (const double*)(src1 + y * src1_step);\r
-        const double* src2_y = (const double*)(src2 + y * src2_step);\r
-        double* dst_y = (double*)(dst + y * dst_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {                        \r
-            dst_y[3 * x] = src0_y[x];\r
-            dst_y[3 * x + 1] = src1_y[x];\r
-            dst_y[3 * x + 2] = src2_y[x];\r
-        }\r
+}\r
+\r
+\r
+template <>\r
+__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, \r
+                         const uchar* src1, size_t src1_step, \r
+                         const uchar* src2, size_t src2_step, \r
+                         int rows, int cols, uchar* dst, size_t dst_step)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    const double* src0_y = (const double*)(src0 + y * src0_step);\r
+    const double* src1_y = (const double*)(src1 + y * src1_step);\r
+    const double* src2_y = (const double*)(src2 + y * src2_step);\r
+    double* dst_y = (double*)(dst + y * dst_step);\r
+\r
+    if (x < cols && y < rows) \r
+    {                        \r
+        dst_y[3 * x] = src0_y[x];\r
+        dst_y[3 * x + 1] = src1_y[x];\r
+        dst_y[3 * x + 2] = src2_y[x];\r
      }\r
-\r
-\r
-    template <typename T>\r
-    __global__ void mergeC4_(const uchar* src0, size_t src0_step, \r
-                             const uchar* src1, size_t src1_step, \r
-                             const uchar* src2, size_t src2_step, \r
-                             const uchar* src3, size_t src3_step, \r
-                             int rows, int cols, uchar* dst, size_t dst_step)\r
-    {\r
-        typedef typename TypeTraits<T>::type4 dst_type;\r
-\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const T* src0_y = (const T*)(src0 + y * src0_step);\r
-        const T* src1_y = (const T*)(src1 + y * src1_step);\r
-        const T* src2_y = (const T*)(src2 + y * src2_step);\r
-        const T* src3_y = (const T*)(src3 + y * src3_step);\r
-        dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {                        \r
-            dst_type dst_elem;\r
-            dst_elem.x = src0_y[x];\r
-            dst_elem.y = src1_y[x];\r
-            dst_elem.z = src2_y[x];\r
-            dst_elem.w = src3_y[x];\r
-            dst_y[x] = dst_elem;\r
-        }\r
+}\r
+\r
+\r
+template <typename T>\r
+__global__ void mergeC4_(const uchar* src0, size_t src0_step, \r
+                         const uchar* src1, size_t src1_step, \r
+                         const uchar* src2, size_t src2_step, \r
+                         const uchar* src3, size_t src3_step, \r
+                         int rows, int cols, uchar* dst, size_t dst_step)\r
+{\r
+    typedef typename TypeTraits<T>::type4 dst_type;\r
+\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    const T* src0_y = (const T*)(src0 + y * src0_step);\r
+    const T* src1_y = (const T*)(src1 + y * src1_step);\r
+    const T* src2_y = (const T*)(src2 + y * src2_step);\r
+    const T* src3_y = (const T*)(src3 + y * src3_step);\r
+    dst_type* dst_y = (dst_type*)(dst + y * dst_step);\r
+\r
+    if (x < cols && y < rows) \r
+    {                        \r
+        dst_type dst_elem;\r
+        dst_elem.x = src0_y[x];\r
+        dst_elem.y = src1_y[x];\r
+        dst_elem.z = src2_y[x];\r
+        dst_elem.w = src3_y[x];\r
+        dst_y[x] = dst_elem;\r
      }\r
-\r
-\r
-    template <>\r
-    __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, \r
-                             const uchar* src1, size_t src1_step, \r
-                             const uchar* src2, size_t src2_step, \r
-                             const uchar* src3, size_t src3_step, \r
-                             int rows, int cols, uchar* dst, size_t dst_step)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const double* src0_y = (const double*)(src0 + y * src0_step);\r
-        const double* src1_y = (const double*)(src1 + y * src1_step);\r
-        const double* src2_y = (const double*)(src2 + y * src2_step);\r
-        const double* src3_y = (const double*)(src3 + y * src3_step);\r
-        double2* dst_y = (double2*)(dst + y * dst_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {                        \r
-            dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);\r
-            dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);\r
-        }\r
-    }\r
-\r
-\r
-    template <typename T>\r
-    static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
-    {\r
-        dim3 blockDim(32, 8);\r
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
-        mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
-                src[0].data, src[0].step,\r
-                src[1].data, src[1].step,\r
-                dst.rows, dst.cols, dst.data, dst.step);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
+\r
+\r
+template <>\r
+__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, \r
+                         const uchar* src1, size_t src1_step, \r
+                         const uchar* src2, size_t src2_step, \r
+                         const uchar* src3, size_t src3_step, \r
+                         int rows, int cols, uchar* dst, size_t dst_step)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    const double* src0_y = (const double*)(src0 + y * src0_step);\r
+    const double* src1_y = (const double*)(src1 + y * src1_step);\r
+    const double* src2_y = (const double*)(src2 + y * src2_step);\r
+    const double* src3_y = (const double*)(src3 + y * src3_step);\r
+    double2* dst_y = (double2*)(dst + y * dst_step);\r
+\r
+    if (x < cols && y < rows) \r
+    {                        \r
+        dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);\r
+        dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);\r
      }\r
-\r
-\r
-    template <typename T>\r
-    static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
+}\r
+\r
+\r
+template <typename T>\r
+static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
+{\r
+    dim3 blockDim(32, 8);\r
+    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+    mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
+            src[0].data, src[0].step,\r
+            src[1].data, src[1].step,\r
+            dst.rows, dst.cols, dst.data, dst.step);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
+\r
+\r
+template <typename T>\r
+static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
+{\r
+    dim3 blockDim(32, 8);\r
+    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+    mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
+            src[0].data, src[0].step,\r
+            src[1].data, src[1].step,\r
+            src[2].data, src[2].step,\r
+            dst.rows, dst.cols, dst.data, dst.step);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
+\r
+\r
+template <typename T>\r
+static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
+{\r
+    dim3 blockDim(32, 8);\r
+    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
+    mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
+            src[0].data, src[0].step,\r
+            src[1].data, src[1].step,\r
+            src[2].data, src[2].step,\r
+            src[3].data, src[3].step,\r
+            dst.rows, dst.cols, dst.data, dst.step);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
+\r
+\r
+void merge_caller(const DevMem2Db* src, DevMem2Db& dst,\r
+                             int total_channels, size_t elem_size,\r
+                             const cudaStream_t& stream)\r
+{\r
+    static MergeFunction merge_func_tbl[] =\r
      {\r
-        dim3 blockDim(32, 8);\r
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
-        mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
-                src[0].data, src[0].step,\r
-                src[1].data, src[1].step,\r
-                src[2].data, src[2].step,\r
-                dst.rows, dst.cols, dst.data, dst.step);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+        mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,\r
+        mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,\r
+        mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,\r
+    };\r
  \r
+    size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);\r
+    MergeFunction merge_func = merge_func_tbl[merge_func_id];\r
  \r
-    template <typename T>\r
-    static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)\r
-    {\r
-        dim3 blockDim(32, 8);\r
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));\r
-        mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
-                src[0].data, src[0].step,\r
-                src[1].data, src[1].step,\r
-                src[2].data, src[2].step,\r
-                src[3].data, src[3].step,\r
-                dst.rows, dst.cols, dst.data, dst.step);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+    if (merge_func == 0)\r
+        cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
  \r
+    merge_func(src, dst, stream);\r
+}\r
  \r
-    extern "C" void merge_caller(const DevMem2Db* src, DevMem2Db& dst,\r
-                                 int total_channels, size_t elem_size,\r
-                                 const cudaStream_t& stream)\r
-    {\r
-        static MergeFunction merge_func_tbl[] =\r
-        {\r
-            mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,\r
-            mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,\r
-            mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,\r
-        };\r
  \r
-        size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);\r
-        MergeFunction merge_func = merge_func_tbl[merge_func_id];\r
  \r
-        if (merge_func == 0)\r
-            cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
-\r
-        merge_func(src, dst, stream);\r
-    }\r
+//------------------------------------------------------------\r
+// Split\r
  \r
  \r
+template <typename T>\r
+__global__ void splitC2_(const uchar* src, size_t src_step, \r
+                        int rows, int cols,\r
+                        uchar* dst0, size_t dst0_step,\r
+                        uchar* dst1, size_t dst1_step)\r
+{\r
+    typedef typename TypeTraits<T>::type2 src_type;\r
  \r
-    //------------------------------------------------------------\r
-    // Split\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
+    const src_type* src_y = (const src_type*)(src + y * src_step);\r
+    T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+    T* dst1_y = (T*)(dst1 + y * dst1_step);\r
  \r
-    template <typename T>\r
-    __global__ void splitC2_(const uchar* src, size_t src_step, \r
-                            int rows, int cols,\r
-                            uchar* dst0, size_t dst0_step,\r
-                            uchar* dst1, size_t dst1_step)\r
+    if (x < cols && y < rows) \r
      {\r
-        typedef typename TypeTraits<T>::type2 src_type;\r
+        src_type src_elem = src_y[x];\r
+        dst0_y[x] = src_elem.x;\r
+        dst1_y[x] = src_elem.y;\r
+    }\r
+}\r
  \r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        const src_type* src_y = (const src_type*)(src + y * src_step);\r
-        T* dst0_y = (T*)(dst0 + y * dst0_step);\r
-        T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+template <typename T>\r
+__global__ void splitC3_(const uchar* src, size_t src_step, \r
+                        int rows, int cols,\r
+                        uchar* dst0, size_t dst0_step,\r
+                        uchar* dst1, size_t dst1_step,\r
+                        uchar* dst2, size_t dst2_step)\r
+{\r
+    typedef typename TypeTraits<T>::type3 src_type;\r
  \r
-        if (x < cols && y < rows) \r
-        {\r
-            src_type src_elem = src_y[x];\r
-            dst0_y[x] = src_elem.x;\r
-            dst1_y[x] = src_elem.y;\r
-        }\r
-    }\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
+    const src_type* src_y = (const src_type*)(src + y * src_step);\r
+    T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+    T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+    T* dst2_y = (T*)(dst2 + y * dst2_step);\r
  \r
-    template <typename T>\r
-    __global__ void splitC3_(const uchar* src, size_t src_step, \r
-                            int rows, int cols,\r
-                            uchar* dst0, size_t dst0_step,\r
-                            uchar* dst1, size_t dst1_step,\r
-                            uchar* dst2, size_t dst2_step)\r
+    if (x < cols && y < rows) \r
      {\r
-        typedef typename TypeTraits<T>::type3 src_type;\r
-\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const src_type* src_y = (const src_type*)(src + y * src_step);\r
-        T* dst0_y = (T*)(dst0 + y * dst0_step);\r
-        T* dst1_y = (T*)(dst1 + y * dst1_step);\r
-        T* dst2_y = (T*)(dst2 + y * dst2_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {\r
-            src_type src_elem = src_y[x];\r
-            dst0_y[x] = src_elem.x;\r
-            dst1_y[x] = src_elem.y;\r
-            dst2_y[x] = src_elem.z;\r
-        }\r
+        src_type src_elem = src_y[x];\r
+        dst0_y[x] = src_elem.x;\r
+        dst1_y[x] = src_elem.y;\r
+        dst2_y[x] = src_elem.z;\r
      }\r
+}\r
  \r
  \r
-    template <>\r
-    __global__ void splitC3_<double>(\r
-            const uchar* src, size_t src_step, int rows, int cols,\r
-            uchar* dst0, size_t dst0_step,\r
-            uchar* dst1, size_t dst1_step,\r
-            uchar* dst2, size_t dst2_step)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const double* src_y = (const double*)(src + y * src_step);\r
-        double* dst0_y = (double*)(dst0 + y * dst0_step);\r
-        double* dst1_y = (double*)(dst1 + y * dst1_step);\r
-        double* dst2_y = (double*)(dst2 + y * dst2_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {\r
-            dst0_y[x] = src_y[3 * x];\r
-            dst1_y[x] = src_y[3 * x + 1];\r
-            dst2_y[x] = src_y[3 * x + 2];\r
-        }\r
-    }\r
+template <>\r
+__global__ void splitC3_<double>(\r
+        const uchar* src, size_t src_step, int rows, int cols,\r
+        uchar* dst0, size_t dst0_step,\r
+        uchar* dst1, size_t dst1_step,\r
+        uchar* dst2, size_t dst2_step)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
+    const double* src_y = (const double*)(src + y * src_step);\r
+    double* dst0_y = (double*)(dst0 + y * dst0_step);\r
+    double* dst1_y = (double*)(dst1 + y * dst1_step);\r
+    double* dst2_y = (double*)(dst2 + y * dst2_step);\r
  \r
-    template <typename T>\r
-    __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,\r
-                            uchar* dst0, size_t dst0_step,\r
-                            uchar* dst1, size_t dst1_step,\r
-                            uchar* dst2, size_t dst2_step,\r
-                            uchar* dst3, size_t dst3_step)\r
+    if (x < cols && y < rows) \r
      {\r
-        typedef typename TypeTraits<T>::type4 src_type;\r
-\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const src_type* src_y = (const src_type*)(src + y * src_step);\r
-        T* dst0_y = (T*)(dst0 + y * dst0_step);\r
-        T* dst1_y = (T*)(dst1 + y * dst1_step);\r
-        T* dst2_y = (T*)(dst2 + y * dst2_step);\r
-        T* dst3_y = (T*)(dst3 + y * dst3_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {\r
-            src_type src_elem = src_y[x];\r
-            dst0_y[x] = src_elem.x;\r
-            dst1_y[x] = src_elem.y;\r
-            dst2_y[x] = src_elem.z;\r
-            dst3_y[x] = src_elem.w;\r
-        }\r
+        dst0_y[x] = src_y[3 * x];\r
+        dst1_y[x] = src_y[3 * x + 1];\r
+        dst2_y[x] = src_y[3 * x + 2];\r
      }\r
+}\r
  \r
  \r
-    template <>\r
-    __global__ void splitC4_<double>(\r
-            const uchar* src, size_t src_step, int rows, int cols,\r
-            uchar* dst0, size_t dst0_step,\r
-            uchar* dst1, size_t dst1_step,\r
-            uchar* dst2, size_t dst2_step,\r
-            uchar* dst3, size_t dst3_step)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        const double2* src_y = (const double2*)(src + y * src_step);\r
-        double* dst0_y = (double*)(dst0 + y * dst0_step);\r
-        double* dst1_y = (double*)(dst1 + y * dst1_step);\r
-        double* dst2_y = (double*)(dst2 + y * dst2_step);\r
-        double* dst3_y = (double*)(dst3 + y * dst3_step);\r
-\r
-        if (x < cols && y < rows) \r
-        {\r
-            double2 src_elem1 = src_y[2 * x];\r
-            double2 src_elem2 = src_y[2 * x + 1];\r
-            dst0_y[x] = src_elem1.x;\r
-            dst1_y[x] = src_elem1.y;\r
-            dst2_y[x] = src_elem2.x;\r
-            dst3_y[x] = src_elem2.y;\r
-        }\r
-    }\r
+template <typename T>\r
+__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,\r
+                        uchar* dst0, size_t dst0_step,\r
+                        uchar* dst1, size_t dst1_step,\r
+                        uchar* dst2, size_t dst2_step,\r
+                        uchar* dst3, size_t dst3_step)\r
+{\r
+    typedef typename TypeTraits<T>::type4 src_type;\r
  \r
-    template <typename T>\r
-    static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
-    {\r
-        dim3 blockDim(32, 8);\r
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
-        splitC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
-                src.data, src.step, src.rows, src.cols,\r
-                dst[0].data, dst[0].step,\r
-                dst[1].data, dst[1].step);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
-    }\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
+    const src_type* src_y = (const src_type*)(src + y * src_step);\r
+    T* dst0_y = (T*)(dst0 + y * dst0_step);\r
+    T* dst1_y = (T*)(dst1 + y * dst1_step);\r
+    T* dst2_y = (T*)(dst2 + y * dst2_step);\r
+    T* dst3_y = (T*)(dst3 + y * dst3_step);\r
  \r
-    template <typename T>\r
-    static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
+    if (x < cols && y < rows) \r
      {\r
-        dim3 blockDim(32, 8);\r
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
-        splitC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
-                src.data, src.step, src.rows, src.cols,\r
-                dst[0].data, dst[0].step,\r
-                dst[1].data, dst[1].step,\r
-                dst[2].data, dst[2].step);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
+        src_type src_elem = src_y[x];\r
+        dst0_y[x] = src_elem.x;\r
+        dst1_y[x] = src_elem.y;\r
+        dst2_y[x] = src_elem.z;\r
+        dst3_y[x] = src_elem.w;\r
      }\r
-\r
-\r
-    template <typename T>\r
-    static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
+}\r
+\r
+\r
+template <>\r
+__global__ void splitC4_<double>(\r
+        const uchar* src, size_t src_step, int rows, int cols,\r
+        uchar* dst0, size_t dst0_step,\r
+        uchar* dst1, size_t dst1_step,\r
+        uchar* dst2, size_t dst2_step,\r
+        uchar* dst3, size_t dst3_step)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    const double2* src_y = (const double2*)(src + y * src_step);\r
+    double* dst0_y = (double*)(dst0 + y * dst0_step);\r
+    double* dst1_y = (double*)(dst1 + y * dst1_step);\r
+    double* dst2_y = (double*)(dst2 + y * dst2_step);\r
+    double* dst3_y = (double*)(dst3 + y * dst3_step);\r
+\r
+    if (x < cols && y < rows) \r
      {\r
-        dim3 blockDim(32, 8);\r
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
-        splitC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
-                 src.data, src.step, src.rows, src.cols,\r
-                 dst[0].data, dst[0].step,\r
-                 dst[1].data, dst[1].step,\r
-                 dst[2].data, dst[2].step,\r
-                 dst[3].data, dst[3].step);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall(cudaDeviceSynchronize());\r
+        double2 src_elem1 = src_y[2 * x];\r
+        double2 src_elem2 = src_y[2 * x + 1];\r
+        dst0_y[x] = src_elem1.x;\r
+        dst1_y[x] = src_elem1.y;\r
+        dst2_y[x] = src_elem2.x;\r
+        dst3_y[x] = src_elem2.y;\r
      }\r
-\r
-\r
-    extern "C" void split_caller(const DevMem2Db& src, DevMem2Db* dst,\r
-                                 int num_channels, size_t elem_size1,\r
-                                 const cudaStream_t& stream)\r
+}\r
+\r
+template <typename T>\r
+static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
+{\r
+    dim3 blockDim(32, 8);\r
+    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+    splitC2_<T><<<gridDim, blockDim, 0, stream>>>(\r
+            src.data, src.step, src.rows, src.cols,\r
+            dst[0].data, dst[0].step,\r
+            dst[1].data, dst[1].step);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
+\r
+\r
+template <typename T>\r
+static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
+{\r
+    dim3 blockDim(32, 8);\r
+    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+    splitC3_<T><<<gridDim, blockDim, 0, stream>>>(\r
+            src.data, src.step, src.rows, src.cols,\r
+            dst[0].data, dst[0].step,\r
+            dst[1].data, dst[1].step,\r
+            dst[2].data, dst[2].step);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
+\r
+\r
+template <typename T>\r
+static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)\r
+{\r
+    dim3 blockDim(32, 8);\r
+    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));\r
+    splitC4_<T><<<gridDim, blockDim, 0, stream>>>(\r
+             src.data, src.step, src.rows, src.cols,\r
+             dst[0].data, dst[0].step,\r
+             dst[1].data, dst[1].step,\r
+             dst[2].data, dst[2].step,\r
+             dst[3].data, dst[3].step);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall(cudaDeviceSynchronize());\r
+}\r
+\r
+\r
+void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)\r
+{\r
+    static SplitFunction split_func_tbl[] =\r
      {\r
-        static SplitFunction split_func_tbl[] =\r
-        {\r
-            splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,\r
-            splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,\r
-            splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,\r
-        };\r
+        splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,\r
+        splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,\r
+        splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,\r
+    };\r
  \r
-        size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);\r
-        SplitFunction split_func = split_func_tbl[split_func_id];\r
+    size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);\r
+    SplitFunction split_func = split_func_tbl[split_func_id];\r
  \r
-        if (split_func == 0)\r
-            cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
+    if (split_func == 0)\r
+        cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);\r
  \r
-        split_func(src, dst, stream);\r
-    }\r
+    split_func(src, dst, stream);\r
+}\r
+\r
+} // namespace split_merge\r
  \r
-}}} // namespace cv::gpu::split_merge\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/stereobm.cu b/modules/gpu/src/cuda/stereobm.cu

index 3e99ce4..605ee0b 100644 (file)
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@@ -40,23 +40,18 @@
  //\r
  //M*/\r
  \r
-//#include "internal_shared.hpp"\r
-#include "opencv2/gpu/devmem2d.hpp"\r
-#include "safe_call.hpp"\r
-static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }\r
+#include "internal_shared.hpp"\r
  \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-using namespace cv::gpu;\r
+namespace stereobm {\r
  \r
  //////////////////////////////////////////////////////////////////////////////////////////////////\r
-/////////////////////////////////////// Streeo BM ////////////////////////////////////////////////\r
+/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////\r
  //////////////////////////////////////////////////////////////////////////////////////////////////\r
  \r
  #define ROWSperTHREAD 21     // the number of rows a thread will process\r
  \r
-namespace cv { namespace gpu  { namespace bm\r
-{\r
-\r
  #define BLOCK_W 128          // the thread block width (464)\r
  #define N_DISPARITIES 8\r
  \r
@@ -117,7 +112,7 @@ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned
      __syncthreads();\r
      ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));\r
  \r
-    int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));\r
+    int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));\r
  \r
      int bestIdx = 0;\r
      for (int i = 0; i < N_DISPARITIES; i++)\r
@@ -252,7 +247,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
          for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )\r
              *ptr = 0xFFFFFFFF;\r
      }*/\r
-    int end_row = min(ROWSperTHREAD, cheight - Y - RADIUS);\r
+    int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);\r
      int y_tex;\r
      int x_tex = X - RADIUS;\r
  \r
@@ -346,7 +341,7 @@ const static kernel_caller_t callers[] =
  };\r
  const int calles_num = sizeof(callers)/sizeof(callers[0]);\r
  \r
-extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)\r
+void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)\r
  {\r
      int winsz2 = winsz >> 1;\r
  \r
@@ -375,7 +370,7 @@ extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, cons
  \r
  texture<unsigned char, 2, cudaReadModeElementType> texForSobel;\r
  \r
-extern "C" __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)\r
+__global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)\r
  {\r
      int x = blockDim.x * blockIdx.x + threadIdx.x;\r
      int y = blockDim.y * blockIdx.y + threadIdx.y;\r
@@ -387,12 +382,12 @@ extern "C" __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
                     (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);\r
  \r
  \r
-        conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);\r
+        conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);\r
          output.ptr(y)[x] = conv & 0xFF;\r
      }\r
  }\r
  \r
-extern "C" void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)\r
+void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)\r
  {\r
      cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();\r
      cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );\r
@@ -451,7 +446,7 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
  \r
  #define RpT (2 * ROWSperTHREAD)  // got experimentally\r
  \r
-extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)\r
+__global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)\r
  {\r
      int winsz2 = winsz/2;\r
      int n_dirty_pixels = (winsz2) * 2;\r
@@ -462,7 +457,7 @@ extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float t
  \r
      int x = blockIdx.x * blockDim.x + threadIdx.x;\r
      int beg_row = blockIdx.y * RpT;\r
-    int end_row = min(beg_row + RpT, disp.rows);\r
+    int end_row = ::min(beg_row + RpT, disp.rows);\r
  \r
      if (x < disp.cols)\r
      {\r
@@ -510,7 +505,7 @@ extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float t
      }\r
  }\r
  \r
-extern "C" void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)\r
+void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)\r
  {\r
      avgTexturenessThreshold *= winsz * winsz;\r
  \r
@@ -537,4 +532,6 @@ extern "C" void postfilter_textureness(const DevMem2Db& input, int winsz, float
      cudaSafeCall( cudaUnbindTexture (texForTF) );\r
  }\r
  \r
-}}}\r
+} // namespace stereobm\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/stereobp.cu b/modules/gpu/src/cuda/stereobp.cu

index 9a980cf..ab626ff 100644 (file)
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -44,484 +44,489 @@
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  #include "opencv2/gpu/device/limits.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace stereobp {\r
  \r
-namespace cv { namespace gpu { namespace bp\r
-{\r
  ///////////////////////////////////////////////////////////////\r
  /////////////////////// load constants ////////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    __constant__ int   cndisp;\r
-    __constant__ float cmax_data_term;\r
-    __constant__ float cdata_weight;\r
-    __constant__ float cmax_disc_term;\r
-    __constant__ float cdisc_single_jump;\r
+__constant__ int   cndisp;\r
+__constant__ float cmax_data_term;\r
+__constant__ float cdata_weight;\r
+__constant__ float cmax_disc_term;\r
+__constant__ float cdisc_single_jump;\r
  \r
-    void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(cndisp,            &ndisp,            sizeof(int  )) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );\r
-    }\r
+void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(cndisp,            &ndisp,            sizeof(int  )) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );\r
+}\r
  \r
  ///////////////////////////////////////////////////////////////\r
  ////////////////////////// comp data //////////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    template <int cn> struct PixDiff;\r
-    template <> struct PixDiff<1>\r
+template <int cn> struct PixDiff;\r
+template <> struct PixDiff<1>\r
+{\r
+    __device__ __forceinline__ PixDiff(const uchar* ls)\r
      {\r
-        __device__ __forceinline__ PixDiff(const uchar* ls)\r
-        {\r
-            l = *ls;\r
-        }\r
-        __device__ __forceinline__ float operator()(const uchar* rs) const\r
-        {\r
-            return abs((int)l - *rs);\r
-        }\r
-        uchar l;\r
-    };\r
-    template <> struct PixDiff<3>\r
+        l = *ls;\r
+    }\r
+    __device__ __forceinline__ float operator()(const uchar* rs) const\r
      {\r
-        __device__ __forceinline__ PixDiff(const uchar* ls)\r
-        {\r
-            l = *((uchar3*)ls);\r
-        }\r
-        __device__ __forceinline__ float operator()(const uchar* rs) const\r
-        {\r
-            const float tr = 0.299f;\r
-            const float tg = 0.587f;\r
-            const float tb = 0.114f;\r
+        return ::abs((int)l - *rs);\r
+    }\r
+    uchar l;\r
+};\r
+template <> struct PixDiff<3>\r
+{\r
+    __device__ __forceinline__ PixDiff(const uchar* ls)\r
+    {\r
+        l = *((uchar3*)ls);\r
+    }\r
+    __device__ __forceinline__ float operator()(const uchar* rs) const\r
+    {\r
+        const float tr = 0.299f;\r
+        const float tg = 0.587f;\r
+        const float tb = 0.114f;\r
  \r
-            float val  = tb * abs((int)l.x - rs[0]);\r
-                  val += tg * abs((int)l.y - rs[1]);\r
-                  val += tr * abs((int)l.z - rs[2]);\r
+        float val  = tb * ::abs((int)l.x - rs[0]);\r
+              val += tg * ::abs((int)l.y - rs[1]);\r
+              val += tr * ::abs((int)l.z - rs[2]);\r
  \r
-            return val;\r
-        }\r
-        uchar3 l;\r
-    };\r
-    template <> struct PixDiff<4>\r
+        return val;\r
+    }\r
+    uchar3 l;\r
+};\r
+template <> struct PixDiff<4>\r
+{\r
+    __device__ __forceinline__ PixDiff(const uchar* ls)\r
      {\r
-        __device__ __forceinline__ PixDiff(const uchar* ls)\r
-        {\r
-            l = *((uchar4*)ls);\r
-        }\r
-        __device__ __forceinline__ float operator()(const uchar* rs) const\r
-        {\r
-            const float tr = 0.299f;\r
-            const float tg = 0.587f;\r
-            const float tb = 0.114f;\r
+        l = *((uchar4*)ls);\r
+    }\r
+    __device__ __forceinline__ float operator()(const uchar* rs) const\r
+    {\r
+        const float tr = 0.299f;\r
+        const float tg = 0.587f;\r
+        const float tb = 0.114f;\r
  \r
-            uchar4 r = *((uchar4*)rs);\r
+        uchar4 r = *((uchar4*)rs);\r
  \r
-            float val  = tb * abs((int)l.x - r.x);\r
-                  val += tg * abs((int)l.y - r.y);\r
-                  val += tr * abs((int)l.z - r.z);\r
+        float val  = tb * ::abs((int)l.x - r.x);\r
+              val += tg * ::abs((int)l.y - r.y);\r
+              val += tr * ::abs((int)l.z - r.z);\r
  \r
-            return val;\r
-        }\r
-        uchar4 l;\r
-    };\r
+        return val;\r
+    }\r
+    uchar4 l;\r
+};\r
  \r
-    template <int cn, typename D>\r
-    __global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)\r
+template <int cn, typename D>\r
+__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)\r
      {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+        const uchar* ls = left.ptr(y) + x * cn;\r
+        const PixDiff<cn> pixDiff(ls);\r
+        const uchar* rs = right.ptr(y) + x * cn;\r
  \r
-        if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)\r
-        {\r
-            const uchar* ls = left.ptr(y) + x * cn;\r
-            const PixDiff<cn> pixDiff(ls);\r
-            const uchar* rs = right.ptr(y) + x * cn;\r
+        D* ds = data.ptr(y) + x;\r
+        const size_t disp_step = data.step * left.rows;\r
  \r
-            D* ds = data.ptr(y) + x;\r
-            const size_t disp_step = data.step * left.rows;\r
+        for (int disp = 0; disp < cndisp; disp++)\r
+        {\r
+            if (x - disp >= 1)\r
+            {\r
+                float val = pixDiff(rs - disp * cn);\r
  \r
-            for (int disp = 0; disp < cndisp; disp++)\r
+                ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));\r
+            }\r
+            else\r
              {\r
-                if (x - disp >= 1)\r
-                {\r
-                    float val = pixDiff(rs - disp * cn);\r
-\r
-                    ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));\r
-                }\r
-                else\r
-                {\r
-                    ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);\r
-                }\r
+                ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);\r
              }\r
          }\r
      }\r
+}\r
  \r
-    template<typename T, typename D>\r
-    void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);\r
+template<typename T, typename D>\r
+void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);\r
  \r
-    template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(left.cols, threads.x);\r
-        grid.y = divUp(left.rows, threads.y);\r
+    grid.x = divUp(left.cols, threads.x);\r
+    grid.y = divUp(left.rows, threads.y);\r
  \r
-        comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-    template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(left.cols, threads.x);\r
-        grid.y = divUp(left.rows, threads.y);\r
+    grid.x = divUp(left.cols, threads.x);\r
+    grid.y = divUp(left.rows, threads.y);\r
  \r
-        comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(left.cols, threads.x);\r
-        grid.y = divUp(left.rows, threads.y);\r
+    grid.x = divUp(left.cols, threads.x);\r
+    grid.y = divUp(left.rows, threads.y);\r
  \r
-        comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-    template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(left.cols, threads.x);\r
-        grid.y = divUp(left.rows, threads.y);\r
+    grid.x = divUp(left.cols, threads.x);\r
+    grid.y = divUp(left.rows, threads.y);\r
  \r
-        comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(left.cols, threads.x);\r
-        grid.y = divUp(left.rows, threads.y);\r
+    grid.x = divUp(left.cols, threads.x);\r
+    grid.y = divUp(left.rows, threads.y);\r
  \r
-        comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-    template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(left.cols, threads.x);\r
-        grid.y = divUp(left.rows, threads.y);\r
+    grid.x = divUp(left.cols, threads.x);\r
+    grid.y = divUp(left.rows, threads.y);\r
  \r
-        comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
  ///////////////////////////////////////////////////////////////\r
  //////////////////////// data step down ///////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    template <typename T>\r
-    __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename T>\r
+__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < dst_cols && y < dst_rows)\r
+    if (x < dst_cols && y < dst_rows)\r
+    {\r
+        for (int d = 0; d < cndisp; ++d)\r
          {\r
-            for (int d = 0; d < cndisp; ++d)\r
-            {\r
-                float dst_reg  = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];\r
-                      dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];\r
-                      dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];\r
-                      dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];\r
+            float dst_reg  = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];\r
+                  dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];\r
+                  dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];\r
+                  dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];\r
  \r
-                dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);\r
-            }\r
+            dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);\r
          }\r
      }\r
+}\r
  \r
-    template<typename T>\r
-    void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template<typename T>\r
+void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(dst_cols, threads.x);\r
-        grid.y = divUp(dst_rows, threads.y);\r
+    grid.x = divUp(dst_cols, threads.x);\r
+    grid.y = divUp(dst_rows, threads.y);\r
  \r
-        data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
-    template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
+template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
+template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
  \r
  ///////////////////////////////////////////////////////////////\r
  /////////////////// level up messages  ////////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    template <typename T>\r
-    __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)\r
-    {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename T>\r
+__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (x < dst_cols && y < dst_rows)\r
-        {\r
-            const size_t dst_disp_step = dst.step * dst_rows;\r
-            const size_t src_disp_step = src.step * src_rows;\r
+    if (x < dst_cols && y < dst_rows)\r
+    {\r
+        const size_t dst_disp_step = dst.step * dst_rows;\r
+        const size_t src_disp_step = src.step * src_rows;\r
  \r
-            T*       dstr = dst.ptr(y  ) + x;\r
-            const T* srcr = src.ptr(y/2) + x/2;\r
+        T*       dstr = dst.ptr(y  ) + x;\r
+        const T* srcr = src.ptr(y/2) + x/2;\r
  \r
-            for (int d = 0; d < cndisp; ++d)\r
-                dstr[d * dst_disp_step] = srcr[d * src_disp_step];\r
-        }\r
+        for (int d = 0; d < cndisp; ++d)\r
+            dstr[d * dst_disp_step] = srcr[d * src_disp_step];\r
      }\r
+}\r
  \r
-    template <typename T>\r
-    void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <typename T>\r
+void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(dst_cols, threads.x);\r
-        grid.y = divUp(dst_rows, threads.y);\r
+    grid.x = divUp(dst_cols, threads.x);\r
+    grid.y = divUp(dst_rows, threads.y);\r
  \r
-        int src_idx = (dst_idx + 1) & 1;\r
+    int src_idx = (dst_idx + 1) & 1;\r
  \r
-        level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);\r
-        cudaSafeCall( cudaGetLastError() );\r
-        level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);\r
-        cudaSafeCall( cudaGetLastError() );\r
-        level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);\r
-        cudaSafeCall( cudaGetLastError() );\r
-        level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
-    template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
+template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
+template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);\r
  \r
  ///////////////////////////////////////////////////////////////\r
  ////////////////////  calc all iterations /////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    template <typename T>\r
-    __device__ void calc_min_linear_penalty(T* dst, size_t step)\r
+template <typename T>\r
+__device__ void calc_min_linear_penalty(T* dst, size_t step)\r
+{\r
+    float prev = dst[0];\r
+    float cur;\r
+    for (int disp = 1; disp < cndisp; ++disp)\r
      {\r
-        float prev = dst[0];\r
-        float cur;\r
-        for (int disp = 1; disp < cndisp; ++disp)\r
+        prev += cdisc_single_jump;\r
+        cur = dst[step * disp];\r
+        if (prev < cur)\r
          {\r
-            prev += cdisc_single_jump;\r
-            cur = dst[step * disp];\r
-            if (prev < cur)\r
-            {\r
-                cur = prev;\r
-                dst[step * disp] = saturate_cast<T>(prev);\r
-            }\r
-            prev = cur;\r
+            cur = prev;\r
+            dst[step * disp] = saturate_cast<T>(prev);\r
          }\r
+        prev = cur;\r
+    }\r
  \r
-        prev = dst[(cndisp - 1) * step];\r
-        for (int disp = cndisp - 2; disp >= 0; disp--)\r
+    prev = dst[(cndisp - 1) * step];\r
+    for (int disp = cndisp - 2; disp >= 0; disp--)\r
+    {\r
+        prev += cdisc_single_jump;\r
+        cur = dst[step * disp];\r
+        if (prev < cur)\r
          {\r
-            prev += cdisc_single_jump;\r
-            cur = dst[step * disp];\r
-            if (prev < cur)\r
-            {\r
-                cur = prev;\r
-                dst[step * disp] = saturate_cast<T>(prev);\r
-            }\r
-            prev = cur;\r
+            cur = prev;\r
+            dst[step * disp] = saturate_cast<T>(prev);\r
          }\r
+        prev = cur;\r
      }\r
+}\r
  \r
-    template <typename T>\r
-    __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)\r
-    {\r
-        float minimum = numeric_limits<float>::max();\r
+template <typename T>\r
+__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)\r
+{\r
+    float minimum = device::numeric_limits<float>::max();\r
  \r
-        for(int i = 0; i < cndisp; ++i)\r
-        {\r
-            float dst_reg  = msg1[msg_disp_step * i];\r
-                  dst_reg += msg2[msg_disp_step * i];\r
-                  dst_reg += msg3[msg_disp_step * i];\r
-                  dst_reg += data[data_disp_step * i];\r
+    for(int i = 0; i < cndisp; ++i)\r
+    {\r
+        float dst_reg  = msg1[msg_disp_step * i];\r
+              dst_reg += msg2[msg_disp_step * i];\r
+              dst_reg += msg3[msg_disp_step * i];\r
+              dst_reg += data[data_disp_step * i];\r
  \r
-            if (dst_reg < minimum)\r
-                minimum = dst_reg;\r
+        if (dst_reg < minimum)\r
+            minimum = dst_reg;\r
  \r
-            dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);\r
-        }\r
+        dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);\r
+    }\r
  \r
-        calc_min_linear_penalty(dst, msg_disp_step);\r
+    calc_min_linear_penalty(dst, msg_disp_step);\r
  \r
-        minimum += cmax_disc_term;\r
+    minimum += cmax_disc_term;\r
  \r
-        float sum = 0;\r
-        for(int i = 0; i < cndisp; ++i)\r
+    float sum = 0;\r
+    for(int i = 0; i < cndisp; ++i)\r
+    {\r
+        float dst_reg = dst[msg_disp_step * i];\r
+        if (dst_reg > minimum)\r
          {\r
-            float dst_reg = dst[msg_disp_step * i];\r
-            if (dst_reg > minimum)\r
-            {\r
-                dst_reg = minimum;\r
-                dst[msg_disp_step * i] = saturate_cast<T>(minimum);\r
-            }\r
-            sum += dst_reg;\r
+            dst_reg = minimum;\r
+            dst[msg_disp_step * i] = saturate_cast<T>(minimum);\r
          }\r
-        sum /= cndisp;\r
-\r
-        for(int i = 0; i < cndisp; ++i)\r
-            dst[msg_disp_step * i] -= sum;\r
+        sum += dst_reg;\r
      }\r
+    sum /= cndisp;\r
  \r
-    template <typename T>\r
-    __global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)\r
-    {\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-        const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);\r
+    for(int i = 0; i < cndisp; ++i)\r
+        dst[msg_disp_step * i] -= sum;\r
+}\r
  \r
-        if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))\r
-        {\r
-            T* us = u.ptr(y) + x;\r
-            T* ds = d + y * u.step + x;\r
-            T* ls = l + y * u.step + x;\r
-            T* rs = r + y * u.step + x;\r
-            const T* dt = data.ptr(y) + x;\r
-\r
-            size_t msg_disp_step = u.step * rows;\r
-            size_t data_disp_step = data.step * rows;\r
-\r
-            message(us + u.step, ls      + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);\r
-            message(ds - u.step, ls      + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);\r
-            message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);\r
-            message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);\r
-        }\r
-    }\r
+template <typename T>\r
+__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)\r
+{\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+    const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);\r
  \r
-    template <typename T>\r
-    void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,\r
-        const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)\r
+    if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))\r
      {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+        T* us = u.ptr(y) + x;\r
+        T* ds = d + y * u.step + x;\r
+        T* ls = l + y * u.step + x;\r
+        T* rs = r + y * u.step + x;\r
+        const T* dt = data.ptr(y) + x;\r
+\r
+        size_t msg_disp_step = u.step * rows;\r
+        size_t data_disp_step = data.step * rows;\r
+\r
+        message(us + u.step, ls      + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);\r
+        message(ds - u.step, ls      + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);\r
+        message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);\r
+        message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);\r
+    }\r
+}\r
  \r
-        grid.x = divUp(cols, threads.x << 1);\r
-        grid.y = divUp(rows, threads.y);\r
+template <typename T>\r
+void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,\r
+    const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        for(int t = 0; t < iters; ++t)\r
-        {\r
-            one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);\r
-            cudaSafeCall( cudaGetLastError() );\r
+    grid.x = divUp(cols, threads.x << 1);\r
+    grid.y = divUp(rows, threads.y);\r
  \r
-            if (stream == 0)\r
-                cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
+    for(int t = 0; t < iters; ++t)\r
+    {\r
+        one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);\r
+        cudaSafeCall( cudaGetLastError() );\r
+\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
      }\r
+}\r
  \r
-    template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
-    template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
+template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
+template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);\r
  \r
  ///////////////////////////////////////////////////////////////\r
  /////////////////////////// output ////////////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    template <typename T>\r
-    __global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,\r
-        DevMem2D_<short> disp)\r
+template <typename T>\r
+__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,\r
+    DevMem2D_<short> disp)\r
+{\r
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)\r
      {\r
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+        const T* us = u.ptr(y + 1) + x;\r
+        const T* ds = d + (y - 1) * u.step + x;\r
+        const T* ls = l + y * u.step + (x + 1);\r
+        const T* rs = r + y * u.step + (x - 1);\r
+        const T* dt = data + y * u.step + x;\r
  \r
-        if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)\r
-        {\r
-            const T* us = u.ptr(y + 1) + x;\r
-            const T* ds = d + (y - 1) * u.step + x;\r
-            const T* ls = l + y * u.step + (x + 1);\r
-            const T* rs = r + y * u.step + (x - 1);\r
-            const T* dt = data + y * u.step + x;\r
+        size_t disp_step = disp.rows * u.step;\r
  \r
-            size_t disp_step = disp.rows * u.step;\r
+        int best = 0;\r
+        float best_val = numeric_limits<float>::max();\r
+        for (int d = 0; d < cndisp; ++d)\r
+        {\r
+            float val  = us[d * disp_step];\r
+                  val += ds[d * disp_step];\r
+                  val += ls[d * disp_step];\r
+                  val += rs[d * disp_step];\r
+                  val += dt[d * disp_step];\r
  \r
-            int best = 0;\r
-            float best_val = numeric_limits<float>::max();\r
-            for (int d = 0; d < cndisp; ++d)\r
+            if (val < best_val)\r
              {\r
-                float val  = us[d * disp_step];\r
-                      val += ds[d * disp_step];\r
-                      val += ls[d * disp_step];\r
-                      val += rs[d * disp_step];\r
-                      val += dt[d * disp_step];\r
-\r
-                if (val < best_val)\r
-                {\r
-                    best_val = val;\r
-                    best = d;\r
-                }\r
+                best_val = val;\r
+                best = d;\r
              }\r
-\r
-            disp.ptr(y)[x] = saturate_cast<short>(best);\r
          }\r
+\r
+        disp.ptr(y)[x] = saturate_cast<short>(best);\r
      }\r
+}\r
  \r
-    template <typename T>\r
-    void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,\r
-        const DevMem2D_<short>& disp, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <typename T>\r
+void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,\r
+    const DevMem2D_<short>& disp, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(disp.cols, threads.x);\r
-        grid.y = divUp(disp.rows, threads.y);\r
+    grid.x = divUp(disp.cols, threads.x);\r
+    grid.y = divUp(disp.rows, threads.y);\r
  \r
-        output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);\r
+template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);\r
+\r
+} // namespace stereobp\r
  \r
-    template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);\r
-    template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/stereocsbp.cu b/modules/gpu/src/cuda/stereocsbp.cu

index e43800b..bb8e713 100644 (file)
--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
@@ -44,555 +44,552 @@
  #include "opencv2/gpu/device/saturate_cast.hpp"\r
  #include "opencv2/gpu/device/limits.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-\r
-namespace cv { namespace gpu { namespace csbp\r
-{  \r
+namespace stereocsbp {\r
  \r
  ///////////////////////////////////////////////////////////////\r
  /////////////////////// load constants ////////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    __constant__ int cndisp;\r
+__constant__ int cndisp;\r
  \r
-    __constant__ float cmax_data_term;\r
-    __constant__ float cdata_weight;\r
-    __constant__ float cmax_disc_term;\r
-    __constant__ float cdisc_single_jump;\r
+__constant__ float cmax_data_term;\r
+__constant__ float cdata_weight;\r
+__constant__ float cmax_disc_term;\r
+__constant__ float cdisc_single_jump;\r
  \r
-    __constant__ int cth;\r
+__constant__ int cth;\r
  \r
-    __constant__ size_t cimg_step;\r
-    __constant__ size_t cmsg_step1;\r
-    __constant__ size_t cmsg_step2;\r
-    __constant__ size_t cdisp_step1;\r
-    __constant__ size_t cdisp_step2;\r
+__constant__ size_t cimg_step;\r
+__constant__ size_t cmsg_step1;\r
+__constant__ size_t cmsg_step2;\r
+__constant__ size_t cdisp_step1;\r
+__constant__ size_t cdisp_step2;\r
  \r
-    __constant__ uchar* cleft;\r
-    __constant__ uchar* cright;\r
-    __constant__ uchar* ctemp;\r
+__constant__ uchar* cleft;\r
+__constant__ uchar* cright;\r
+__constant__ uchar* ctemp;\r
  \r
  \r
-    void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,\r
-                        const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp)\r
-    {\r
-        cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );\r
+void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,\r
+                    const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );\r
  \r
-        cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );\r
  \r
-        cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );\r
  \r
-        cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );\r
  \r
-        cudaSafeCall( cudaMemcpyToSymbol(cleft,  &left.data,  sizeof(left.data)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );\r
-    }\r
+    cudaSafeCall( cudaMemcpyToSymbol(cleft,  &left.data,  sizeof(left.data)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );\r
+}\r
  \r
  ///////////////////////////////////////////////////////////////\r
  /////////////////////// init data cost ////////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    template <int channels> struct DataCostPerPixel;\r
-    template <> struct DataCostPerPixel<1>\r
+template <int channels> struct DataCostPerPixel;\r
+template <> struct DataCostPerPixel<1>\r
+{\r
+    static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
      {\r
-        static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
-        {\r
-            return fmin(cdata_weight * abs((int)*left - *right), cdata_weight * cmax_data_term);\r
-        }\r
-    };\r
-    template <> struct DataCostPerPixel<3>\r
+        return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);\r
+    }\r
+};\r
+template <> struct DataCostPerPixel<3>\r
+{\r
+    static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
      {\r
-        static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
-        {\r
-            float tb = 0.114f * abs((int)left[0] - right[0]);\r
-            float tg = 0.587f * abs((int)left[1] - right[1]);\r
-            float tr = 0.299f * abs((int)left[2] - right[2]);\r
+        float tb = 0.114f * ::abs((int)left[0] - right[0]);\r
+        float tg = 0.587f * ::abs((int)left[1] - right[1]);\r
+        float tr = 0.299f * ::abs((int)left[2] - right[2]);\r
  \r
-            return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);\r
-        }\r
-    };\r
-    template <> struct DataCostPerPixel<4>\r
+        return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);\r
+    }\r
+};\r
+template <> struct DataCostPerPixel<4>\r
+{\r
+    static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
      {\r
-        static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)\r
-        {\r
-            uchar4 l = *((const uchar4*)left);\r
-            uchar4 r = *((const uchar4*)right);\r
+        uchar4 l = *((const uchar4*)left);\r
+        uchar4 r = *((const uchar4*)right);\r
  \r
-            float tb = 0.114f * abs((int)l.x - r.x);\r
-            float tg = 0.587f * abs((int)l.y - r.y);\r
-            float tr = 0.299f * abs((int)l.z - r.z);\r
+        float tb = 0.114f * ::abs((int)l.x - r.x);\r
+        float tg = 0.587f * ::abs((int)l.y - r.y);\r
+        float tr = 0.299f * ::abs((int)l.z - r.z);\r
  \r
-            return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);\r
-        }\r
-    };\r
+        return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);\r
+    }\r
+};\r
  \r
-    template <typename T>\r
-    __global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)\r
+template <typename T>\r
+__global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)\r
+{\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    if (y < h && x < w)\r
      {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+        T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;\r
+        T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
+        T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
  \r
-        if (y < h && x < w)\r
+        for(int i = 0; i < nr_plane; i++)\r
          {\r
-            T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;\r
-            T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
-            T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
-\r
-            for(int i = 0; i < nr_plane; i++)\r
+            T minimum = device::numeric_limits<T>::max();\r
+            int id = 0;\r
+            for(int d = 0; d < cndisp; d++)\r
              {\r
-                T minimum = numeric_limits<T>::max();\r
-                int id = 0;\r
-                for(int d = 0; d < cndisp; d++)\r
+                T cur = data_cost[d * cdisp_step1];\r
+                if(cur < minimum)\r
                  {\r
-                    T cur = data_cost[d * cdisp_step1];\r
-                    if(cur < minimum)\r
-                    {\r
-                        minimum = cur;\r
-                        id = d;\r
-                    }\r
+                    minimum = cur;\r
+                    id = d;\r
                  }\r
-\r
-                data_cost_selected[i  * cdisp_step1] = minimum;\r
-                selected_disparity[i  * cdisp_step1] = id;\r
-                data_cost         [id * cdisp_step1] = numeric_limits<T>::max();\r
              }\r
+\r
+            data_cost_selected[i  * cdisp_step1] = minimum;\r
+            selected_disparity[i  * cdisp_step1] = id;\r
+            data_cost         [id * cdisp_step1] = numeric_limits<T>::max();\r
          }\r
      }\r
+}\r
  \r
  \r
-    template <typename T>\r
-    __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)\r
-    {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename T>\r
+__global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)\r
+{\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (y < h && x < w)\r
-        {\r
-            T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;\r
-            T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
-            T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
+    if (y < h && x < w)\r
+    {\r
+        T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;\r
+        T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
+        T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
  \r
-            int nr_local_minimum = 0;\r
+        int nr_local_minimum = 0;\r
  \r
-            T prev = data_cost[0 * cdisp_step1];\r
-            T cur  = data_cost[1 * cdisp_step1];\r
-            T next = data_cost[2 * cdisp_step1];\r
+        T prev = data_cost[0 * cdisp_step1];\r
+        T cur  = data_cost[1 * cdisp_step1];\r
+        T next = data_cost[2 * cdisp_step1];\r
  \r
-            for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)\r
+        for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)\r
+        {\r
+            if (cur < prev && cur < next)\r
              {\r
-                if (cur < prev && cur < next)\r
-                {\r
-                    data_cost_selected[nr_local_minimum * cdisp_step1] = cur;\r
-                    selected_disparity[nr_local_minimum * cdisp_step1] = d;\r
+                data_cost_selected[nr_local_minimum * cdisp_step1] = cur;\r
+                selected_disparity[nr_local_minimum * cdisp_step1] = d;\r
  \r
-                    data_cost[d * cdisp_step1] = numeric_limits<T>::max();\r
+                data_cost[d * cdisp_step1] = numeric_limits<T>::max();\r
  \r
-                    nr_local_minimum++;\r
-                }\r
-                prev = cur;\r
-                cur = next;\r
-                next = data_cost[(d + 1) * cdisp_step1];\r
+                nr_local_minimum++;\r
              }\r
+            prev = cur;\r
+            cur = next;\r
+            next = data_cost[(d + 1) * cdisp_step1];\r
+        }\r
  \r
-            for (int i = nr_local_minimum; i < nr_plane; i++)\r
-            {\r
-                T minimum = numeric_limits<T>::max();\r
-                int id = 0;\r
+        for (int i = nr_local_minimum; i < nr_plane; i++)\r
+        {\r
+            T minimum = numeric_limits<T>::max();\r
+            int id = 0;\r
  \r
-                for (int d = 0; d < cndisp; d++)\r
+            for (int d = 0; d < cndisp; d++)\r
+            {\r
+                cur = data_cost[d * cdisp_step1];\r
+                if (cur < minimum)\r
                  {\r
-                    cur = data_cost[d * cdisp_step1];\r
-                    if (cur < minimum)\r
-                    {\r
-                        minimum = cur;\r
-                        id = d;\r
-                    }\r
+                    minimum = cur;\r
+                    id = d;\r
                  }\r
-                data_cost_selected[i * cdisp_step1] = minimum;\r
-                selected_disparity[i * cdisp_step1] = id;\r
-\r
-                data_cost[id * cdisp_step1] = numeric_limits<T>::max();\r
              }\r
+            data_cost_selected[i * cdisp_step1] = minimum;\r
+            selected_disparity[i * cdisp_step1] = id;\r
+\r
+            data_cost[id * cdisp_step1] = numeric_limits<T>::max();\r
          }\r
      }\r
+}\r
  \r
-    template <typename T, int channels>\r
-    __global__ void init_data_cost(int h, int w, int level)\r
-    {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename T, int channels>\r
+__global__ void init_data_cost(int h, int w, int level)\r
+{\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (y < h && x < w)\r
-        {\r
-            int y0 = y << level;\r
-            int yt = (y + 1) << level;\r
+    if (y < h && x < w)\r
+    {\r
+        int y0 = y << level;\r
+        int yt = (y + 1) << level;\r
  \r
-            int x0 = x << level;\r
-            int xt = (x + 1) << level;\r
+        int x0 = x << level;\r
+        int xt = (x + 1) << level;\r
  \r
-            T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
+        T* data_cost = (T*)ctemp + y * cmsg_step1 + x;\r
  \r
-            for(int d = 0; d < cndisp; ++d)\r
+        for(int d = 0; d < cndisp; ++d)\r
+        {\r
+            float val = 0.0f;\r
+            for(int yi = y0; yi < yt; yi++)\r
              {\r
-                float val = 0.0f;\r
-                for(int yi = y0; yi < yt; yi++)\r
+                for(int xi = x0; xi < xt; xi++)\r
                  {\r
-                    for(int xi = x0; xi < xt; xi++)\r
+                    int xr = xi - d;\r
+                    if(d < cth || xr < 0)\r
+                        val += cdata_weight * cmax_data_term;\r
+                    else\r
                      {\r
-                        int xr = xi - d;\r
-                        if(d < cth || xr < 0)\r
-                            val += cdata_weight * cmax_data_term;\r
-                        else\r
-                        {\r
-                            const uchar* lle = cleft + yi * cimg_step + xi * channels;\r
-                            const uchar* lri = cright + yi * cimg_step + xr * channels;\r
-\r
-                            val += DataCostPerPixel<channels>::compute(lle, lri);\r
-                        }\r
+                        const uchar* lle = cleft + yi * cimg_step + xi * channels;\r
+                        const uchar* lri = cright + yi * cimg_step + xr * channels;\r
+\r
+                        val += DataCostPerPixel<channels>::compute(lle, lri);\r
                      }\r
                  }\r
-                data_cost[cdisp_step1 * d] = saturate_cast<T>(val);\r
              }\r
+            data_cost[cdisp_step1 * d] = saturate_cast<T>(val);\r
          }\r
      }\r
+}\r
  \r
-    template <typename T, int winsz, int channels>\r
-    __global__ void init_data_cost_reduce(int level, int rows, int cols, int h)\r
-    {\r
-        int x_out = blockIdx.x;\r
-        int y_out = blockIdx.y % h;\r
-        int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;\r
+template <typename T, int winsz, int channels>\r
+__global__ void init_data_cost_reduce(int level, int rows, int cols, int h)\r
+{\r
+    int x_out = blockIdx.x;\r
+    int y_out = blockIdx.y % h;\r
+    int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;\r
  \r
-        int tid = threadIdx.x;\r
+    int tid = threadIdx.x;\r
  \r
-        if (d < cndisp)\r
-        {\r
-            int x0 = x_out << level;\r
-            int y0 = y_out << level;\r
+    if (d < cndisp)\r
+    {\r
+        int x0 = x_out << level;\r
+        int y0 = y_out << level;\r
  \r
-            int len = min(y0 + winsz, rows) - y0;\r
+        int len = ::min(y0 + winsz, rows) - y0;\r
  \r
-            float val = 0.0f;\r
-            if (x0 + tid < cols)\r
+        float val = 0.0f;\r
+        if (x0 + tid < cols)\r
+        {\r
+            if (x0 + tid - d < 0 || d < cth)\r
+                val = cdata_weight * cmax_data_term * len;\r
+            else\r
              {\r
-                if (x0 + tid - d < 0 || d < cth)\r
-                    val = cdata_weight * cmax_data_term * len;\r
-                else\r
-                {\r
-                    const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );\r
-                    const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);\r
+                const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );\r
+                const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);\r
  \r
-                    for(int y = 0; y < len; ++y)\r
-                    {\r
-                        val += DataCostPerPixel<channels>::compute(lle, lri);\r
+                for(int y = 0; y < len; ++y)\r
+                {\r
+                    val += DataCostPerPixel<channels>::compute(lle, lri);\r
  \r
-                        lle += cimg_step;\r
-                        lri += cimg_step;\r
-                    }\r
+                    lle += cimg_step;\r
+                    lri += cimg_step;\r
                  }\r
              }\r
+        }\r
  \r
-            extern __shared__ float smem[];\r
-            float* dline = smem + winsz * threadIdx.z;\r
+        extern __shared__ float smem[];\r
+        float* dline = smem + winsz * threadIdx.z;\r
  \r
-            dline[tid] = val;\r
+        dline[tid] = val;\r
  \r
-            __syncthreads();\r
+        __syncthreads();\r
  \r
-            if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }\r
-            if (winsz >= 128) { if (tid <  64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }\r
+        if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }\r
+        if (winsz >= 128) { if (tid <  64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }\r
  \r
-                       volatile float* vdline = smem + winsz * threadIdx.z;\r
+               volatile float* vdline = smem + winsz * threadIdx.z;\r
  \r
-            if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];\r
-            if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];\r
-            if (winsz >= 16) if (tid <  8) vdline[tid] += vdline[tid + 8];\r
-            if (winsz >=  8) if (tid <  4) vdline[tid] += vdline[tid + 4];\r
-            if (winsz >=  4) if (tid <  2) vdline[tid] += vdline[tid + 2];\r
-            if (winsz >=  2) if (tid <  1) vdline[tid] += vdline[tid + 1];\r
+        if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];\r
+        if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];\r
+        if (winsz >= 16) if (tid <  8) vdline[tid] += vdline[tid + 8];\r
+        if (winsz >=  8) if (tid <  4) vdline[tid] += vdline[tid + 4];\r
+        if (winsz >=  4) if (tid <  2) vdline[tid] += vdline[tid + 2];\r
+        if (winsz >=  2) if (tid <  1) vdline[tid] += vdline[tid + 1];\r
  \r
-            T* data_cost = (T*)ctemp + y_out * cmsg_step1 + x_out;\r
+        T* data_cost = (T*)ctemp + y_out * cmsg_step1 + x_out;\r
  \r
-            if (tid == 0)\r
-                data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);\r
-        }\r
+        if (tid == 0)\r
+            data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);\r
      }\r
+}\r
  \r
  \r
-    template <typename T>\r
-    void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <typename T>\r
+void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(w, threads.x);\r
-        grid.y = divUp(h, threads.y);\r
+    grid.x = divUp(w, threads.x);\r
+    grid.y = divUp(h, threads.y);\r
  \r
-        switch (channels)\r
-        {\r
-        case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;\r
-        case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;\r
-        case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;\r
-        default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
-        }\r
+    switch (channels)\r
+    {\r
+    case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;\r
+    case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;\r
+    case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;\r
+    default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
      }\r
+}\r
  \r
-    template <typename T, int winsz>\r
-    void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)\r
-    {\r
-        const int threadsNum = 256;\r
-        const size_t smem_size = threadsNum * sizeof(float);\r
+template <typename T, int winsz>\r
+void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)\r
+{\r
+    const int threadsNum = 256;\r
+    const size_t smem_size = threadsNum * sizeof(float);\r
  \r
-        dim3 threads(winsz, 1, threadsNum / winsz);\r
-        dim3 grid(w, h, 1);\r
-        grid.y *= divUp(ndisp, threads.z);\r
+    dim3 threads(winsz, 1, threadsNum / winsz);\r
+    dim3 grid(w, h, 1);\r
+    grid.y *= divUp(ndisp, threads.z);\r
  \r
-        switch (channels)\r
-        {\r
-        case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
-        case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
-        case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
-        default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
-        }\r
+    switch (channels)\r
+    {\r
+    case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
+    case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
+    case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;\r
+    default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
      }\r
+}\r
  \r
-    template<class T>\r
-    void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,\r
-                int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)\r
-    {\r
+template<class T>\r
+void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,\r
+            int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)\r
+{\r
  \r
-        typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);\r
+    typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);\r
  \r
-        static const InitDataCostCaller init_data_cost_callers[] =\r
-        {\r
-            init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,\r
-            init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,\r
-            init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>\r
-        };\r
+    static const InitDataCostCaller init_data_cost_callers[] =\r
+    {\r
+        init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,\r
+        init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,\r
+        init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>\r
+    };\r
  \r
-        size_t disp_step = msg_step * h;\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );\r
+    size_t disp_step = msg_step * h;\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );\r
  \r
-        init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
  \r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(w, threads.x);\r
-        grid.y = divUp(h, threads.y);\r
+    grid.x = divUp(w, threads.x);\r
+    grid.y = divUp(h, threads.y);\r
  \r
-        if (use_local_init_data_cost == true)\r
-            get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
-        else\r
-            get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
-        \r
-        cudaSafeCall( cudaGetLastError() );\r
+    if (use_local_init_data_cost == true)\r
+        get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
+    else\r
+        get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);\r
+    \r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,\r
-                int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
+template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,\r
+            int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
  \r
-    template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,\r
-                int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
+template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,\r
+            int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);\r
  \r
  ///////////////////////////////////////////////////////////////\r
  ////////////////////// compute data cost //////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    template <typename T, int channels>\r
-    __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)\r
-    {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+template <typename T, int channels>\r
+__global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)\r
+{\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
  \r
-        if (y < h && x < w)\r
-        {\r
-            int y0 = y << level;\r
-            int yt = (y + 1) << level;\r
+    if (y < h && x < w)\r
+    {\r
+        int y0 = y << level;\r
+        int yt = (y + 1) << level;\r
  \r
-            int x0 = x << level;\r
-            int xt = (x + 1) << level;\r
+        int x0 = x << level;\r
+        int xt = (x + 1) << level;\r
  \r
-            const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;\r
-            T* data_cost = data_cost_ + y * cmsg_step1 + x;\r
+        const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;\r
+        T* data_cost = data_cost_ + y * cmsg_step1 + x;\r
  \r
-            for(int d = 0; d < nr_plane; d++)\r
+        for(int d = 0; d < nr_plane; d++)\r
+        {\r
+            float val = 0.0f;\r
+            for(int yi = y0; yi < yt; yi++)\r
              {\r
-                float val = 0.0f;\r
-                for(int yi = y0; yi < yt; yi++)\r
+                for(int xi = x0; xi < xt; xi++)\r
                  {\r
-                    for(int xi = x0; xi < xt; xi++)\r
+                    int sel_disp = selected_disparity[d * cdisp_step2];\r
+                    int xr = xi - sel_disp;\r
+\r
+                    if (xr < 0 || sel_disp < cth)\r
+                        val += cdata_weight * cmax_data_term;\r
+                    else\r
                      {\r
-                        int sel_disp = selected_disparity[d * cdisp_step2];\r
-                        int xr = xi - sel_disp;\r
-\r
-                        if (xr < 0 || sel_disp < cth)\r
-                            val += cdata_weight * cmax_data_term;\r
-                        else\r
-                        {\r
-                            const uchar* left_x = cleft + yi * cimg_step + xi * channels;\r
-                            const uchar* right_x = cright + yi * cimg_step + xr * channels;\r
-\r
-                            val += DataCostPerPixel<channels>::compute(left_x, right_x);\r
-                        }\r
+                        const uchar* left_x = cleft + yi * cimg_step + xi * channels;\r
+                        const uchar* right_x = cright + yi * cimg_step + xr * channels;\r
+\r
+                        val += DataCostPerPixel<channels>::compute(left_x, right_x);\r
                      }\r
                  }\r
-                data_cost[cdisp_step1 * d] = saturate_cast<T>(val);\r
              }\r
+            data_cost[cdisp_step1 * d] = saturate_cast<T>(val);\r
          }\r
      }\r
+}\r
  \r
-    template <typename T, int winsz, int channels>\r
-    __global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)\r
-    {\r
-        int x_out = blockIdx.x;\r
-        int y_out = blockIdx.y % h;\r
-        int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;\r
+template <typename T, int winsz, int channels>\r
+__global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)\r
+{\r
+    int x_out = blockIdx.x;\r
+    int y_out = blockIdx.y % h;\r
+    int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;\r
  \r
-        int tid = threadIdx.x;\r
+    int tid = threadIdx.x;\r
  \r
-        const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;\r
-        T* data_cost = data_cost_ + y_out * cmsg_step1 + x_out;\r
+    const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;\r
+    T* data_cost = data_cost_ + y_out * cmsg_step1 + x_out;\r
  \r
-        if (d < nr_plane)\r
-        {\r
-            int sel_disp = selected_disparity[d * cdisp_step2];\r
+    if (d < nr_plane)\r
+    {\r
+        int sel_disp = selected_disparity[d * cdisp_step2];\r
  \r
-            int x0 = x_out << level;\r
-            int y0 = y_out << level;\r
+        int x0 = x_out << level;\r
+        int y0 = y_out << level;\r
  \r
-            int len = min(y0 + winsz, rows) - y0;\r
+        int len = ::min(y0 + winsz, rows) - y0;\r
  \r
-            float val = 0.0f;\r
-            if (x0 + tid < cols)\r
+        float val = 0.0f;\r
+        if (x0 + tid < cols)\r
+        {\r
+            if (x0 + tid - sel_disp < 0 || sel_disp < cth)\r
+                val = cdata_weight * cmax_data_term * len;\r
+            else\r
              {\r
-                if (x0 + tid - sel_disp < 0 || sel_disp < cth)\r
-                    val = cdata_weight * cmax_data_term * len;\r
-                else\r
-                {\r
-                    const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );\r
-                    const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);\r
+                const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );\r
+                const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);\r
  \r
-                    for(int y = 0; y < len; ++y)\r
-                    {\r
-                        val += DataCostPerPixel<channels>::compute(lle, lri);\r
+                for(int y = 0; y < len; ++y)\r
+                {\r
+                    val += DataCostPerPixel<channels>::compute(lle, lri);\r
  \r
-                        lle += cimg_step;\r
-                        lri += cimg_step;\r
-                    }\r
+                    lle += cimg_step;\r
+                    lri += cimg_step;\r
                  }\r
              }\r
+        }\r
  \r
-            extern __shared__ float smem[];\r
-            float* dline = smem + winsz * threadIdx.z;\r
+        extern __shared__ float smem[];\r
+        float* dline = smem + winsz * threadIdx.z;\r
  \r
-            dline[tid] = val;\r
+        dline[tid] = val;\r
  \r
-            __syncthreads();\r
+        __syncthreads();\r
  \r
-            if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }\r
-            if (winsz >= 128) { if (tid <  64) { dline[tid] += dline[tid +  64]; } __syncthreads(); }\r
+        if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }\r
+        if (winsz >= 128) { if (tid <  64) { dline[tid] += dline[tid +  64]; } __syncthreads(); }\r
  \r
-                       volatile float* vdline = smem + winsz * threadIdx.z;\r
+               volatile float* vdline = smem + winsz * threadIdx.z;\r
  \r
-            if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];\r
-            if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];\r
-            if (winsz >= 16) if (tid <  8) vdline[tid] += vdline[tid + 8];\r
-            if (winsz >=  8) if (tid <  4) vdline[tid] += vdline[tid + 4];\r
-            if (winsz >=  4) if (tid <  2) vdline[tid] += vdline[tid + 2];\r
-            if (winsz >=  2) if (tid <  1) vdline[tid] += vdline[tid + 1];\r
+        if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];\r
+        if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];\r
+        if (winsz >= 16) if (tid <  8) vdline[tid] += vdline[tid + 8];\r
+        if (winsz >=  8) if (tid <  4) vdline[tid] += vdline[tid + 4];\r
+        if (winsz >=  4) if (tid <  2) vdline[tid] += vdline[tid + 2];\r
+        if (winsz >=  2) if (tid <  1) vdline[tid] += vdline[tid + 1];\r
  \r
-            if (tid == 0)\r
-                data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);\r
-        }\r
+        if (tid == 0)\r
+            data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);\r
      }\r
+}\r
  \r
-    template <typename T>\r
-    void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,\r
-                                  int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)\r
-    {\r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+template <typename T>\r
+void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,\r
+                              int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)\r
+{\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(w, threads.x);\r
-        grid.y = divUp(h, threads.y);\r
+    grid.x = divUp(w, threads.x);\r
+    grid.y = divUp(h, threads.y);\r
  \r
-        switch(channels)\r
-        {\r
-        case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
-        case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
-        case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
-        default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
-        }\r
+    switch(channels)\r
+    {\r
+    case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
+    case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
+    case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;\r
+    default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
      }\r
+}\r
  \r
-    template <typename T, int winsz>\r
-    void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,\r
-                                  int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)\r
-    {\r
-        const int threadsNum = 256;\r
-        const size_t smem_size = threadsNum * sizeof(float);\r
+template <typename T, int winsz>\r
+void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,\r
+                              int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)\r
+{\r
+    const int threadsNum = 256;\r
+    const size_t smem_size = threadsNum * sizeof(float);\r
  \r
-        dim3 threads(winsz, 1, threadsNum / winsz);\r
-        dim3 grid(w, h, 1);\r
-        grid.y *= divUp(nr_plane, threads.z);\r
+    dim3 threads(winsz, 1, threadsNum / winsz);\r
+    dim3 grid(w, h, 1);\r
+    grid.y *= divUp(nr_plane, threads.z);\r
  \r
-        switch (channels)\r
-        {\r
-        case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
-        case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
-        case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
-        default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
-        }\r
+    switch (channels)\r
+    {\r
+    case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
+    case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
+    case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;\r
+    default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);\r
      }\r
+}\r
  \r
-    template<class T>\r
-    void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,\r
-                           int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)\r
+template<class T>\r
+void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,\r
+                       int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)\r
+{\r
+    typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,\r
+        int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);\r
+\r
+    static const ComputeDataCostCaller callers[] =\r
      {\r
-        typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,\r
-            int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);\r
+        compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,\r
+        compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,\r
+        compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>\r
+    };\r
  \r
-        static const ComputeDataCostCaller callers[] =\r
-        {\r
-            compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,\r
-            compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,\r
-            compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>\r
-        };\r
-\r
-        size_t disp_step1 = msg_step1 * h;\r
-        size_t disp_step2 = msg_step2 * h2;\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step1,  sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2,  &msg_step2,  sizeof(size_t)) );\r
-\r
-        callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    size_t disp_step1 = msg_step1 * h;\r
+    size_t disp_step2 = msg_step2 * h2;\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step1,  sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2,  &msg_step2,  sizeof(size_t)) );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,\r
-                           int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
+template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,\r
+                       int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
  \r
-    template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,\r
-                           int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
+template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,\r
+                       int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);\r
       \r
  \r
  ///////////////////////////////////////////////////////////////\r
@@ -600,229 +597,229 @@ namespace cv { namespace gpu { namespace csbp
  ///////////////////////////////////////////////////////////////\r
  \r
   \r
-     template <typename T>\r
-    __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,\r
-                                                 const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
-                                                 T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,\r
-                                                 const T* data_cost_cur, const T* disparity_selected_cur,\r
-                                                 int nr_plane, int nr_plane2)\r
+ template <typename T>\r
+__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,\r
+                                             const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
+                                             T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,\r
+                                             const T* data_cost_cur, const T* disparity_selected_cur,\r
+                                             int nr_plane, int nr_plane2)\r
+{\r
+    for(int i = 0; i < nr_plane; i++)\r
      {\r
-        for(int i = 0; i < nr_plane; i++)\r
+        T minimum = numeric_limits<T>::max();\r
+        int id = 0;\r
+        for(int j = 0; j < nr_plane2; j++)\r
          {\r
-            T minimum = numeric_limits<T>::max();\r
-            int id = 0;\r
-            for(int j = 0; j < nr_plane2; j++)\r
+            T cur = data_cost_new[j * cdisp_step1];\r
+            if(cur < minimum)\r
              {\r
-                T cur = data_cost_new[j * cdisp_step1];\r
-                if(cur < minimum)\r
-                {\r
-                    minimum = cur;\r
-                    id = j;\r
-                }\r
+                minimum = cur;\r
+                id = j;\r
              }\r
+        }\r
  \r
-            data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];\r
-            disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];\r
+        data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];\r
+        disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];\r
  \r
-            u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];\r
-            d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];\r
-            l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];\r
-            r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];\r
+        u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];\r
+        d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];\r
+        l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];\r
+        r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];\r
  \r
-            data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();\r
-        }\r
+        data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();\r
      }\r
-\r
-    template <typename T>\r
-    __global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,\r
-                                 const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,\r
-                                 T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
-                                 T* data_cost_selected_, const T* data_cost_,\r
-                                 int h, int w, int nr_plane, int h2, int w2, int nr_plane2)\r
+}\r
+\r
+template <typename T>\r
+__global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,\r
+                             const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,\r
+                             T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
+                             T* data_cost_selected_, const T* data_cost_,\r
+                             int h, int w, int nr_plane, int h2, int w2, int nr_plane2)\r
+{\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    if (y < h && x < w)\r
      {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-\r
-        if (y < h && x < w)\r
-        {\r
-            const T* u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2;\r
-            const T* d_cur = d_cur_ + max(0, y/2 - 1)    * cmsg_step2 + x/2;\r
-            const T* l_cur = l_cur_ + y/2                * cmsg_step2 + min(w2-1, x/2 + 1);\r
-            const T* r_cur = r_cur_ + y/2                * cmsg_step2 + max(0, x/2 - 1);\r
+        const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step2 + x/2;\r
+        const T* d_cur = d_cur_ + ::max(0, y/2 - 1)    * cmsg_step2 + x/2;\r
+        const T* l_cur = l_cur_ + y/2                  * cmsg_step2 + ::min(w2-1, x/2 + 1);\r
+        const T* r_cur = r_cur_ + y/2                  * cmsg_step2 + ::max(0, x/2 - 1);\r
  \r
-            T* data_cost_new = (T*)ctemp + y * cmsg_step1 + x;\r
+        T* data_cost_new = (T*)ctemp + y * cmsg_step1 + x;\r
  \r
-            const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;\r
-            const T* data_cost = data_cost_ + y * cmsg_step1 + x;\r
+        const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;\r
+        const T* data_cost = data_cost_ + y * cmsg_step1 + x;\r
  \r
-            for(int d = 0; d < nr_plane2; d++)\r
-            {\r
-                int idx2 = d * cdisp_step2;\r
-\r
-                T val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];\r
-                data_cost_new[d * cdisp_step1] = val;\r
-            }\r
-\r
-            T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
-            T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;\r
-\r
-            T* u_new = u_new_ + y * cmsg_step1 + x;\r
-            T* d_new = d_new_ + y * cmsg_step1 + x;\r
-            T* l_new = l_new_ + y * cmsg_step1 + x;\r
-            T* r_new = r_new_ + y * cmsg_step1 + x;\r
-\r
-            u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;\r
-            d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;\r
-            l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;\r
-            r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;\r
+        for(int d = 0; d < nr_plane2; d++)\r
+        {\r
+            int idx2 = d * cdisp_step2;\r
  \r
-            get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,\r
-                                         data_cost_selected, disparity_selected_new, data_cost_new,\r
-                                         data_cost, disparity_selected_cur, nr_plane, nr_plane2);\r
+            T val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];\r
+            data_cost_new[d * cdisp_step1] = val;\r
          }\r
-    }\r
-\r
-\r
-    template<class T>\r
-    void init_message(T* u_new, T* d_new, T* l_new, T* r_new,\r
-                      const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
-                      T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
-                      T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,\r
-                      int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)\r
-    {\r
  \r
-        size_t disp_step1 = msg_step1 * h;\r
-        size_t disp_step2 = msg_step2 * h2;\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,   &msg_step1, sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2,   &msg_step2, sizeof(size_t)) );\r
+        T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;\r
+        T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;\r
  \r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+        T* u_new = u_new_ + y * cmsg_step1 + x;\r
+        T* d_new = d_new_ + y * cmsg_step1 + x;\r
+        T* l_new = l_new_ + y * cmsg_step1 + x;\r
+        T* r_new = r_new_ + y * cmsg_step1 + x;\r
  \r
-        grid.x = divUp(w, threads.x);\r
-        grid.y = divUp(h, threads.y);\r
+        u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;\r
+        d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;\r
+        l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;\r
+        r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;\r
  \r
-        init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,\r
-                                                   u_cur, d_cur, l_cur, r_cur,\r
-                                                   selected_disp_pyr_new, selected_disp_pyr_cur,\r
-                                                   data_cost_selected, data_cost,\r
-                                                   h, w, nr_plane, h2, w2, nr_plane2);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+        get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,\r
+                                     data_cost_selected, disparity_selected_new, data_cost_new,\r
+                                     data_cost, disparity_selected_cur, nr_plane, nr_plane2);\r
      }\r
-\r
-\r
-    template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,\r
-                      const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,\r
-                      short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,\r
-                      short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2,\r
-                      int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);\r
-\r
-    template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,\r
-                      const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,\r
-                      float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,\r
-                      float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2,\r
-                      int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);        \r
+}\r
+\r
+\r
+template<class T>\r
+void init_message(T* u_new, T* d_new, T* l_new, T* r_new,\r
+                  const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,\r
+                  T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,\r
+                  T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,\r
+                  int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)\r
+{\r
+\r
+    size_t disp_step1 = msg_step1 * h;\r
+    size_t disp_step2 = msg_step2 * h2;\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,   &msg_step1, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2,   &msg_step2, sizeof(size_t)) );\r
+\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
+\r
+    grid.x = divUp(w, threads.x);\r
+    grid.y = divUp(h, threads.y);\r
+\r
+    init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,\r
+                                               u_cur, d_cur, l_cur, r_cur,\r
+                                               selected_disp_pyr_new, selected_disp_pyr_cur,\r
+                                               data_cost_selected, data_cost,\r
+                                               h, w, nr_plane, h2, w2, nr_plane2);\r
+    cudaSafeCall( cudaGetLastError() );\r
+\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+\r
+template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,\r
+                  const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,\r
+                  short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,\r
+                  short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2,\r
+                  int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);\r
+\r
+template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,\r
+                  const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,\r
+                  float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,\r
+                  float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2,\r
+                  int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);        \r
  \r
  ///////////////////////////////////////////////////////////////\r
  ////////////////////  calc all iterations /////////////////////\r
  ///////////////////////////////////////////////////////////////\r
  \r
-    template <typename T>\r
-    __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,\r
-                                      const T* dst_disp, const T* src_disp, int nr_plane, T* temp)\r
+template <typename T>\r
+__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,\r
+                                  const T* dst_disp, const T* src_disp, int nr_plane, T* temp)\r
+{\r
+    T minimum = numeric_limits<T>::max();\r
+\r
+    for(int d = 0; d < nr_plane; d++)\r
      {\r
-        T minimum = numeric_limits<T>::max();\r
+        int idx = d * cdisp_step1;\r
+        T val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];\r
  \r
-        for(int d = 0; d < nr_plane; d++)\r
-        {\r
-            int idx = d * cdisp_step1;\r
-            T val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];\r
+        if(val < minimum)\r
+            minimum = val;\r
  \r
-            if(val < minimum)\r
-                minimum = val;\r
+        msg_dst[idx] = val;\r
+    }\r
  \r
-            msg_dst[idx] = val;\r
-        }\r
+    float sum = 0;\r
+    for(int d = 0; d < nr_plane; d++)\r
+    {\r
+        float cost_min = minimum + cmax_disc_term;\r
+        T src_disp_reg = src_disp[d * cdisp_step1];\r
  \r
-        float sum = 0;\r
-        for(int d = 0; d < nr_plane; d++)\r
-        {\r
-            float cost_min = minimum + cmax_disc_term;\r
-            T src_disp_reg = src_disp[d * cdisp_step1];\r
+        for(int d2 = 0; d2 < nr_plane; d2++)\r
+            cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));\r
  \r
-            for(int d2 = 0; d2 < nr_plane; d2++)\r
-                cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));\r
+        temp[d * cdisp_step1] = saturate_cast<T>(cost_min);\r
+        sum += cost_min;\r
+    }\r
+    sum /= nr_plane;\r
  \r
-            temp[d * cdisp_step1] = saturate_cast<T>(cost_min);\r
-            sum += cost_min;\r
-        }\r
-        sum /= nr_plane;\r
+    for(int d = 0; d < nr_plane; d++)\r
+        msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);\r
+}\r
  \r
-        for(int d = 0; d < nr_plane; d++)\r
-            msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);\r
-    }\r
+template <typename T>\r
+__global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)\r
+{\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+    int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);\r
  \r
-    template <typename T>\r
-    __global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)\r
+    if (y > 0 && y < h - 1 && x > 0 && x < w - 1)\r
      {\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
-        int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);\r
-\r
-        if (y > 0 && y < h - 1 && x > 0 && x < w - 1)\r
-        {\r
-            const T* data = data_cost_selected + y * cmsg_step1 + x;\r
+        const T* data = data_cost_selected + y * cmsg_step1 + x;\r
  \r
-            T* u = u_ + y * cmsg_step1 + x;\r
-            T* d = d_ + y * cmsg_step1 + x;\r
-            T* l = l_ + y * cmsg_step1 + x;\r
-            T* r = r_ + y * cmsg_step1 + x;\r
+        T* u = u_ + y * cmsg_step1 + x;\r
+        T* d = d_ + y * cmsg_step1 + x;\r
+        T* l = l_ + y * cmsg_step1 + x;\r
+        T* r = r_ + y * cmsg_step1 + x;\r
  \r
-            const T* disp = selected_disp_pyr_cur + y * cmsg_step1 + x;\r
+        const T* disp = selected_disp_pyr_cur + y * cmsg_step1 + x;\r
  \r
-            T* temp = (T*)ctemp + y * cmsg_step1 + x;\r
+        T* temp = (T*)ctemp + y * cmsg_step1 + x;\r
  \r
-            message_per_pixel(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp);\r
-            message_per_pixel(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp);\r
-            message_per_pixel(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp);\r
-            message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp);\r
-        }\r
+        message_per_pixel(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp);\r
+        message_per_pixel(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp);\r
+        message_per_pixel(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp);\r
+        message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp);\r
      }\r
+}\r
  \r
  \r
-    template<class T>\r
-    void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,\r
-        const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)\r
-    {\r
-        size_t disp_step = msg_step * h;\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );\r
+template<class T>\r
+void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,\r
+    const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)\r
+{\r
+    size_t disp_step = msg_step * h;\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );\r
  \r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(w, threads.x << 1);\r
-        grid.y = divUp(h, threads.y);\r
+    grid.x = divUp(w, threads.x << 1);\r
+    grid.y = divUp(h, threads.y);\r
  \r
-        for(int t = 0; t < iters; ++t)\r
-        {\r
-            compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);\r
-            cudaSafeCall( cudaGetLastError() );\r
+    for(int t = 0; t < iters; ++t)\r
+    {\r
+        compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);\r
+        cudaSafeCall( cudaGetLastError() );\r
  \r
-            if (stream == 0)\r
-                cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
-    \r
-    template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,\r
-        int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
+        if (stream == 0)\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+    }\r
+};\r
  \r
-    template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step, \r
-        int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
+template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,\r
+    int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
+\r
+template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step, \r
+    int h, int w, int nr_plane, int iters, cudaStream_t stream);\r
  \r
  \r
  ///////////////////////////////////////////////////////////////\r
@@ -830,66 +827,69 @@ namespace cv { namespace gpu { namespace csbp
  ///////////////////////////////////////////////////////////////\r
  \r
  \r
-    template <typename T>\r
-    __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,\r
-                                 const T* data_cost_selected, const T* disp_selected_pyr,\r
-                                 short* disp, size_t res_step, int cols, int rows, int nr_plane)\r
+template <typename T>\r
+__global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,\r
+                             const T* data_cost_selected, const T* disp_selected_pyr,\r
+                             short* disp, size_t res_step, int cols, int rows, int nr_plane)\r
+{\r
+    int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+    int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+    if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)\r
      {\r
-        int x = blockIdx.x * blockDim.x + threadIdx.x;\r
-        int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+        const T* data = data_cost_selected + y * cmsg_step1 + x;\r
+        const T* disp_selected = disp_selected_pyr + y * cmsg_step1 + x;\r
  \r
-        if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)\r
-        {\r
-            const T* data = data_cost_selected + y * cmsg_step1 + x;\r
-            const T* disp_selected = disp_selected_pyr + y * cmsg_step1 + x;\r
+        const T* u = u_ + (y+1) * cmsg_step1 + (x+0);\r
+        const T* d = d_ + (y-1) * cmsg_step1 + (x+0);\r
+        const T* l = l_ + (y+0) * cmsg_step1 + (x+1);\r
+        const T* r = r_ + (y+0) * cmsg_step1 + (x-1);\r
  \r
-            const T* u = u_ + (y+1) * cmsg_step1 + (x+0);\r
-            const T* d = d_ + (y-1) * cmsg_step1 + (x+0);\r
-            const T* l = l_ + (y+0) * cmsg_step1 + (x+1);\r
-            const T* r = r_ + (y+0) * cmsg_step1 + (x-1);\r
+        int best = 0;\r
+        T best_val = numeric_limits<T>::max();\r
+        for (int i = 0; i < nr_plane; ++i)\r
+        {\r
+            int idx = i * cdisp_step1;\r
+            T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];\r
  \r
-            int best = 0;\r
-            T best_val = numeric_limits<T>::max();\r
-            for (int i = 0; i < nr_plane; ++i)\r
+            if (val < best_val)\r
              {\r
-                int idx = i * cdisp_step1;\r
-                T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];\r
-\r
-                if (val < best_val)\r
-                {\r
-                    best_val = val;\r
-                    best = saturate_cast<short>(disp_selected[idx]);\r
-                }\r
+                best_val = val;\r
+                best = saturate_cast<short>(disp_selected[idx]);\r
              }\r
-            disp[res_step * y + x] = best;\r
          }\r
+        disp[res_step * y + x] = best;\r
      }\r
+}\r
  \r
-    template<class T>\r
-    void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,\r
-        const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)\r
-    {\r
-        size_t disp_step = disp.rows * msg_step;\r
-        cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
-        cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );\r
+template<class T>\r
+void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,\r
+    const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)\r
+{\r
+    size_t disp_step = disp.rows * msg_step;\r
+    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );\r
  \r
-        dim3 threads(32, 8, 1);\r
-        dim3 grid(1, 1, 1);\r
+    dim3 threads(32, 8, 1);\r
+    dim3 grid(1, 1, 1);\r
  \r
-        grid.x = divUp(disp.cols, threads.x);\r
-        grid.y = divUp(disp.rows, threads.y);\r
+    grid.x = divUp(disp.cols, threads.x);\r
+    grid.y = divUp(disp.rows, threads.y);\r
  \r
-        compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,\r
-                                                   disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,\r
+                                               disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    if (stream == 0)\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step, \r
+    const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
+\r
+template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,\r
+    const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
  \r
-    template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step, \r
-        const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
+} // namespace stereocsbp\r
  \r
-    template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,\r
-        const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu

index a6f26fd..afd81a6 100644 (file)
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -52,930 +52,951 @@
  #include "opencv2/gpu/device/functional.hpp"\r
  #include "opencv2/gpu/device/filters.hpp"\r
  \r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace surf {\r
+\r
+////////////////////////////////////////////////////////////////////////\r
+// Global parameters\r
+\r
+// The maximum number of features (before subpixel interpolation) that memory is reserved for.\r
+__constant__ int c_max_candidates;\r
+// The maximum number of features that memory is reserved for.\r
+__constant__ int c_max_features;\r
+// The image size.\r
+__constant__ int c_img_rows;\r
+__constant__ int c_img_cols;\r
+// The number of layers.\r
+__constant__ int c_nOctaveLayers;\r
+// The hessian threshold.\r
+__constant__ float c_hessianThreshold;\r
+\r
+// The current octave.\r
+__constant__ int c_octave;\r
+// The current layer size.\r
+__constant__ int c_layer_rows;\r
+__constant__ int c_layer_cols;\r
+\r
+void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );\r
+}\r
+\r
+void loadOctaveConstants(int octave, int layer_rows, int layer_cols)\r
+{\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );\r
+    cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );\r
+}\r
  \r
-#define CV_PI 3.1415926535897932384626433832795f\r
+////////////////////////////////////////////////////////////////////////\r
+// Integral image texture\r
  \r
-namespace cv { namespace gpu { namespace surf\r
+texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+\r
+void bindImgTex(DevMem2Db img)\r
  {\r
-    ////////////////////////////////////////////////////////////////////////\r
-    // Global parameters\r
-\r
-    // The maximum number of features (before subpixel interpolation) that memory is reserved for.\r
-    __constant__ int c_max_candidates;\r
-    // The maximum number of features that memory is reserved for.\r
-    __constant__ int c_max_features;\r
-    // The image size.\r
-    __constant__ int c_img_rows;\r
-    __constant__ int c_img_cols;\r
-    // The number of layers.\r
-    __constant__ int c_nOctaveLayers;\r
-    // The hessian threshold.\r
-    __constant__ float c_hessianThreshold;\r
-\r
-    // The current octave.\r
-    __constant__ int c_octave;\r
-    // The current layer size.\r
-    __constant__ int c_layer_rows;\r
-    __constant__ int c_layer_cols;\r
-\r
-    ////////////////////////////////////////////////////////////////////////\r
-    // Integral image texture\r
-\r
-    texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
-    texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
-\r
-    template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)\r
-    {\r
-        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200\r
-        typedef double real_t;        \r
-        #else\r
-        typedef float  real_t;\r
-        #endif\r
+    bindTexture(&imgTex, img);\r
+}\r
+void bindSumTex(DevMem2D_<uint> sum)\r
+{\r
+    bindTexture(&sumTex, sum);\r
+}\r
+void bindMaskSumTex(DevMem2D_<uint> maskSum)\r
+{\r
+    bindTexture(&maskSumTex, maskSum);\r
+}\r
  \r
-        float ratio = (float)newSize / oldSize;\r
-        \r
-        real_t d = 0;\r
+template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)\r
+{\r
+#if __CUDA_ARCH__ >= 200\r
+    typedef double real_t;        \r
+#else\r
+    typedef float  real_t;\r
+#endif\r
  \r
-        #pragma unroll\r
-        for (int k = 0; k < N; ++k)\r
-        {\r
-            int dx1 = __float2int_rn(ratio * src[k][0]);\r
-            int dy1 = __float2int_rn(ratio * src[k][1]);\r
-            int dx2 = __float2int_rn(ratio * src[k][2]);\r
-            int dy2 = __float2int_rn(ratio * src[k][3]);\r
-\r
-            real_t t = 0;\r
-            t += tex2D(sumTex, x + dx1, y + dy1);\r
-            t -= tex2D(sumTex, x + dx1, y + dy2);\r
-            t -= tex2D(sumTex, x + dx2, y + dy1);\r
-            t += tex2D(sumTex, x + dx2, y + dy2);\r
-\r
-            d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));\r
-        }\r
+    float ratio = (float)newSize / oldSize;\r
+    \r
+    real_t d = 0;\r
  \r
-        return (float)d;\r
+    #pragma unroll\r
+    for (int k = 0; k < N; ++k)\r
+    {\r
+        int dx1 = __float2int_rn(ratio * src[k][0]);\r
+        int dy1 = __float2int_rn(ratio * src[k][1]);\r
+        int dx2 = __float2int_rn(ratio * src[k][2]);\r
+        int dy2 = __float2int_rn(ratio * src[k][3]);\r
+\r
+        real_t t = 0;\r
+        t += tex2D(sumTex, x + dx1, y + dy1);\r
+        t -= tex2D(sumTex, x + dx1, y + dy2);\r
+        t -= tex2D(sumTex, x + dx2, y + dy1);\r
+        t += tex2D(sumTex, x + dx2, y + dy2);\r
+\r
+        d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));\r
      }\r
  \r
-    ////////////////////////////////////////////////////////////////////////\r
-    // Hessian\r
+    return (float)d;\r
+}\r
  \r
-    __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };\r
-    __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };\r
-    __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };\r
+////////////////////////////////////////////////////////////////////////\r
+// Hessian\r
  \r
-    __host__ __device__ __forceinline__ int calcSize(int octave, int layer)\r
-    {\r
-        /* Wavelet size at first layer of first octave. */\r
-        const int HAAR_SIZE0 = 9;\r
+__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };\r
+__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };\r
+__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };\r
+\r
+__host__ __device__ __forceinline__ int calcSize(int octave, int layer)\r
+{\r
+    /* Wavelet size at first layer of first octave. */\r
+    const int HAAR_SIZE0 = 9;\r
  \r
-        /* Wavelet size increment between layers. This should be an even number,\r
-         such that the wavelet sizes in an octave are either all even or all odd.\r
-         This ensures that when looking for the neighbours of a sample, the layers\r
-         above and below are aligned correctly. */\r
-        const int HAAR_SIZE_INC = 6;\r
+    /* Wavelet size increment between layers. This should be an even number,\r
+     such that the wavelet sizes in an octave are either all even or all odd.\r
+     This ensures that when looking for the neighbours of a sample, the layers\r
+     above and below are aligned correctly. */\r
+    const int HAAR_SIZE_INC = 6;\r
  \r
-        return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;\r
-    }\r
+    return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;\r
+}\r
  \r
-    __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)\r
-    {\r
-        // Determine the indices\r
-        const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);\r
-        const int blockIdx_y = blockIdx.y % gridDim_y;\r
-        const int blockIdx_z = blockIdx.y / gridDim_y;\r
+__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)\r
+{\r
+    // Determine the indices\r
+    const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);\r
+    const int blockIdx_y = blockIdx.y % gridDim_y;\r
+    const int blockIdx_z = blockIdx.y / gridDim_y;\r
  \r
-        const int j = threadIdx.x + blockIdx.x * blockDim.x;\r
-        const int i = threadIdx.y + blockIdx_y * blockDim.y;\r
-        const int layer = blockIdx_z;\r
+    const int j = threadIdx.x + blockIdx.x * blockDim.x;\r
+    const int i = threadIdx.y + blockIdx_y * blockDim.y;\r
+    const int layer = blockIdx_z;\r
  \r
-        const int size = calcSize(c_octave, layer);\r
+    const int size = calcSize(c_octave, layer);\r
  \r
-        const int samples_i = 1 + ((c_img_rows - size) >> c_octave);\r
-        const int samples_j = 1 + ((c_img_cols - size) >> c_octave);\r
+    const int samples_i = 1 + ((c_img_rows - size) >> c_octave);\r
+    const int samples_j = 1 + ((c_img_cols - size) >> c_octave);\r
  \r
-        // Ignore pixels where some of the kernel is outside the image\r
-        const int margin = (size >> 1) >> c_octave;\r
+    // Ignore pixels where some of the kernel is outside the image\r
+    const int margin = (size >> 1) >> c_octave;\r
  \r
-        if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)\r
-        {\r
-            const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, i << c_octave, j << c_octave);\r
-            const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, i << c_octave, j << c_octave);\r
-            const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, i << c_octave, j << c_octave);\r
+    if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)\r
+    {\r
+        const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, i << c_octave, j << c_octave);\r
+        const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, i << c_octave, j << c_octave);\r
+        const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, i << c_octave, j << c_octave);\r
  \r
-            det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;\r
-            trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;\r
-        }\r
+        det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;\r
+        trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;\r
      }\r
+}\r
  \r
-    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)\r
-    {\r
-        const int min_size = calcSize(octave, 0);\r
-        const int max_samples_i = 1 + ((img_rows - min_size) >> octave);\r
-        const int max_samples_j = 1 + ((img_cols - min_size) >> octave);\r
+void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)\r
+{\r
+    const int min_size = calcSize(octave, 0);\r
+    const int max_samples_i = 1 + ((img_rows - min_size) >> octave);\r
+    const int max_samples_j = 1 + ((img_cols - min_size) >> octave);\r
  \r
-        dim3 threads(16, 16);\r
+    dim3 threads(16, 16);\r
  \r
-        dim3 grid;\r
-        grid.x = divUp(max_samples_j, threads.x);\r
-        grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);\r
+    dim3 grid;\r
+    grid.x = divUp(max_samples_j, threads.x);\r
+    grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);\r
  \r
-        icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);\r
-        cudaSafeCall( cudaGetLastError() );\r
+    icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-    ////////////////////////////////////////////////////////////////////////\r
-    // NONMAX\r
-    \r
-    struct WithOutMask\r
-    {\r
-        static __device__ __forceinline__ bool check(int, int, int)\r
-        {\r
-            return true;\r
-        }\r
-    };\r
+////////////////////////////////////////////////////////////////////////\r
+// NONMAX\r
  \r
-    __constant__ float c_DM[5] = {0, 0, 9, 9, 1};\r
+__constant__ float c_DM[5] = {0, 0, 9, 9, 1};\r
  \r
-    struct WithMask\r
+struct WithMask\r
+{\r
+    static __device__ bool check(int sum_i, int sum_j, int size)\r
      {\r
-        static __device__ bool check(int sum_i, int sum_j, int size)\r
-        {\r
-            float ratio = (float)size / 9.0f;\r
-            \r
-            float d = 0;\r
+        float ratio = (float)size / 9.0f;\r
+        \r
+        float d = 0;\r
  \r
-            int dx1 = __float2int_rn(ratio * c_DM[0]);\r
-            int dy1 = __float2int_rn(ratio * c_DM[1]);\r
-            int dx2 = __float2int_rn(ratio * c_DM[2]);\r
-            int dy2 = __float2int_rn(ratio * c_DM[3]);\r
+        int dx1 = __float2int_rn(ratio * c_DM[0]);\r
+        int dy1 = __float2int_rn(ratio * c_DM[1]);\r
+        int dx2 = __float2int_rn(ratio * c_DM[2]);\r
+        int dy2 = __float2int_rn(ratio * c_DM[3]);\r
  \r
-            float t = 0;\r
-            t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);\r
-            t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);\r
-            t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);\r
-            t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);\r
+        float t = 0;\r
+        t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);\r
+        t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);\r
+        t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);\r
+        t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);\r
  \r
-            d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));\r
+        d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));\r
  \r
-            return (d >= 0.5f);\r
-        }\r
-    };\r
+        return (d >= 0.5f);\r
+    }\r
+};\r
  \r
-    template <typename Mask>\r
-    __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)\r
-    {\r
-        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+template <typename Mask>\r
+__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)\r
+{\r
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
  \r
-        extern __shared__ float N9[];\r
+    extern __shared__ float N9[];\r
  \r
-        // The hidx variables are the indices to the hessian buffer.\r
-        const int gridDim_y = gridDim.y / c_nOctaveLayers;\r
-        const int blockIdx_y = blockIdx.y % gridDim_y;\r
-        const int blockIdx_z = blockIdx.y / gridDim_y;\r
+    // The hidx variables are the indices to the hessian buffer.\r
+    const int gridDim_y = gridDim.y / c_nOctaveLayers;\r
+    const int blockIdx_y = blockIdx.y % gridDim_y;\r
+    const int blockIdx_z = blockIdx.y / gridDim_y;\r
  \r
-        const int layer = blockIdx_z + 1;\r
+    const int layer = blockIdx_z + 1;\r
  \r
-        const int size = calcSize(c_octave, layer);\r
+    const int size = calcSize(c_octave, layer);\r
  \r
-        // Ignore pixels without a 3x3x3 neighbourhood in the layer above\r
-        const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;\r
+    // Ignore pixels without a 3x3x3 neighbourhood in the layer above\r
+    const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;\r
  \r
-        const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;\r
-        const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;\r
+    const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;\r
+    const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;\r
  \r
-        // Is this thread within the hessian buffer?\r
-        const int zoff = blockDim.x * blockDim.y;\r
-        const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;\r
-        N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1))[min(max(j, 0), c_img_cols - 1)];\r
-        N9[localLin       ] = det.ptr(c_layer_rows * (layer    ) + min(max(i, 0), c_img_rows - 1))[min(max(j, 0), c_img_cols - 1)];\r
-        N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1))[min(max(j, 0), c_img_cols - 1)];\r
-        __syncthreads();\r
+    // Is this thread within the hessian buffer?\r
+    const int zoff = blockDim.x * blockDim.y;\r
+    const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;\r
+    N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
+    N9[localLin       ] = det.ptr(c_layer_rows * (layer    ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
+    N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];\r
+    __syncthreads();\r
+\r
+    if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)\r
+    {\r
+        float val0 = N9[localLin];\r
  \r
-        if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)\r
+        if (val0 > c_hessianThreshold)\r
          {\r
-            float val0 = N9[localLin];\r
+            // Coordinates for the start of the wavelet in the sum image. There\r
+            // is some integer division involved, so don't try to simplify this\r
+            // (cancel out sampleStep) without checking the result is the same\r
+            const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;\r
+            const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;\r
  \r
-            if (val0 > c_hessianThreshold)\r
+            if (Mask::check(sum_i, sum_j, size))\r
              {\r
-                // Coordinates for the start of the wavelet in the sum image. There\r
-                // is some integer division involved, so don't try to simplify this\r
-                // (cancel out sampleStep) without checking the result is the same\r
-                const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;\r
-                const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;\r
-\r
-                if (Mask::check(sum_i, sum_j, size))\r
+                // Check to see if we have a max (in its 26 neighbours)\r
+                const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]\r
+                &&                   val0 > N9[localLin     - blockDim.x - zoff]\r
+                &&                   val0 > N9[localLin + 1 - blockDim.x - zoff]\r
+                &&                   val0 > N9[localLin - 1              - zoff]\r
+                &&                   val0 > N9[localLin                  - zoff]\r
+                &&                   val0 > N9[localLin + 1              - zoff]\r
+                &&                   val0 > N9[localLin - 1 + blockDim.x - zoff]\r
+                &&                   val0 > N9[localLin     + blockDim.x - zoff]\r
+                &&                   val0 > N9[localLin + 1 + blockDim.x - zoff]\r
+\r
+                &&                   val0 > N9[localLin - 1 - blockDim.x]\r
+                &&                   val0 > N9[localLin     - blockDim.x]\r
+                &&                   val0 > N9[localLin + 1 - blockDim.x]\r
+                &&                   val0 > N9[localLin - 1             ]\r
+                &&                   val0 > N9[localLin + 1             ]\r
+                &&                   val0 > N9[localLin - 1 + blockDim.x]\r
+                &&                   val0 > N9[localLin     + blockDim.x]\r
+                &&                   val0 > N9[localLin + 1 + blockDim.x]\r
+\r
+                &&                   val0 > N9[localLin - 1 - blockDim.x + zoff]\r
+                &&                   val0 > N9[localLin     - blockDim.x + zoff]\r
+                &&                   val0 > N9[localLin + 1 - blockDim.x + zoff]\r
+                &&                   val0 > N9[localLin - 1              + zoff]\r
+                &&                   val0 > N9[localLin                  + zoff]\r
+                &&                   val0 > N9[localLin + 1              + zoff]\r
+                &&                   val0 > N9[localLin - 1 + blockDim.x + zoff]\r
+                &&                   val0 > N9[localLin     + blockDim.x + zoff]\r
+                &&                   val0 > N9[localLin + 1 + blockDim.x + zoff]\r
+                ;\r
+\r
+                if(condmax)\r
                  {\r
-                    // Check to see if we have a max (in its 26 neighbours)\r
-                    const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]\r
-                    &&                   val0 > N9[localLin     - blockDim.x - zoff]\r
-                    &&                   val0 > N9[localLin + 1 - blockDim.x - zoff]\r
-                    &&                   val0 > N9[localLin - 1              - zoff]\r
-                    &&                   val0 > N9[localLin                  - zoff]\r
-                    &&                   val0 > N9[localLin + 1              - zoff]\r
-                    &&                   val0 > N9[localLin - 1 + blockDim.x - zoff]\r
-                    &&                   val0 > N9[localLin     + blockDim.x - zoff]\r
-                    &&                   val0 > N9[localLin + 1 + blockDim.x - zoff]\r
-\r
-                    &&                   val0 > N9[localLin - 1 - blockDim.x]\r
-                    &&                   val0 > N9[localLin     - blockDim.x]\r
-                    &&                   val0 > N9[localLin + 1 - blockDim.x]\r
-                    &&                   val0 > N9[localLin - 1             ]\r
-                    &&                   val0 > N9[localLin + 1             ]\r
-                    &&                   val0 > N9[localLin - 1 + blockDim.x]\r
-                    &&                   val0 > N9[localLin     + blockDim.x]\r
-                    &&                   val0 > N9[localLin + 1 + blockDim.x]\r
-\r
-                    &&                   val0 > N9[localLin - 1 - blockDim.x + zoff]\r
-                    &&                   val0 > N9[localLin     - blockDim.x + zoff]\r
-                    &&                   val0 > N9[localLin + 1 - blockDim.x + zoff]\r
-                    &&                   val0 > N9[localLin - 1              + zoff]\r
-                    &&                   val0 > N9[localLin                  + zoff]\r
-                    &&                   val0 > N9[localLin + 1              + zoff]\r
-                    &&                   val0 > N9[localLin - 1 + blockDim.x + zoff]\r
-                    &&                   val0 > N9[localLin     + blockDim.x + zoff]\r
-                    &&                   val0 > N9[localLin + 1 + blockDim.x + zoff]\r
-                    ;\r
-\r
-                    if(condmax)\r
-                    {\r
-                        unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);\r
+                    unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);\r
  \r
-                        if (ind < c_max_candidates)\r
-                        {\r
-                            const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);\r
+                    if (ind < c_max_candidates)\r
+                    {\r
+                        const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);\r
  \r
-                            maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);\r
-                        }\r
+                        maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);\r
                      }\r
                  }\r
              }\r
          }\r
-\r
-        #endif\r
      }\r
  \r
-    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
-        int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)\r
-    {\r
-        const int layer_rows = img_rows >> octave;\r
-        const int layer_cols = img_cols >> octave;\r
+    #endif\r
+}\r
  \r
-        const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;\r
+void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
+    int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)\r
+{\r
+    const int layer_rows = img_rows >> octave;\r
+    const int layer_cols = img_cols >> octave;\r
  \r
-        dim3 threads(16, 16);\r
+    const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;\r
  \r
-        dim3 grid;\r
-        grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);\r
-        grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;\r
+    dim3 threads(16, 16);\r
  \r
-        const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);\r
+    dim3 grid;\r
+    grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);\r
+    grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;\r
  \r
-        if (use_mask)\r
-            icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
-        else\r
-            icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
+    const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);\r
  \r
-        cudaSafeCall( cudaGetLastError() );\r
+    if (use_mask)\r
+        icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
+    else\r
+        icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);\r
  \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-    ////////////////////////////////////////////////////////////////////////\r
-    // INTERPOLATION\r
-    \r
-    __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,\r
-        float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,\r
-        unsigned int* featureCounter)\r
-    {\r
-        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+    cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
+\r
+////////////////////////////////////////////////////////////////////////\r
+// INTERPOLATION\r
  \r
-        const int4 maxPos = maxPosBuffer[blockIdx.x];\r
+__global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,\r
+    float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,\r
+    unsigned int* featureCounter)\r
+{\r
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
  \r
-        const int j = maxPos.x - 1 + threadIdx.x;\r
-        const int i = maxPos.y - 1 + threadIdx.y;\r
-        const int layer = maxPos.z - 1 + threadIdx.z;\r
+    const int4 maxPos = maxPosBuffer[blockIdx.x];\r
  \r
-        __shared__ float N9[3][3][3];\r
+    const int j = maxPos.x - 1 + threadIdx.x;\r
+    const int i = maxPos.y - 1 + threadIdx.y;\r
+    const int layer = maxPos.z - 1 + threadIdx.z;\r
  \r
-        N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];\r
-        __syncthreads();\r
+    __shared__ float N9[3][3][3];\r
  \r
-        if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)\r
+    N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];\r
+    __syncthreads();\r
+\r
+    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)\r
+    {\r
+        __shared__ float dD[3];\r
+\r
+        //dx\r
+        dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);\r
+        //dy\r
+        dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);\r
+        //ds\r
+        dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);\r
+\r
+        __shared__ float H[3][3];\r
+\r
+        //dxx\r
+        H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];\r
+        //dxy\r
+        H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);\r
+        //dxs\r
+        H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);\r
+        //dyx = dxy\r
+        H[1][0] = H[0][1];\r
+        //dyy\r
+        H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];\r
+        //dys\r
+        H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);\r
+        //dsx = dxs\r
+        H[2][0] = H[0][2];\r
+        //dsy = dys\r
+        H[2][1] = H[1][2];\r
+        //dss\r
+        H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];\r
+\r
+        __shared__ float x[3];\r
+\r
+        if (solve3x3(H, dD, x))\r
          {\r
-            __shared__ float dD[3];\r
-\r
-            //dx\r
-            dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);\r
-            //dy\r
-            dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);\r
-            //ds\r
-            dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);\r
-\r
-            __shared__ float H[3][3];\r
-\r
-            //dxx\r
-            H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];\r
-            //dxy\r
-            H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);\r
-            //dxs\r
-            H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);\r
-            //dyx = dxy\r
-            H[1][0] = H[0][1];\r
-            //dyy\r
-            H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];\r
-            //dys\r
-            H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);\r
-            //dsx = dxs\r
-            H[2][0] = H[0][2];\r
-            //dsy = dys\r
-            H[2][1] = H[1][2];\r
-            //dss\r
-            H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];\r
-\r
-            __shared__ float x[3];\r
-\r
-            if (solve3x3(H, dD, x))\r
+            if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)\r
              {\r
-                if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)\r
-                {\r
-                    // if the step is within the interpolation region, perform it\r
-                    \r
-                    const int size = calcSize(c_octave, maxPos.z);\r
-\r
-                    const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;\r
-                    const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;\r
-                    \r
-                    const float center_i = sum_i + (float)(size - 1) / 2;\r
-                    const float center_j = sum_j + (float)(size - 1) / 2;\r
-\r
-                    const float px = center_j + x[0] * (1 << c_octave);\r
-                    const float py = center_i + x[1] * (1 << c_octave);\r
-\r
-                    const int ds = size - calcSize(c_octave, maxPos.z - 1);\r
-                    const float psize = roundf(size + x[2] * ds);\r
-\r
-                    /* The sampling intervals and wavelet sized for selecting an orientation\r
-                     and building the keypoint descriptor are defined relative to 's' */\r
-                    const float s = psize * 1.2f / 9.0f;\r
-\r
-                    /* To find the dominant orientation, the gradients in x and y are\r
-                     sampled in a circle of radius 6s using wavelets of size 4s.\r
-                     We ensure the gradient wavelet size is even to ensure the\r
-                     wavelet pattern is balanced and symmetric around its center */\r
-                    const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
-\r
-                    // check when grad_wav_size is too big\r
-                    if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
-                    {\r
-                        // Get a new feature index.\r
-                        unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);\r
-\r
-                        if (ind < c_max_features)\r
-                        {\r
-                            featureX[ind] = px;\r
-                            featureY[ind] = py;\r
-                            featureLaplacian[ind] = maxPos.w;\r
-                            featureSize[ind] = psize;\r
-                            featureHessian[ind] = N9[1][1][1];\r
-                        }\r
-                    } // grad_wav_size check\r
-                } // If the subpixel interpolation worked\r
-            }\r
-        } // If this is thread 0.\r
+                // if the step is within the interpolation region, perform it\r
+                \r
+                const int size = calcSize(c_octave, maxPos.z);\r
  \r
-        #endif\r
-    }\r
+                const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;\r
+                const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;\r
+                \r
+                const float center_i = sum_i + (float)(size - 1) / 2;\r
+                const float center_j = sum_j + (float)(size - 1) / 2;\r
  \r
-    void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, \r
-        float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, \r
-        unsigned int* featureCounter)\r
-    {\r
-        dim3 threads;\r
-        threads.x = 3;\r
-        threads.y = 3;\r
-        threads.z = 3;\r
+                const float px = center_j + x[0] * (1 << c_octave);\r
+                const float py = center_i + x[1] * (1 << c_octave);\r
  \r
-        dim3 grid;\r
-        grid.x = maxCounter;\r
+                const int ds = size - calcSize(c_octave, maxPos.z - 1);\r
+                const float psize = roundf(size + x[2] * ds);\r
  \r
-        icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);\r
-        cudaSafeCall( cudaGetLastError() );\r
+                /* The sampling intervals and wavelet sized for selecting an orientation\r
+                 and building the keypoint descriptor are defined relative to 's' */\r
+                const float s = psize * 1.2f / 9.0f;\r
  \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
+                /* To find the dominant orientation, the gradients in x and y are\r
+                 sampled in a circle of radius 6s using wavelets of size 4s.\r
+                 We ensure the gradient wavelet size is even to ensure the\r
+                 wavelet pattern is balanced and symmetric around its center */\r
+                const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
  \r
-    ////////////////////////////////////////////////////////////////////////\r
-    // Orientation\r
+                // check when grad_wav_size is too big\r
+                if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
+                {\r
+                    // Get a new feature index.\r
+                    unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);\r
  \r
-    #define ORI_SEARCH_INC 5\r
-    #define ORI_WIN        60\r
-    #define ORI_SAMPLES    113\r
+                    if (ind < c_max_features)\r
+                    {\r
+                        featureX[ind] = px;\r
+                        featureY[ind] = py;\r
+                        featureLaplacian[ind] = maxPos.w;\r
+                        featureSize[ind] = psize;\r
+                        featureHessian[ind] = N9[1][1][1];\r
+                    }\r
+                } // grad_wav_size check\r
+            } // If the subpixel interpolation worked\r
+        }\r
+    } // If this is thread 0.\r
  \r
-    __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};\r
-    __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};\r
-    __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 
0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};\r
-    \r
-    __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};\r
-    __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};\r
+    #endif\r
+}\r
  \r
-    __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)\r
-    {        \r
-        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, \r
+    float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, \r
+    unsigned int* featureCounter)\r
+{\r
+    dim3 threads;\r
+    threads.x = 3;\r
+    threads.y = 3;\r
+    threads.z = 3;\r
  \r
-        __shared__ float s_X[128];\r
-        __shared__ float s_Y[128];\r
-        __shared__ float s_angle[128];\r
+    dim3 grid;\r
+    grid.x = maxCounter;\r
  \r
-        __shared__ float s_sum[32 * 4];\r
+    icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        /* The sampling intervals and wavelet sized for selecting an orientation\r
-         and building the keypoint descriptor are defined relative to 's' */\r
-        const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;\r
+    cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-        /* To find the dominant orientation, the gradients in x and y are\r
-         sampled in a circle of radius 6s using wavelets of size 4s.\r
-         We ensure the gradient wavelet size is even to ensure the\r
-         wavelet pattern is balanced and symmetric around its center */\r
-        const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
+////////////////////////////////////////////////////////////////////////\r
+// Orientation\r
  \r
-        // check when grad_wav_size is too big\r
-        if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
-        {\r
-            // Calc X, Y, angle and store it to shared memory\r
-            const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+#define ORI_SEARCH_INC 5\r
+#define ORI_WIN        60\r
+#define ORI_SAMPLES    113\r
  \r
-            float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
+__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};\r
+__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};\r
+__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 
0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};\r
  \r
-            if (tid < ORI_SAMPLES)\r
-            {\r
-                const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
-                const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);\r
-                const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);\r
+__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};\r
+__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};\r
  \r
-                if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
-                {\r
-                    X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);\r
-                    Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);\r
-                \r
-                    angle = atan2f(Y, X);\r
-                    if (angle < 0)\r
-                        angle += 2.0f * CV_PI;\r
-                    angle *= 180.0f / CV_PI;\r
-                }\r
-            }\r
-            s_X[tid] = X;\r
-            s_Y[tid] = Y;\r
-            s_angle[tid] = angle;\r
-            __syncthreads();\r
+__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)\r
+{        \r
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
  \r
-            float bestx = 0, besty = 0, best_mod = 0;\r
+    __shared__ float s_X[128];\r
+    __shared__ float s_Y[128];\r
+    __shared__ float s_angle[128];\r
  \r
-            #pragma unroll\r
-            for (int i = 0; i < 18; ++i)\r
-            {\r
-                const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;\r
+    __shared__ float s_sum[32 * 4];\r
  \r
-                float sumx = 0.0f, sumy = 0.0f;\r
-                int d = abs(__float2int_rn(s_angle[threadIdx.x]) - dir);\r
-                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
-                {\r
-                    sumx = s_X[threadIdx.x];\r
-                    sumy = s_Y[threadIdx.x];\r
-                }\r
-                d = abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);\r
-                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
-                {\r
-                    sumx += s_X[threadIdx.x + 32];\r
-                    sumy += s_Y[threadIdx.x + 32];\r
-                }\r
-                d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);\r
-                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
-                {\r
-                    sumx += s_X[threadIdx.x + 64];\r
-                    sumy += s_Y[threadIdx.x + 64];\r
-                }\r
-                d = abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);\r
-                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
-                {\r
-                    sumx += s_X[threadIdx.x + 96];\r
-                    sumy += s_Y[threadIdx.x + 96];\r
-                }\r
+    /* The sampling intervals and wavelet sizes for selecting an orientation\r
+     and building the keypoint descriptor are defined relative to 's' */\r
+    const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;\r
  \r
-                float* s_sum_row = s_sum + threadIdx.y * 32;\r
+    /* To find the dominant orientation, the gradients in x and y are\r
+     sampled in a circle of radius 6s using wavelets of size 4s.\r
+     We keep the gradient wavelet size even so that the\r
+     wavelet pattern is balanced and symmetric around its center */\r
+    const int grad_wav_size = 2 * __float2int_rn(2.0f * s);\r
  \r
-                reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());\r
-                reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());\r
+    // proceed only when grad_wav_size fits within (c_img_rows + 1) x (c_img_cols + 1)\r
+    if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
+    {\r
+        // Calc X, Y, angle and store them to shared memory\r
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
  \r
-                const float temp_mod = sumx * sumx + sumy * sumy;\r
-                if (temp_mod > best_mod)\r
-                {\r
-                    best_mod = temp_mod;\r
-                    bestx = sumx;\r
-                    besty = sumy;\r
-                }\r
-            }\r
+        float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
+\r
+        if (tid < ORI_SAMPLES)\r
+        {\r
+            const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
+            const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);\r
+            const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);\r
  \r
-            if (threadIdx.x == 0)\r
+            if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
              {\r
-                s_X[threadIdx.y] = bestx;\r
-                s_Y[threadIdx.y] = besty;\r
-                s_angle[threadIdx.y] = best_mod;\r
+                X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);\r
+                Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);\r
+            \r
+                angle = atan2f(Y, X);\r
+                if (angle < 0)\r
+                    angle += 2.0f * CV_PI;\r
+                angle *= 180.0f / CV_PI;\r
              }\r
-            __syncthreads();\r
+        }\r
+        s_X[tid] = X;\r
+        s_Y[tid] = Y;\r
+        s_angle[tid] = angle;\r
+        __syncthreads();\r
  \r
-            if (threadIdx.x < 2 && threadIdx.y == 0)\r
-            {\r
-                volatile float* v_x = s_X;\r
-                volatile float* v_y = s_Y;\r
-                volatile float* v_mod = s_angle;\r
+        float bestx = 0, besty = 0, best_mod = 0;\r
  \r
-                bestx = v_x[threadIdx.x];\r
-                besty = v_y[threadIdx.x];\r
-                best_mod = v_mod[threadIdx.x];\r
+        #pragma unroll\r
+        for (int i = 0; i < 18; ++i)\r
+        {\r
+            const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;\r
  \r
-                float temp_mod = v_mod[threadIdx.x + 2];\r
-                if (temp_mod > best_mod)\r
-                {\r
-                    v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];\r
-                    v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];\r
-                    v_mod[threadIdx.x] = best_mod = temp_mod;\r
-                }\r
-                temp_mod = v_mod[threadIdx.x + 1];\r
-                if (temp_mod > best_mod)\r
-                {\r
-                    v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];\r
-                    v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];\r
-                }\r
+            float sumx = 0.0f, sumy = 0.0f;\r
+            int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);\r
+            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+            {\r
+                sumx = s_X[threadIdx.x];\r
+                sumy = s_Y[threadIdx.x];\r
              }\r
-\r
-            if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)\r
+            d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);\r
+            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+            {\r
+                sumx += s_X[threadIdx.x + 32];\r
+                sumy += s_Y[threadIdx.x + 32];\r
+            }\r
+            d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);\r
+            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+            {\r
+                sumx += s_X[threadIdx.x + 64];\r
+                sumy += s_Y[threadIdx.x + 64];\r
+            }\r
+            d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);\r
+            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
              {\r
-                float kp_dir = atan2f(besty, bestx);\r
-                if (kp_dir < 0)\r
-                    kp_dir += 2.0f * CV_PI;\r
-                kp_dir *= 180.0f / CV_PI;\r
+                sumx += s_X[threadIdx.x + 96];\r
+                sumy += s_Y[threadIdx.x + 96];\r
+            }\r
+\r
+            float* s_sum_row = s_sum + threadIdx.y * 32;\r
  \r
-                featureDir[blockIdx.x] = kp_dir;\r
+            device::reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());\r
+            device::reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());\r
+\r
+            const float temp_mod = sumx * sumx + sumy * sumy;\r
+            if (temp_mod > best_mod)\r
+            {\r
+                best_mod = temp_mod;\r
+                bestx = sumx;\r
+                besty = sumy;\r
              }\r
          }\r
  \r
-        #endif\r
-    }\r
+        if (threadIdx.x == 0)\r
+        {\r
+            s_X[threadIdx.y] = bestx;\r
+            s_Y[threadIdx.y] = besty;\r
+            s_angle[threadIdx.y] = best_mod;\r
+        }\r
+        __syncthreads();\r
  \r
-    #undef ORI_SEARCH_INC\r
-    #undef ORI_WIN\r
-    #undef ORI_SAMPLES\r
+        if (threadIdx.x < 2 && threadIdx.y == 0)\r
+        {\r
+            volatile float* v_x = s_X;\r
+            volatile float* v_y = s_Y;\r
+            volatile float* v_mod = s_angle;\r
  \r
-    void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) \r
-    {\r
-        dim3 threads;\r
-        threads.x = 32;\r
-        threads.y = 4;\r
+            bestx = v_x[threadIdx.x];\r
+            besty = v_y[threadIdx.x];\r
+            best_mod = v_mod[threadIdx.x];\r
  \r
-        dim3 grid;\r
-        grid.x = nFeatures;\r
+            float temp_mod = v_mod[threadIdx.x + 2];\r
+            if (temp_mod > best_mod)\r
+            {\r
+                v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];\r
+                v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];\r
+                v_mod[threadIdx.x] = best_mod = temp_mod;\r
+            }\r
+            temp_mod = v_mod[threadIdx.x + 1];\r
+            if (temp_mod > best_mod)\r
+            {\r
+                v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];\r
+                v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];\r
+            }\r
+        }\r
  \r
-        icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);\r
-        cudaSafeCall( cudaGetLastError() );\r
+        if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)\r
+        {\r
+            float kp_dir = atan2f(besty, bestx);\r
+            if (kp_dir < 0)\r
+                kp_dir += 2.0f * CV_PI;\r
+            kp_dir *= 180.0f / CV_PI;\r
  \r
-        cudaSafeCall( cudaDeviceSynchronize() );\r
+            featureDir[blockIdx.x] = kp_dir;\r
+        }\r
      }\r
  \r
-    ////////////////////////////////////////////////////////////////////////\r
-    // Descriptors\r
+    #endif\r
+}\r
  \r
-    #define PATCH_SZ 20\r
+#undef ORI_SEARCH_INC\r
+#undef ORI_WIN\r
+#undef ORI_SAMPLES\r
  \r
-    texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
+void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) \r
+{\r
+    dim3 threads;\r
+    threads.x = 32;\r
+    threads.y = 4;\r
  \r
-    __constant__ float c_DW[PATCH_SZ * PATCH_SZ] = \r
-    {\r
-        3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, \r
-        8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
-        1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
-        3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
-        5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
-        9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
-        0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
-        0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
-        0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
-        0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
-        0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
-        0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
-        0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
-        0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
-        9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
-        5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
-        3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
-        1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
-        8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
-        3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f\r
-    };\r
-\r
-    struct WinReader\r
-    {\r
-        typedef uchar elem_type;\r
+    dim3 grid;\r
+    grid.x = nFeatures;\r
  \r
-        __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) : \r
-            centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)\r
-        {\r
-        }\r
+    icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);\r
+    cudaSafeCall( cudaGetLastError() );\r
  \r
-        __device__ __forceinline__ uchar operator ()(int i, int j) const\r
-        {\r
-            float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
-            float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
+    cudaSafeCall( cudaDeviceSynchronize() );\r
+}\r
  \r
-            return tex2D(imgTex, pixel_x, pixel_y);\r
-        }\r
+////////////////////////////////////////////////////////////////////////\r
+// Descriptors\r
+\r
+#define PATCH_SZ 20\r
  \r
-        float centerX; \r
-        float centerY;\r
-        float win_offset; \r
-        float cos_dir; \r
-        float sin_dir;\r
-    };\r
+__constant__ float c_DW[PATCH_SZ * PATCH_SZ] = \r
+{\r
+    3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, \r
+    8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
+    1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
+    3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
+    5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
+    9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
+    0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
+    0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
+    0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
+    0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
+    0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, \r
+    0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, \r
+    0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, \r
+    0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, \r
+    9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, \r
+    5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, \r
+    3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, \r
+    1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, \r
+    8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, \r
+    3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f\r
+};\r
+\r
+struct WinReader\r
+{ /* Pixel accessor: maps window indices (i, j) to image coordinates by adding win_offset to each index, rotating with (cos_dir, sin_dir) and translating to (centerX, centerY), then fetches that position from the imgTex texture. */\r
+    typedef uchar elem_type; /* type of one sample, same as the return type of operator() */\r
  \r
-    __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], \r
-        const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
+    __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) : \r
+        centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_) /* arguments are stored unchanged */\r
      {\r
-        __shared__ float s_PATCH[6][6];\r
+    }\r
  \r
-        const float centerX = featureX[blockIdx.x];\r
-        const float centerY = featureY[blockIdx.x];\r
-        const float size = featureSize[blockIdx.x];\r
-        const float descriptor_dir = featureDir[blockIdx.x] * (float)(CV_PI / 180);\r
+    __device__ __forceinline__ uchar operator ()(int i, int j) const /* i is the window row index, j the window column index */\r
+    {\r
+        float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
+        float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
  \r
-        /* The sampling intervals and wavelet sized for selecting an orientation\r
-         and building the keypoint descriptor are defined relative to 's' */\r
-        const float s = size * 1.2f / 9.0f;\r
+        return tex2D(imgTex, pixel_x, pixel_y); /* NOTE(review): filtering and addressing depend on how imgTex is configured, which is not visible here */\r
+    }\r
  \r
-        /* Extract a window of pixels around the keypoint of size 20s */\r
-        const int win_size = (int)((PATCH_SZ + 1) * s);\r
+    float centerX; \r
+    float centerY;\r
+    float win_offset; \r
+    float cos_dir; \r
+    float sin_dir;\r
+};\r
  \r
-        float sin_dir;\r
-        float cos_dir;\r
-        sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
+__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], \r
+    const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
+{ /* Fills s_dx_bin and s_dy_bin (25 entries each) with weighted x and y differences of a 5x5 group of samples; blockIdx.x selects the feature and blockIdx.y the sub-region. */\r
+    __shared__ float s_PATCH[6][6]; /* one sample per thread, indexed [threadIdx.y][threadIdx.x]; this presumes a block of at most 6x6 threads */\r
  \r
-        /* Nearest neighbour version (faster) */\r
-        const float win_offset = -(float)(win_size - 1) / 2; \r
+    const float centerX = featureX[blockIdx.x];\r
+    const float centerY = featureY[blockIdx.x];\r
+    const float size = featureSize[blockIdx.x];\r
+    const float descriptor_dir = featureDir[blockIdx.x] * (float)(CV_PI / 180); /* scaled by pi / 180, i.e. the degrees-to-radians factor */\r
  \r
-        // Compute sampling points\r
-        // since grids are 2D, need to compute xBlock and yBlock indices\r
-        const int xBlock = (blockIdx.y & 3);  // blockIdx.y % 4\r
-        const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
-        const int xIndex = xBlock * 5 + threadIdx.x;\r
-        const int yIndex = yBlock * 5 + threadIdx.y;\r
+    /* The sampling intervals and wavelet sizes for selecting an orientation\r
+     and building the keypoint descriptor are defined relative to 's' */\r
+    const float s = size * 1.2f / 9.0f;\r
  \r
-        const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;\r
-        const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;\r
+    /* Extract a window of pixels around the keypoint of size 20s */\r
+    const int win_size = (int)((PATCH_SZ + 1) * s);\r
  \r
-        LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));\r
+    float sin_dir;\r
+    float cos_dir;\r
+    sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
  \r
-        s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);\r
+    /* Nearest neighbour version (faster) */\r
+    const float win_offset = -(float)(win_size - 1) / 2; \r
  \r
-        __syncthreads();\r
+    // Compute sampling points\r
+    // since grids are 2D, need to compute xBlock and yBlock indices\r
+    const int xBlock = (blockIdx.y & 3);  // blockIdx.y % 4\r
+    const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
+    const int xIndex = xBlock * 5 + threadIdx.x;\r
+    const int yIndex = yBlock * 5 + threadIdx.y;\r
  \r
-        if (threadIdx.x < 5 && threadIdx.y < 5)\r
-        {\r
-            const int tid = threadIdx.y * 5 + threadIdx.x;\r
+    const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; /* row coordinate inside the window, scaled from sample index to window pixels */\r
+    const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; /* column coordinate inside the window, scaled the same way */\r
  \r
-            const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
+    LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));\r
  \r
-            const float vx = (s_PATCH[threadIdx.y    ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x    ]) * dw;\r
-            const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x    ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y    ][threadIdx.x + 1]) * dw;\r
+    s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo); /* every thread of the block stores one filtered sample */\r
  \r
-            s_dx_bin[tid] = vx;\r
-            s_dy_bin[tid] = vy;\r
-        }\r
-    }\r
+    __syncthreads(); /* all samples must be stored before the neighbour reads below */\r
  \r
-    __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)\r
+    if (threadIdx.x < 5 && threadIdx.y < 5) /* threads with index 5 only supply the +1 neighbours read below */\r
      {\r
-        // first step is to reduce from 25 to 16\r
-        if (tid < 9) // use 9 threads\r
-        {\r
-            sdata1[tid] += sdata1[tid + 16];\r
-            sdata2[tid] += sdata2[tid + 16];\r
-            sdata3[tid] += sdata3[tid + 16];\r
-            sdata4[tid] += sdata4[tid + 16];\r
-        }\r
+        const int tid = threadIdx.y * 5 + threadIdx.x; /* 0..24, row-major inside the 5x5 group */\r
  \r
-        // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)\r
-        if (tid < 8)\r
-        {\r
-            sdata1[tid] += sdata1[tid + 8];\r
-            sdata1[tid] += sdata1[tid + 4];\r
-            sdata1[tid] += sdata1[tid + 2];\r
-            sdata1[tid] += sdata1[tid + 1];\r
-\r
-            sdata2[tid] += sdata2[tid + 8];\r
-            sdata2[tid] += sdata2[tid + 4];\r
-            sdata2[tid] += sdata2[tid + 2];\r
-            sdata2[tid] += sdata2[tid + 1];\r
-\r
-            sdata3[tid] += sdata3[tid + 8];\r
-            sdata3[tid] += sdata3[tid + 4];\r
-            sdata3[tid] += sdata3[tid + 2];\r
-            sdata3[tid] += sdata3[tid + 1];\r
-\r
-            sdata4[tid] += sdata4[tid + 8];\r
-            sdata4[tid] += sdata4[tid + 4];\r
-            sdata4[tid] += sdata4[tid + 2];\r
-            sdata4[tid] += sdata4[tid + 1];\r
-        }\r
-    }\r
+        const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; /* weight for this sample, looked up in the c_DW table by (yIndex, xIndex) */\r
  \r
-       __global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
-    {\r
-        // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
-        __shared__ float sdx[25];\r
-        __shared__ float sdy[25];\r
-        __shared__ float sdxabs[25];\r
-        __shared__ float sdyabs[25];\r
+        const float vx = (s_PATCH[threadIdx.y    ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x    ]) * dw;\r
+        const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x    ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y    ][threadIdx.x + 1]) * dw;\r
  \r
-        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
-        __syncthreads();\r
+        s_dx_bin[tid] = vx;\r
+        s_dy_bin[tid] = vy;\r
+    }\r
+}\r
  \r
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)\r
+{ /* Sums 25 values of each of the four arrays in place; each total ends up in element 0 and the other elements are overwritten with partial sums. The function contains no barrier: NOTE(review) it presumably relies on the participating threads executing in lockstep within one warp - confirm for the targeted devices. */\r
+    // first step is to reduce from 25 to 16\r
+    if (tid < 9) // use 9 threads\r
+    {\r
+        sdata1[tid] += sdata1[tid + 16];\r
+        sdata2[tid] += sdata2[tid + 16];\r
+        sdata3[tid] += sdata3[tid + 16];\r
+        sdata4[tid] += sdata4[tid + 16];\r
+    }\r
  \r
-        if (tid < 25)\r
-        {\r
-            sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
-            sdyabs[tid] = fabs(sdy[tid]); // |dy| array\r
-            __syncthreads();\r
+    // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)\r
+    if (tid < 8)\r
+    {\r
+        sdata1[tid] += sdata1[tid + 8];\r
+        sdata1[tid] += sdata1[tid + 4];\r
+        sdata1[tid] += sdata1[tid + 2];\r
+        sdata1[tid] += sdata1[tid + 1];\r
+\r
+        sdata2[tid] += sdata2[tid + 8];\r
+        sdata2[tid] += sdata2[tid + 4];\r
+        sdata2[tid] += sdata2[tid + 2];\r
+        sdata2[tid] += sdata2[tid + 1];\r
+\r
+        sdata3[tid] += sdata3[tid + 8];\r
+        sdata3[tid] += sdata3[tid + 4];\r
+        sdata3[tid] += sdata3[tid + 2];\r
+        sdata3[tid] += sdata3[tid + 1];\r
+\r
+        sdata4[tid] += sdata4[tid + 8];\r
+        sdata4[tid] += sdata4[tid + 4];\r
+        sdata4[tid] += sdata4[tid + 2];\r
+        sdata4[tid] += sdata4[tid + 1];\r
+    }\r
+}\r
  \r
-            reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
-            __syncthreads();\r
+__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
+{ /* Writes 4 values (sum dx, sum dy, sum |dx|, sum |dy|) for the sub-region blockIdx.y of feature blockIdx.x into row blockIdx.x of descriptors. */\r
+    // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
+    __shared__ float sdx[25];\r
+    __shared__ float sdy[25];\r
+    __shared__ float sdxabs[25];\r
+    __shared__ float sdyabs[25];\r
  \r
-            reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
-            __syncthreads();\r
+    calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
+    __syncthreads(); /* sdx and sdy are complete for all 25 samples after this barrier */\r
  \r
-            float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);\r
+    const int tid = threadIdx.y * blockDim.x + threadIdx.x; /* linear index over the whole block, unlike the 5-wide index used inside calc_dx_dy */\r
  \r
-            // write dx, dy, |dx|, |dy|\r
-            if (tid == 0)\r
-            {\r
-                descriptors_block[0] = sdx[0];\r
-                descriptors_block[1] = sdy[0];\r
-                descriptors_block[2] = sdxabs[0];\r
-                descriptors_block[3] = sdyabs[0];\r
-            }\r
-        }\r
-    }\r
+    if (tid < 25)\r
      {\r
-        // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
-        __shared__ float sdx[25];\r
-        __shared__ float sdy[25];\r
-\r
-        // sum (reduce) 5x5 area response\r
-        __shared__ float sd1[25];\r
-        __shared__ float sd2[25];\r
-        __shared__ float sdabs1[25];\r
-        __shared__ float sdabs2[25];\r
+        sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array\r
+        sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array\r
+        __syncthreads(); /* NOTE(review): this barrier and the next one sit inside the tid < 25 branch, while compute_descriptors_gpu launches 6x6 = 36 threads per block, so threads 25..35 never reach them - confirm this is safe on the targeted devices */\r
  \r
-        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
+        reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
          __syncthreads();\r
  \r
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+        float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2); /* 4 output floats per sub-region, so the offset is 4 * blockIdx.y */\r
  \r
-        if (tid < 25)\r
+        // write dx, dy, |dx|, |dy|\r
+        if (tid == 0)\r
          {\r
-            if (sdy[tid] >= 0)\r
-            {\r
-                sd1[tid] = sdx[tid];\r
-                sdabs1[tid] = fabs(sdx[tid]);\r
-                sd2[tid] = 0;\r
-                sdabs2[tid] = 0;\r
-            }\r
-            else\r
-            {\r
-                sd1[tid] = 0;\r
-                sdabs1[tid] = 0;\r
-                sd2[tid] = sdx[tid];\r
-                sdabs2[tid] = fabs(sdx[tid]);\r
-            }\r
-            __syncthreads();\r
-\r
-            reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
-            __syncthreads();\r
-\r
-            float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
+            descriptors_block[0] = sdx[0];\r
+            descriptors_block[1] = sdy[0];\r
+            descriptors_block[2] = sdxabs[0];\r
+            descriptors_block[3] = sdyabs[0];\r
+        }\r
+    }\r
+}\r
  \r
-            // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
-            if (tid == 0)\r
-            {\r
-                descriptors_block[0] = sd1[0];\r
-                descriptors_block[1] = sdabs1[0];\r
-                descriptors_block[2] = sd2[0];\r
-                descriptors_block[3] = sdabs2[0];\r
-            }\r
-            __syncthreads();\r
+__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
+{ /* Writes 8 values for the sub-region blockIdx.y of feature blockIdx.x: sums of dx and |dx| split by the sign of dy, then sums of dy and |dy| split by the sign of dx. */\r
+    // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
+    __shared__ float sdx[25];\r
+    __shared__ float sdy[25];\r
  \r
-            // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
-            if (tid == 0)\r
-            {\r
-                descriptors_block[0] = sd1[0];\r
-                descriptors_block[1] = sdabs1[0];\r
-                descriptors_block[2] = sd2[0];\r
-                descriptors_block[3] = sdabs2[0];\r
-            }\r
-            __syncthreads();\r
+    // sum (reduce) 5x5 area response\r
+    __shared__ float sd1[25];\r
+    __shared__ float sd2[25];\r
+    __shared__ float sdabs1[25];\r
+    __shared__ float sdabs2[25];\r
  \r
-            if (sdx[tid] >= 0)\r
-            {\r
-                sd1[tid] = sdy[tid];\r
-                sdabs1[tid] = fabs(sdy[tid]);\r
-                sd2[tid] = 0;\r
-                sdabs2[tid] = 0;\r
-            }\r
-            else\r
-            {\r
-                sd1[tid] = 0;\r
-                sdabs1[tid] = 0;\r
-                sd2[tid] = sdy[tid];\r
-                sdabs2[tid] = fabs(sdy[tid]);\r
-            }\r
-            __syncthreads();\r
+    calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
+    __syncthreads(); /* sdx and sdy are complete for all 25 samples after this barrier */\r
  \r
-            reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
-            __syncthreads();\r
+    const int tid = threadIdx.y * blockDim.x + threadIdx.x; /* linear index over the whole block */\r
  \r
-            // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
-            if (tid == 0)\r
-            {\r
-                descriptors_block[4] = sd1[0];\r
-                descriptors_block[5] = sdabs1[0];\r
-                descriptors_block[6] = sd2[0];\r
-                descriptors_block[7] = sdabs2[0];\r
-            }\r
-        }\r
-    }\r
+    if (tid < 25)\r
      {\r
-        // no need for thread ID\r
-        float* descriptor_base = descriptors.ptr(blockIdx.x);\r
+        if (sdy[tid] >= 0)\r
+        {\r
+            sd1[tid] = sdx[tid];\r
+            sdabs1[tid] = ::fabs(sdx[tid]);\r
+            sd2[tid] = 0;\r
+            sdabs2[tid] = 0;\r
+        }\r
+        else\r
+        {\r
+            sd1[tid] = 0;\r
+            sdabs1[tid] = 0;\r
+            sd2[tid] = sdx[tid];\r
+            sdabs2[tid] = ::fabs(sdx[tid]);\r
+        }\r
+        __syncthreads(); /* NOTE(review): every barrier from here to the end of this branch sits inside tid < 25, while compute_descriptors_gpu launches 6x6 = 36 threads per block, so threads 25..35 never reach them - confirm this is safe on the targeted devices */\r
  \r
-        // read in the unnormalized descriptor values (squared)\r
-        __shared__ float sqDesc[BLOCK_DIM_X];\r
-        const float lookup = descriptor_base[threadIdx.x];\r
-        sqDesc[threadIdx.x] = lookup * lookup;\r
+        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
          __syncthreads();\r
  \r
-        if (BLOCK_DIM_X >= 128)\r
+        float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3); /* 8 output floats per sub-region, so the offset is 8 * blockIdx.y */\r
+\r
+        // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
+        if (tid == 0)\r
          {\r
-            if (threadIdx.x < 64)\r
-                sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];\r
-            __syncthreads();\r
+            descriptors_block[0] = sd1[0];\r
+            descriptors_block[1] = sdabs1[0];\r
+            descriptors_block[2] = sd2[0];\r
+            descriptors_block[3] = sdabs2[0];\r
          }\r
+        __syncthreads(); /* the first four results are read before sd1, sd2, sdabs1 and sdabs2 are reused below */\r
  \r
-        // reduction to get total\r
-        if (threadIdx.x < 32)\r
+        if (sdx[tid] >= 0)\r
          {\r
-            volatile float* smem = sqDesc;\r
-\r
-            smem[threadIdx.x] += smem[threadIdx.x + 32];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 16];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 8];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 4];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 2];\r
-            smem[threadIdx.x] += smem[threadIdx.x + 1];\r
+            sd1[tid] = sdy[tid];\r
+            sdabs1[tid] = ::fabs(sdy[tid]);\r
+            sd2[tid] = 0;\r
+            sdabs2[tid] = 0;\r
          }\r
+        else\r
+        {\r
+            sd1[tid] = 0;\r
+            sdabs1[tid] = 0;\r
+            sd2[tid] = sdy[tid];\r
+            sdabs2[tid] = ::fabs(sdy[tid]);\r
+        }\r
+        __syncthreads();\r
  \r
-        // compute length (square root)\r
-        __shared__ float len;\r
-        if (threadIdx.x == 0)\r
+        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
+        __syncthreads();\r
+\r
+        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
+        if (tid == 0)\r
          {\r
-            len = sqrtf(sqDesc[0]);\r
+            descriptors_block[4] = sd1[0];\r
+            descriptors_block[5] = sdabs1[0];\r
+            descriptors_block[6] = sd2[0];\r
+            descriptors_block[7] = sdabs2[0];\r
          }\r
+    }\r
+}\r
+\r
+template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)\r
+{ /* Divides every element of descriptor row blockIdx.x by the square root of the row's sum of squares. One thread handles one element, and sqDesc is sized by BLOCK_DIM_X and indexed by threadIdx.x, so the launch is expected to use BLOCK_DIM_X threads per block. */\r
+    // no need for thread ID\r
+    float* descriptor_base = descriptors.ptr(blockIdx.x);\r
+\r
+    // read in the unnormalized descriptor values (squared)\r
+    __shared__ float sqDesc[BLOCK_DIM_X];\r
+    const float lookup = descriptor_base[threadIdx.x];\r
+    sqDesc[threadIdx.x] = lookup * lookup;\r
+    __syncthreads();\r
+\r
+    if (BLOCK_DIM_X >= 128) /* compile-time condition: folds the upper 64 squares onto the lower 64 */\r
+    {\r
+        if (threadIdx.x < 64)\r
+            sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];\r
          __syncthreads();\r
+    }\r
  \r
-        // normalize and store in output\r
-        descriptor_base[threadIdx.x] = lookup / len;\r
+    // reduction to get total\r
+    if (threadIdx.x < 32)\r
+    {\r
+        volatile float* smem = sqDesc; /* volatile access with no barrier between the steps below */\r
+\r
+        smem[threadIdx.x] += smem[threadIdx.x + 32];\r
+        smem[threadIdx.x] += smem[threadIdx.x + 16];\r
+        smem[threadIdx.x] += smem[threadIdx.x + 8];\r
+        smem[threadIdx.x] += smem[threadIdx.x + 4];\r
+        smem[threadIdx.x] += smem[threadIdx.x + 2];\r
+        smem[threadIdx.x] += smem[threadIdx.x + 1];\r
      }\r
  \r
-    void compute_descriptors_gpu(const DevMem2Df& descriptors, \r
-        const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)\r
+    // compute length (square root)\r
+    __shared__ float len;\r
+    if (threadIdx.x == 0)\r
      {\r
-        // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D\r
-        \r
-        if (descriptors.cols == 64)\r
-        {\r
-            compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);\r
-            cudaSafeCall( cudaGetLastError() );\r
+        len = sqrtf(sqDesc[0]);\r
+    }\r
+    __syncthreads(); /* publishes len to every thread of the block */\r
  \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+    // normalize and store in output\r
+    descriptor_base[threadIdx.x] = lookup / len; /* NOTE(review): len is 0 for an all-zero row and there is no guard, so this divides by zero - confirm such rows cannot occur */\r
+}\r
  \r
-            normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);\r
-            cudaSafeCall( cudaGetLastError() );\r
+void compute_descriptors_gpu(const DevMem2Df& descriptors, \r
+    const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)\r
+{ /* Host-side launcher: runs the descriptor kernel on a grid of nFeatures x 16 blocks with 6x6 threads each, then the normalization kernel with one block per feature; waits for the device after each launch. */\r
+    // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D\r
+    \r
+    if (descriptors.cols == 64)\r
+    {\r
+        compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);\r
+        cudaSafeCall( cudaGetLastError() );\r
  \r
-            normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);\r
-            cudaSafeCall( cudaGetLastError() );\r
-\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-        else\r
-        {\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
  \r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
+        normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);\r
+        cudaSafeCall( cudaGetLastError() );\r
  \r
-            compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);            \r
-            cudaSafeCall( cudaGetLastError() );\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+    }\r
+    else /* any width other than 64 is handled as a 128-element descriptor; the width is not checked */\r
+    {\r
+        compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);            \r
+        cudaSafeCall( cudaGetLastError() );\r
  \r
-            normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);            \r
-            cudaSafeCall( cudaGetLastError() );\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
+\r
+        normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);            \r
+        cudaSafeCall( cudaGetLastError() );\r
+\r
+        cudaSafeCall( cudaDeviceSynchronize() );\r
      }\r
-}}}\r
+}\r
+\r
+} // namespace surf\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp

index 880794b..fee4507 100644 (file)
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -71,16 +71,20 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
  \r
  #include "opencv2/gpu/stream_accessor.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device {            \r
-    void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    template <typename T>\r
-    void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);\r
-    template <typename T>\r
-    void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);\r
+\r
+template <typename T>\r
+void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);\r
+template <typename T>\r
+void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+\r
+void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
-    void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
-}}}\r
+using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
  struct Stream::Impl\r
  {\r
@@ -101,14 +105,14 @@ namespace
      void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)\r
      {\r
          Scalar_<T> sf = s;\r
-        device::set_to_gpu(src, sf.val, src.channels(), stream);\r
+        set_to_gpu(src, sf.val, src.channels(), stream);\r
      }\r
  \r
      template <typename T>\r
      void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)\r
      {\r
          Scalar_<T> sf = s;\r
-        device::set_to_gpu(src, sf.val, mask, src.channels(), stream);\r
+        set_to_gpu(src, sf.val, mask, src.channels(), stream);\r
      }\r
  }\r
  \r
@@ -255,7 +259,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
          psrc = &(temp = src);\r
  \r
      dst.create( src.size(), rtype );\r
-    device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);\r
+    convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);\r
  }\r
  \r
  cv::gpu::Stream::operator bool() const\r
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp

index 51acac7..0b6957d 100644 (file)
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -123,18 +123,19 @@ namespace
  ////////////////////////////////////////////////////////////////////////\r
  // add\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template <typename T, typename D> \r
-    void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T, typename D> \r
+void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+template <typename T, typename D> \r
+void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
  \r
-    template <typename T, typename D> \r
-    void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
  \r
@@ -173,7 +174,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
  \r
  void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
  \r
@@ -235,18 +236,19 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
  ////////////////////////////////////////////////////////////////////////\r
  // subtract\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template <typename T, typename D> \r
-    void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T, typename D> \r
+void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
  \r
-    template <typename T, typename D> \r
-    void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
-}}}\r
+template <typename T, typename D> \r
+void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
  \r
@@ -285,7 +287,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
  \r
  void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);\r
  \r
@@ -347,21 +349,22 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
  ////////////////////////////////////////////////////////////////////////\r
  // multiply\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
-    void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
+void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);\r
+\r
+template <typename T, typename D> \r
+void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
  \r
-    template <typename T, typename D> \r
-    void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+template <typename T, typename D> \r
+void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
  \r
-    template <typename T, typename D> \r
-    void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
  \r
@@ -419,7 +422,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
  \r
  void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
  \r
@@ -469,24 +472,25 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
  ////////////////////////////////////////////////////////////////////////\r
  // divide\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
-    void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);\r
-    \r
-    template <typename T, typename D> \r
-    void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    \r
-    template <typename T, typename D> \r
-    void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
-    \r
-    template <typename T, typename D> \r
-    void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-}}}\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);\r
+void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);\r
+\r
+template <typename T, typename D> \r
+void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+template <typename T, typename D> \r
+void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
+\r
+template <typename T, typename D> \r
+void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
  \r
@@ -544,7 +548,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
  \r
  void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);\r
  \r
@@ -593,7 +597,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
  \r
  void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
  \r
@@ -626,18 +630,19 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
  //////////////////////////////////////////////////////////////////////////////\r
  // absdiff\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template <typename T> \r
-    void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    \r
-    template <typename T> \r
-    void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);\r
-}}}\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T>\r
+void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+template <typename T> \r
+void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
  \r
@@ -709,7 +714,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
  \r
  void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);\r
  \r
@@ -753,17 +758,18 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
  //////////////////////////////////////////////////////////////////////////////\r
  // Comparison of two matrixes\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-    template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
-}}}\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)\r
  {\r
-    using namespace cv::gpu::device;\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);\r
  \r
@@ -829,13 +835,14 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
  //////////////////////////////////////////////////////////////////////////////\r
  // Unary bitwise logical operations\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    template <typename T>\r
-    void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
-}}}\r
+void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);\r
+\r
+template <typename T>\r
+void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
@@ -843,20 +850,23 @@ namespace
      {\r
          dst.create(src.size(), src.type());\r
  \r
-        cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), \r
-                                              dst.channels(), src, dst, stream);\r
+        OPENCV_DEVICE_NAMESPACE_ bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);\r
      }\r
  \r
  \r
      void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)\r
      {\r
-        using namespace cv::gpu;\r
+        using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
          typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-        static Caller callers[] = {device::bitwiseMaskNotCaller<unsigned char>, device::bitwiseMaskNotCaller<unsigned char>, \r
-                                   device::bitwiseMaskNotCaller<unsigned short>, device::bitwiseMaskNotCaller<unsigned short>,\r
-                                   device::bitwiseMaskNotCaller<unsigned int>, device::bitwiseMaskNotCaller<unsigned int>,\r
-                                   device::bitwiseMaskNotCaller<unsigned int>};\r
+\r
+        static Caller callers[] = \r
+        {\r
+            bitwiseMaskNotCaller<unsigned char>, bitwiseMaskNotCaller<unsigned char>, \r
+            bitwiseMaskNotCaller<unsigned short>, bitwiseMaskNotCaller<unsigned short>,\r
+            bitwiseMaskNotCaller<unsigned int>, bitwiseMaskNotCaller<unsigned int>,\r
+            bitwiseMaskNotCaller<unsigned int>\r
+        };\r
  \r
          CV_Assert(mask.type() == CV_8U && mask.size() == src.size());\r
          dst.create(src.size(), src.type());\r
@@ -874,33 +884,33 @@ namespace
  void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)\r
  {\r
      if (mask.empty())\r
-        ::bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));\r
+        bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));\r
      else\r
-        ::bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));\r
+        bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));\r
  }\r
  \r
  \r
  //////////////////////////////////////////////////////////////////////////////\r
  // Binary bitwise logical operations\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    template <typename T>\r
-    void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
  \r
-    void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
+template <typename T>\r
+void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
  \r
-    template <typename T>\r
-    void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
+void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
  \r
-    void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
+template <typename T>\r
+void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
  \r
-    template <typename T>\r
-    void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
-}}}\r
+void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);\r
+\r
+template <typename T>\r
+void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
@@ -909,20 +919,22 @@ namespace
          CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
          dst.create(src1.size(), src1.type());\r
  \r
-        cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), \r
-                                             dst.channels(), src1, src2, dst, stream);\r
+        OPENCV_DEVICE_NAMESPACE_ bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
      }\r
  \r
-\r
      void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)\r
      {\r
-        using namespace cv::gpu;\r
+        using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
          typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-        static Caller callers[] = {device::bitwiseMaskOrCaller<unsigned char>, device::bitwiseMaskOrCaller<unsigned char>, \r
-                                   device::bitwiseMaskOrCaller<unsigned short>, device::bitwiseMaskOrCaller<unsigned short>,\r
-                                   device::bitwiseMaskOrCaller<unsigned int>, device::bitwiseMaskOrCaller<unsigned int>,\r
-                                   device::bitwiseMaskOrCaller<unsigned int>};\r
+\r
+        static Caller callers[] = \r
+        {\r
+            bitwiseMaskOrCaller<unsigned char>, bitwiseMaskOrCaller<unsigned char>, \r
+            bitwiseMaskOrCaller<unsigned short>, bitwiseMaskOrCaller<unsigned short>,\r
+            bitwiseMaskOrCaller<unsigned int>, bitwiseMaskOrCaller<unsigned int>,\r
+            bitwiseMaskOrCaller<unsigned int>\r
+        };\r
  \r
          CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
          dst.create(src1.size(), src1.type());\r
@@ -940,20 +952,23 @@ namespace
          CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
          dst.create(src1.size(), src1.type());\r
  \r
-        cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), \r
-                                              dst.channels(), src1, src2, dst, stream);\r
+        OPENCV_DEVICE_NAMESPACE_ bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
      }\r
  \r
  \r
      void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)\r
      {\r
-        using namespace cv::gpu;\r
+        using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
          typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-        static Caller callers[] = {device::bitwiseMaskAndCaller<unsigned char>, device::bitwiseMaskAndCaller<unsigned char>, \r
-                                   device::bitwiseMaskAndCaller<unsigned short>, device::bitwiseMaskAndCaller<unsigned short>,\r
-                                   device::bitwiseMaskAndCaller<unsigned int>, device::bitwiseMaskAndCaller<unsigned int>,\r
-                                   device::bitwiseMaskAndCaller<unsigned int>};\r
+\r
+        static Caller callers[] = \r
+        {\r
+            bitwiseMaskAndCaller<unsigned char>, bitwiseMaskAndCaller<unsigned char>, \r
+            bitwiseMaskAndCaller<unsigned short>, bitwiseMaskAndCaller<unsigned short>,\r
+            bitwiseMaskAndCaller<unsigned int>, bitwiseMaskAndCaller<unsigned int>,\r
+            bitwiseMaskAndCaller<unsigned int>\r
+        };\r
  \r
          CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
          dst.create(src1.size(), src1.type());\r
@@ -971,20 +986,23 @@ namespace
          CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
          dst.create(src1.size(), src1.type());\r
  \r
-        cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), \r
-                                              dst.channels(), src1, src2, dst, stream);\r
+        OPENCV_DEVICE_NAMESPACE_ bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);\r
      }\r
  \r
  \r
      void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)\r
      {\r
-        using namespace cv::gpu;\r
+        using namespace OPENCV_DEVICE_NAMESPACE;\r
  \r
          typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);\r
-        static Caller callers[] = {device::bitwiseMaskXorCaller<unsigned char>, device::bitwiseMaskXorCaller<unsigned char>, \r
-                                   device::bitwiseMaskXorCaller<unsigned short>, device::bitwiseMaskXorCaller<unsigned short>,\r
-                                   device::bitwiseMaskXorCaller<unsigned int>, device::bitwiseMaskXorCaller<unsigned int>,\r
-                                   device::bitwiseMaskXorCaller<unsigned int>};\r
+\r
+        static Caller callers[] = \r
+        {\r
+            bitwiseMaskXorCaller<unsigned char>, bitwiseMaskXorCaller<unsigned char>, \r
+            bitwiseMaskXorCaller<unsigned short>, bitwiseMaskXorCaller<unsigned short>,\r
+            bitwiseMaskXorCaller<unsigned int>, bitwiseMaskXorCaller<unsigned int>,\r
+            bitwiseMaskXorCaller<unsigned int>\r
+        };\r
  \r
          CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
          dst.create(src1.size(), src1.type());\r
@@ -1001,47 +1019,48 @@ namespace
  void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)\r
  {\r
      if (mask.empty())\r
-        ::bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));\r
+        bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));\r
      else\r
-        ::bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));\r
+        bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));\r
  }\r
  \r
  \r
  void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)\r
  {\r
      if (mask.empty())\r
-        ::bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));\r
+        bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));\r
      else\r
-        ::bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));\r
+        bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));\r
  }\r
  \r
  \r
  void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)\r
  {\r
      if (mask.empty())\r
-        ::bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));\r
+        bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));\r
      else\r
-        ::bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));\r
+        bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));\r
  }\r
  \r
  \r
  //////////////////////////////////////////////////////////////////////////////\r
  // Minimum and maximum operations\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template <typename T>\r
-    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    template <typename T>\r
-    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+template <typename T>\r
+void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
  \r
-    template <typename T>\r
-    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+template <typename T>\r
+void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
  \r
-    template <typename T>\r
-    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
-}}}\r
+template <typename T>\r
+void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+\r
+template <typename T>\r
+void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
@@ -1050,14 +1069,14 @@ namespace
      {\r
          CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
          dst.create(src1.size(), src1.type());\r
-        device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);\r
+        OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);\r
      }\r
  \r
      template <typename T>\r
      void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)\r
      {\r
          dst.create(src1.size(), src1.type());\r
-        device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
+        OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
      }\r
      \r
      template <typename T>\r
@@ -1065,14 +1084,14 @@ namespace
      {\r
          CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());\r
          dst.create(src1.size(), src1.type());\r
-        device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);\r
+        OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);\r
      }\r
  \r
      template <typename T>\r
      void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)\r
      {\r
          dst.create(src1.size(), src1.type());\r
-        device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
+        OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);\r
      }\r
  }\r
  \r
@@ -1136,18 +1155,18 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
  ////////////////////////////////////////////////////////////////////////\r
  // threshold\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template <typename T>\r
-    void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type,\r
-        cudaStream_t stream);\r
-}}}\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T>\r
+void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
      template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)\r
      {\r
-        device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);\r
+        OPENCV_DEVICE_NAMESPACE_ threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);\r
      }\r
  }\r
  \r
@@ -1204,24 +1223,27 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
  ////////////////////////////////////////////////////////////////////////\r
  // pow\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template<typename T>\r
-    void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
-}}}\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template<typename T>\r
+void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)\r
-{    \r
-    CV_Assert( src.depth() != CV_64F );\r
+{\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
+\r
+    CV_Assert(src.depth() != CV_64F);\r
      dst.create(src.size(), src.type());\r
  \r
      typedef void (*caller_t)(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);\r
  \r
      static const caller_t callers[] = \r
      {\r
-        device::pow_caller<unsigned char>,  device::pow_caller<signed char>, \r
-        device::pow_caller<unsigned short>, device::pow_caller<short>, \r
-        device::pow_caller<int>, device::pow_caller<float>\r
+        pow_caller<unsigned char>,  pow_caller<signed char>, \r
+        pow_caller<unsigned short>, pow_caller<short>, \r
+        pow_caller<int>, pow_caller<float>\r
      };\r
  \r
      callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));    \r
@@ -1230,14 +1252,17 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
  ////////////////////////////////////////////////////////////////////////\r
  // addWeighted\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template <typename T1, typename T2, typename D>\r
-    void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
-}}}\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T1, typename T2, typename D>\r
+void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE;\r
+\r
      CV_Assert(src1.size() == src2.size());\r
      CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));\r
  \r
@@ -1256,8 +1281,6 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
  \r
      typedef void (*caller_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);\r
  \r
-    using namespace cv::gpu::device;\r
-\r
      static const caller_t callers[7][7][7] =\r
      {\r
          {\r
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp

index b5e7509..f959e31 100644 (file)
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -735,14 +735,21 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
  ////////////////////////////////////////////////////////////////////////////////////////////////////\r
  // Separable Linear Filter\r
  \r
-namespace cv { namespace gpu { namespace filters\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace row_filter\r
  {\r
      template <typename T, typename D>\r
      void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
+}\r
  \r
+namespace column_filter\r
+{\r
      template <typename T, typename D>\r
      void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
@@ -796,6 +803,8 @@ namespace
  \r
  Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ row_filter;\r
+\r
      static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};\r
      \r
      if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))\r
@@ -837,25 +846,25 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
      switch (srcType)\r
      {\r
      case CV_8UC1:\r
-        func = filters::linearRowFilter_gpu<uchar, float>;\r
+        func = linearRowFilter_gpu<uchar, float>;\r
          break;\r
      case CV_8UC4:\r
-        func = filters::linearRowFilter_gpu<uchar4, float4>;\r
+        func = linearRowFilter_gpu<uchar4, float4>;\r
          break;\r
      /*case CV_16SC1:\r
-        func = filters::linearRowFilter_gpu<short, float>;\r
+        func = linearRowFilter_gpu<short, float>;\r
          break;*/\r
      /*case CV_16SC2:\r
-        func = filters::linearRowFilter_gpu<short2, float2>;\r
+        func = linearRowFilter_gpu<short2, float2>;\r
          break;*/\r
      case CV_16SC3:\r
-        func = filters::linearRowFilter_gpu<short3, float3>;\r
+        func = linearRowFilter_gpu<short3, float3>;\r
          break;\r
      case CV_32SC1:\r
-        func = filters::linearRowFilter_gpu<int, float>;\r
+        func = linearRowFilter_gpu<int, float>;\r
          break;\r
      case CV_32FC1:\r
-        func = filters::linearRowFilter_gpu<float, float>;\r
+        func = linearRowFilter_gpu<float, float>;\r
          break;\r
      }\r
  \r
@@ -909,6 +918,8 @@ namespace
  \r
  Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ column_filter;\r
+\r
      static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};\r
      \r
      if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))\r
@@ -950,25 +961,25 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
      switch (dstType)\r
      {\r
      case CV_8UC1:\r
-        func = filters::linearColumnFilter_gpu<float, uchar>;\r
+        func = linearColumnFilter_gpu<float, uchar>;\r
          break;\r
      case CV_8UC4:\r
-        func = filters::linearColumnFilter_gpu<float4, uchar4>;\r
+        func = linearColumnFilter_gpu<float4, uchar4>;\r
          break;\r
      /*case CV_16SC1:\r
-        func = filters::linearColumnFilter_gpu<float, short>;\r
+        func = linearColumnFilter_gpu<float, short>;\r
          break;*/\r
      /*case CV_16SC2:\r
-        func = filters::linearColumnFilter_gpu<float2, short2>;\r
+        func = linearColumnFilter_gpu<float2, short2>;\r
          break;*/\r
      case CV_16SC3:\r
-        func = filters::linearColumnFilter_gpu<float3, short3>;\r
+        func = linearColumnFilter_gpu<float3, short3>;\r
          break;\r
      case CV_32SC1:\r
-        func = filters::linearColumnFilter_gpu<float, int>;\r
+        func = linearColumnFilter_gpu<float, int>;\r
          break;\r
      case CV_32FC1:\r
-        func = filters::linearColumnFilter_gpu<float, float>;\r
+        func = linearColumnFilter_gpu<float, float>;\r
          break;\r
      }\r
  \r
diff --git a/modules/gpu/src/gpumat.cpp b/modules/gpu/src/gpumat.cpp

deleted file mode 100644 (file)

index b1818a5..0000000
--- a/modules/gpu/src/gpumat.cpp
+++ /dev/null
@@ -1,863 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////\r
-//\r
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
-//\r
-//  By downloading, copying, installing or using the software you agree to this license.\r
-//  If you do not agree to this license, do not download, install,\r
-//  copy or use the software.\r
-//\r
-//\r
-//                           License Agreement\r
-//                For Open Source Computer Vision Library\r
-//\r
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
-// Third party copyrights are property of their respective owners.\r
-//\r
-// Redistribution and use in source and binary forms, with or without modification,\r
-// are permitted provided that the following conditions are met:\r
-//\r
-//   * Redistribution's of source code must retain the above copyright notice,\r
-//     this list of conditions and the following disclaimer.\r
-//\r
-//   * Redistribution's in binary form must reproduce the above copyright notice,\r
-//     this list of conditions and the following disclaimer in the documentation\r
-//     and/or other materials provided with the distribution.\r
-//\r
-//   * The name of the copyright holders may not be used to endorse or promote products\r
-//     derived from this software without specific prior written permission.\r
-//\r
-// This software is provided by the copyright holders and contributors "as is" and\r
-// any express or implied warranties, including, but not limited to, the implied\r
-// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
-// In no event shall the Intel Corporation or contributors be liable for any direct,\r
-// indirect, incidental, special, exemplary, or consequential damages\r
-// (including, but not limited to, procurement of substitute goods or services;\r
-// loss of use, data, or profits; or business interruption) however caused\r
-// and on any theory of liability, whether in contract, strict liability,\r
-// or tort (including negligence or otherwise) arising in any way out of\r
-// the use of this software, even if advised of the possibility of such damage.\r
-//\r
-//M*/\r
-\r
-#include "precomp.hpp"\r
-\r
-using namespace cv;\r
-using namespace cv::gpu;\r
-using namespace std;\r
-\r
-cv::gpu::GpuMat::GpuMat(const GpuMat& m) : \r
-    flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)\r
-{\r
-    if (refcount)\r
-        CV_XADD(refcount, 1);\r
-}\r
-\r
-cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) : \r
-    flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(rows_), cols(cols_), \r
-    step(step_), data((uchar*)data_), refcount(0),\r
-    datastart((uchar*)data_), dataend((uchar*)data_)\r
-{\r
-    size_t minstep = cols * elemSize();\r
-\r
-    if (step == Mat::AUTO_STEP)\r
-    {\r
-        step = minstep;\r
-        flags |= Mat::CONTINUOUS_FLAG;\r
-    }\r
-    else\r
-    {\r
-        if (rows == 1) \r
-            step = minstep;\r
-\r
-        CV_DbgAssert(step >= minstep);\r
-\r
-        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;\r
-    }\r
-    dataend += step * (rows - 1) + minstep;\r
-}\r
-\r
-cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : \r
-    flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(size_.height), cols(size_.width),\r
-    step(step_), data((uchar*)data_), refcount(0),\r
-    datastart((uchar*)data_), dataend((uchar*)data_)\r
-{\r
-    size_t minstep = cols * elemSize();\r
-\r
-    if (step == Mat::AUTO_STEP)\r
-    {\r
-        step = minstep;\r
-        flags |= Mat::CONTINUOUS_FLAG;\r
-    }\r
-    else\r
-    {\r
-        if (rows == 1) \r
-            step = minstep;\r
-\r
-        CV_DbgAssert(step >= minstep);\r
-\r
-        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;\r
-    }\r
-    dataend += step * (rows - 1) + minstep;\r
-}\r
-\r
-cv::gpu::GpuMat::GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange)\r
-{\r
-    flags = m.flags;\r
-    step = m.step; refcount = m.refcount;\r
-    data = m.data; datastart = m.datastart; dataend = m.dataend;\r
-\r
-    if (rowRange == Range::all())\r
-        rows = m.rows;\r
-    else\r
-    {\r
-        CV_Assert(0 <= rowRange.start && rowRange.start <= rowRange.end && rowRange.end <= m.rows);\r
-\r
-        rows = rowRange.size();\r
-        data += step*rowRange.start;\r
-    }\r
-\r
-    if (colRange == Range::all())\r
-        cols = m.cols;\r
-    else\r
-    {\r
-        CV_Assert(0 <= colRange.start && colRange.start <= colRange.end && colRange.end <= m.cols);\r
-\r
-        cols = colRange.size();\r
-        data += colRange.start*elemSize();\r
-        flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;\r
-    }\r
-\r
-    if (rows == 1)\r
-        flags |= Mat::CONTINUOUS_FLAG;\r
-\r
-    if (refcount)\r
-        CV_XADD(refcount, 1);\r
-\r
-    if (rows <= 0 || cols <= 0)\r
-        rows = cols = 0;\r
-}\r
-\r
-cv::gpu::GpuMat::GpuMat(const GpuMat& m, const Rect& roi) : \r
-    flags(m.flags), rows(roi.height), cols(roi.width),\r
-    step(m.step), data(m.data + roi.y*step), refcount(m.refcount),\r
-    datastart(m.datastart), dataend(m.dataend)\r
-{\r
-    flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;\r
-    data += roi.x * elemSize();\r
-\r
-    CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows);\r
-\r
-    if (refcount)\r
-        CV_XADD(refcount, 1);\r
-\r
-    if (rows <= 0 || cols <= 0)\r
-        rows = cols = 0;\r
-}\r
-\r
-cv::gpu::GpuMat::GpuMat(const Mat& m) : \r
-    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) \r
-{ \r
-    upload(m); \r
-}\r
-\r
-GpuMat& cv::gpu::GpuMat::operator = (const GpuMat& m)\r
-{\r
-    if (this != &m)\r
-    {\r
-        GpuMat temp(m);\r
-        swap(temp);\r
-    }\r
-\r
-    return *this;\r
-}\r
-\r
-void cv::gpu::GpuMat::swap(GpuMat& b)\r
-{\r
-    std::swap(flags, b.flags);\r
-    std::swap(rows, b.rows); \r
-    std::swap(cols, b.cols);\r
-    std::swap(step, b.step); \r
-    std::swap(data, b.data);\r
-    std::swap(datastart, b.datastart);\r
-    std::swap(dataend, b.dataend);\r
-    std::swap(refcount, b.refcount);\r
-}\r
-\r
-void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const\r
-{\r
-    size_t esz = elemSize();\r
-    ptrdiff_t delta1 = data - datastart;\r
-    ptrdiff_t delta2 = dataend - datastart;\r
-\r
-    CV_DbgAssert(step > 0);\r
-\r
-    if (delta1 == 0)\r
-        ofs.x = ofs.y = 0;\r
-    else\r
-    {\r
-        ofs.y = static_cast<int>(delta1 / step);\r
-        ofs.x = static_cast<int>((delta1 - step * ofs.y) / esz);\r
-\r
-        CV_DbgAssert(data == datastart + ofs.y * step + ofs.x * esz);\r
-    }\r
-\r
-    size_t minstep = (ofs.x + cols) * esz;\r
-\r
-    wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / step + 1), ofs.y + rows);\r
-    wholeSize.width = std::max(static_cast<int>((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols);\r
-}\r
-\r
-GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)\r
-{\r
-    Size wholeSize; \r
-    Point ofs;\r
-    locateROI(wholeSize, ofs);\r
-\r
-    size_t esz = elemSize();\r
-\r
-    int row1 = std::max(ofs.y - dtop, 0); \r
-    int row2 = std::min(ofs.y + rows + dbottom, wholeSize.height);\r
-\r
-    int col1 = std::max(ofs.x - dleft, 0);\r
-    int col2 = std::min(ofs.x + cols + dright, wholeSize.width);\r
-\r
-    data += (row1 - ofs.y) * step + (col1 - ofs.x) * esz;\r
-    rows = row2 - row1; \r
-    cols = col2 - col1;\r
-\r
-    if (esz * cols == step || rows == 1)\r
-        flags |= Mat::CONTINUOUS_FLAG;\r
-    else\r
-        flags &= ~Mat::CONTINUOUS_FLAG;\r
-\r
-    return *this;\r
-}\r
-\r
-GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const\r
-{\r
-    GpuMat hdr = *this;\r
-\r
-    int cn = channels();\r
-    if (new_cn == 0)\r
-        new_cn = cn;\r
-\r
-    int total_width = cols * cn;\r
-\r
-    if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)\r
-        new_rows = rows * total_width / new_cn;\r
-\r
-    if (new_rows != 0 && new_rows != rows)\r
-    {\r
-        int total_size = total_width * rows;\r
-\r
-        if (!isContinuous())\r
-            CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed");\r
-\r
-        if ((unsigned)new_rows > (unsigned)total_size)\r
-            CV_Error(CV_StsOutOfRange, "Bad new number of rows");\r
-\r
-        total_width = total_size / new_rows;\r
-\r
-        if (total_width * new_rows != total_size)\r
-            CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");\r
-\r
-        hdr.rows = new_rows;\r
-        hdr.step = total_width * elemSize1();\r
-    }\r
-\r
-    int new_width = total_width / new_cn;\r
-\r
-    if (new_width * new_cn != total_width)\r
-        CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");\r
-\r
-    hdr.cols = new_width;\r
-    hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);\r
-\r
-    return hdr;\r
-}\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-class GpuFuncTable\r
-{\r
-public:\r
-    virtual ~GpuFuncTable() {}\r
-\r
-    virtual void copy(const Mat& src, GpuMat& dst) const = 0;\r
-    virtual void copy(const GpuMat& src, Mat& dst) const = 0;\r
-    virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;\r
-\r
-    virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;\r
-\r
-    virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;\r
-    virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;\r
-\r
-    virtual void setTo(GpuMat& m, const Scalar& s, const GpuMat& mask) const = 0;\r
-\r
-    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;\r
-    virtual void free(void* devPtr) const = 0;\r
-};\r
-\r
-\r
-#if !defined (HAVE_CUDA)\r
-\r
-class EmptyFuncTable : public GpuFuncTable\r
-{\r
-public:\r
-    void copy(const Mat&, GpuMat&) const { throw_nogpu(); }\r
-    void copy(const GpuMat&, Mat&) const { throw_nogpu(); }\r
-    void copy(const GpuMat&, GpuMat&) const { throw_nogpu(); }\r
-\r
-    void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu(); }\r
-\r
-    void convert(const GpuMat&, GpuMat&) const { throw_nogpu(); }\r
-    void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu(); }\r
-\r
-    void setTo(GpuMat&, const Scalar&, const GpuMat&) const { throw_nogpu(); }\r
-\r
-    void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu(); }\r
-    void free(void*) const {}\r
-};\r
-\r
-const GpuFuncTable* gpuFuncTable()\r
-{\r
-    static EmptyFuncTable empty;\r
-    return &empty;\r
-}\r
-\r
-#else\r
-\r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);\r
-\r
-    template <typename T>\r
-    void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);\r
-    template <typename T>\r
-    void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
-\r
-    void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
-}}}\r
-\r
-namespace\r
-{\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // Convert\r
-\r
-    template<int n> struct NPPTypeTraits;\r
-    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };\r
-    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };\r
-    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };\r
-    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };\r
-    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };\r
-\r
-    template<int SDEPTH, int DDEPTH> struct NppConvertFunc\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;\r
-\r
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);\r
-    };\r
-    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>\r
-    {\r
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;\r
-\r
-        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);\r
-    };\r
-\r
-    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;\r
-\r
-        static void cvt(const GpuMat& src, GpuMat& dst)\r
-        {\r
-            NppiSize sz;\r
-            sz.width = src.cols;\r
-            sz.height = src.rows;\r
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );\r
-\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
-    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>\r
-    {\r
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;\r
-\r
-        static void cvt(const GpuMat& src, GpuMat& dst)\r
-        {\r
-            NppiSize sz;\r
-            sz.width = src.cols;\r
-            sz.height = src.rows;\r
-            nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );\r
-\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
-\r
-    void convertToKernelCaller(const GpuMat& src, GpuMat& dst)\r
-    {\r
-        device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);\r
-    }\r
-\r
-    //////////////////////////////////////////////////////////////////////////\r
-    // Set\r
-    \r
-    template<int SDEPTH, int SCN> struct NppSetFunc\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-\r
-        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);\r
-    };\r
-    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-\r
-        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);\r
-    };\r
-\r
-    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-\r
-        static void set(GpuMat& src, const Scalar& s)\r
-        {\r
-            NppiSize sz;\r
-            sz.width = src.cols;\r
-            sz.height = src.rows;\r
-\r
-            Scalar_<src_t> nppS = s;\r
-\r
-            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );\r
-\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
-    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-\r
-        static void set(GpuMat& src, const Scalar& s)\r
-        {\r
-            NppiSize sz;\r
-            sz.width = src.cols;\r
-            sz.height = src.rows;\r
-\r
-            Scalar_<src_t> nppS = s;\r
-\r
-            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );\r
-\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
-\r
-    template <typename T>\r
-    void kernelSet(GpuMat& src, const Scalar& s)\r
-    {\r
-        Scalar_<T> sf = s;\r
-        device::set_to_gpu(src, sf.val, src.channels(), 0);\r
-    }\r
-\r
-    template<int SDEPTH, int SCN> struct NppSetMaskFunc\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-\r
-        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);\r
-    };\r
-    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-\r
-        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);\r
-    };\r
-\r
-    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-\r
-        static void set(GpuMat& src, const Scalar& s, const GpuMat& mask)\r
-        {\r
-            NppiSize sz;\r
-            sz.width = src.cols;\r
-            sz.height = src.rows;\r
-\r
-            Scalar_<src_t> nppS = s;\r
-\r
-            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );\r
-\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
-    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>\r
-    {\r
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
-\r
-        static void set(GpuMat& src, const Scalar& s, const GpuMat& mask)\r
-        {\r
-            NppiSize sz;\r
-            sz.width = src.cols;\r
-            sz.height = src.rows;\r
-\r
-            Scalar_<src_t> nppS = s;\r
-\r
-            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );\r
-\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-        }\r
-    };\r
-\r
-    template <typename T>\r
-    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)\r
-    {\r
-        Scalar_<T> sf = s;\r
-        device::set_to_gpu(src, sf.val, mask, src.channels(), 0);\r
-    }\r
-}\r
-\r
-class CudaFuncTable : public GpuFuncTable\r
-{\r
-public:\r
-    virtual void copy(const Mat& src, GpuMat& dst) const \r
-    { \r
-        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );\r
-    }\r
-    virtual void copy(const GpuMat& src, Mat& dst) const\r
-    { \r
-        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );\r
-    }\r
-    virtual void copy(const GpuMat& src, GpuMat& dst) const\r
-    { \r
-        cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );\r
-    }\r
-\r
-    virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const \r
-    { \r
-        device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels());\r
-    }\r
-\r
-    void convert(const GpuMat& src, GpuMat& dst) const \r
-    { \r
-        typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);\r
-        static const caller_t callers[7][7][7] =\r
-        {\r
-            {                \r
-                /*  8U ->  8U */ {0, 0, 0, 0},\r
-                /*  8U ->  8S */ {convertToKernelCaller, convertToKernelCaller, convertToKernelCaller, convertToKernelCaller},\r
-                /*  8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},\r
-                /*  8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},\r
-                /*  8U -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /*  8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /*  8U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
-            },\r
-            {\r
-                /*  8S ->  8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /*  8S ->  8S */ {0,0,0,0},\r
-                /*  8S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /*  8S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /*  8S -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /*  8S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /*  8S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
-            },\r
-            {\r
-                /* 16U ->  8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},\r
-                /* 16U ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 16U -> 16U */ {0,0,0,0},\r
-                /* 16U -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 16U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
-            },\r
-            {\r
-                /* 16S ->  8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},\r
-                /* 16S ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 16S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 16S -> 16S */ {0,0,0,0},\r
-                /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 16S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
-            },\r
-            {\r
-                /* 32S ->  8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32S ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32S -> 32S */ {0,0,0,0},\r
-                /* 32S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
-            },\r
-            {\r
-                /* 32F ->  8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32F ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 32F -> 32F */ {0,0,0,0},\r
-                /* 32F -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
-            },\r
-            {\r
-                /* 64F ->  8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 64F ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 64F -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 64F -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 64F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 64F -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
-                /* 64F -> 64F */ {0,0,0,0}\r
-            }\r
-        };\r
-\r
-        caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];\r
-        CV_DbgAssert(func != 0);\r
-\r
-        func(src, dst);\r
-    }\r
-    void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const \r
-    { \r
-        device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);\r
-    }\r
-\r
-    void setTo(GpuMat& m, const Scalar& s, const GpuMat& mask) const\r
-    {\r
-        NppiSize sz;\r
-        sz.width  = m.cols;\r
-        sz.height = m.rows;\r
-\r
-        if (mask.empty())\r
-        {\r
-            if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)\r
-            {\r
-                cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );\r
-                return;\r
-            }\r
-\r
-            if (m.depth() == CV_8U)\r
-            {\r
-                int cn = m.channels();\r
-\r
-                if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))\r
-                {\r
-                    int val = saturate_cast<gpu::uchar>(s[0]);\r
-                    cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );\r
-                    return;\r
-                }\r
-            }\r
-\r
-            typedef void (*caller_t)(GpuMat& src, const Scalar& s);\r
-            static const caller_t callers[7][4] =\r
-            {\r
-                {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<gpu::uchar>,kernelSet<gpu::uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},\r
-                {kernelSet<gpu::schar>,kernelSet<gpu::schar>,kernelSet<gpu::schar>,kernelSet<gpu::schar>},\r
-                {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<gpu::ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},\r
-                {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},\r
-                {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},\r
-                {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},\r
-                {kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>}\r
-            };\r
-\r
-            callers[m.depth()][m.channels() - 1](m, s);\r
-        }\r
-        else\r
-        {\r
-            typedef void (*caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);\r
-\r
-            static const caller_t callers[7][4] =\r
-            {\r
-                {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<gpu::uchar>,kernelSetMask<gpu::uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},\r
-                {kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>},\r
-                {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<gpu::ushort>,kernelSetMask<gpu::ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},\r
-                {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},\r
-                {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},\r
-                {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},\r
-                {kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>}\r
-            };\r
-\r
-            callers[m.depth()][m.channels() - 1](m, s, mask);\r
-        }\r
-    }\r
-\r
-    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const\r
-    {\r
-        cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );\r
-    }\r
-\r
-    void free(void* devPtr) const\r
-    {\r
-        cudaFree(devPtr);\r
-    }\r
-};\r
-\r
-const GpuFuncTable* gpuFuncTable()\r
-{\r
-    static CudaFuncTable cuda;\r
-    return &cuda;\r
-}\r
-\r
-#endif\r
-\r
-void cv::gpu::GpuMat::upload(const Mat& m)\r
-{\r
-    CV_DbgAssert(!m.empty());\r
-\r
-    create(m.size(), m.type());\r
-\r
-    gpuFuncTable()->copy(m, *this);\r
-}\r
-\r
-void cv::gpu::GpuMat::download(cv::Mat& m) const\r
-{\r
-    CV_DbgAssert(!empty());\r
-\r
-    m.create(size(), type());\r
-\r
-    gpuFuncTable()->copy(*this, m);\r
-}\r
-\r
-void cv::gpu::GpuMat::copyTo(GpuMat& m) const\r
-{\r
-    CV_DbgAssert(!empty());\r
-\r
-    m.create(size(), type());\r
-\r
-    gpuFuncTable()->copy(*this, m);\r
-}\r
-\r
-void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const\r
-{\r
-    if (mask.empty())\r
-        copyTo(mat);\r
-    else\r
-    {\r
-        mat.create(size(), type());\r
-\r
-        gpuFuncTable()->copyWithMask(*this, mat, mask);\r
-    }\r
-}\r
-\r
-void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const\r
-{\r
-    bool noScale = fabs(alpha - 1) < numeric_limits<double>::epsilon() && fabs(beta) < numeric_limits<double>::epsilon();\r
-\r
-    if (rtype < 0)\r
-        rtype = type();\r
-    else\r
-        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());\r
-\r
-    int scn = channels();\r
-    int sdepth = depth();\r
-    int ddepth = CV_MAT_DEPTH(rtype);\r
-    if (sdepth == ddepth && noScale)\r
-    {\r
-        copyTo(dst);\r
-        return;\r
-    }\r
-\r
-    GpuMat temp;\r
-    const GpuMat* psrc = this;\r
-    if (sdepth != ddepth && psrc == &dst)\r
-    {\r
-        temp = *this;\r
-        psrc = &temp;\r
-    }\r
-\r
-    dst.create(size(), rtype);\r
-\r
-    if (noScale)\r
-        gpuFuncTable()->convert(*psrc, dst);\r
-    else\r
-        gpuFuncTable()->convert(*psrc, dst, alpha, beta);\r
-}\r
-\r
-GpuMat& cv::gpu::GpuMat::setTo(const Scalar& s, const GpuMat& mask)\r
-{\r
-    CV_Assert(mask.empty() || mask.type() == CV_8UC1);\r
-    CV_DbgAssert(!empty());\r
-\r
-    gpuFuncTable()->setTo(*this, s, mask);    \r
-\r
-    return *this;\r
-}\r
-\r
-void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)\r
-{\r
-    _type &= TYPE_MASK;\r
-\r
-    if (rows == _rows && cols == _cols && type() == _type && data)\r
-        return;\r
-\r
-    if (data)\r
-        release();\r
-\r
-    CV_DbgAssert(_rows >= 0 && _cols >= 0);\r
-\r
-    if (_rows > 0 && _cols > 0)\r
-    {\r
-        flags = Mat::MAGIC_VAL + _type;\r
-        rows = _rows;\r
-        cols = _cols;\r
-\r
-        size_t esz = elemSize();\r
-\r
-        void* devPtr;\r
-        gpuFuncTable()->mallocPitch(&devPtr, &step, esz * cols, rows);\r
-\r
-        // Single row must be continuous\r
-        if (rows == 1)\r
-            step = esz * cols;\r
-\r
-        if (esz * cols == step)\r
-            flags |= Mat::CONTINUOUS_FLAG;\r
-\r
-        int64 _nettosize = static_cast<int64>(step) * rows;\r
-        size_t nettosize = static_cast<size_t>(_nettosize);\r
-\r
-        datastart = data = static_cast<uchar*>(devPtr);\r
-        dataend = data + nettosize;\r
-\r
-        refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));\r
-        *refcount = 1;\r
-    }\r
-}\r
-\r
-void cv::gpu::GpuMat::release()\r
-{\r
-    if (refcount && CV_XADD(refcount, -1) == 1)\r
-    {\r
-        fastFree(refcount);\r
-\r
-        gpuFuncTable()->free(datastart);\r
-    }\r
-\r
-    data = datastart = dataend = 0;\r
-    step = rows = cols = 0;\r
-    refcount = 0;\r
-}\r
diff --git a/modules/gpu/src/hog.cpp b/modules/gpu/src/hog.cpp

index da00258..2167381 100644 (file)
--- a/modules/gpu/src/hog.cpp
+++ b/modules/gpu/src/hog.cpp
@@ -60,40 +60,44 @@ std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector64x128() { throw_nog
  \r
  #else\r
  \r
-namespace cv { namespace gpu { namespace hog {\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-void set_up_constants(int nbins, int block_stride_x, int block_stride_y, \r
-                      int nblocks_win_x, int nblocks_win_y);\r
-\r
-void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,\r
-                   int height, int width, const cv::gpu::DevMem2Df& grad, \r
-                   const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);\r
-\r
-void normalize_hists(int nbins, int block_stride_x, int block_stride_y, \r
-                     int height, int width, float* block_hists, float threshold);\r
-\r
-void classify_hists(int win_height, int win_width, int block_stride_y, \r
-                    int block_stride_x, int win_stride_y, int win_stride_x, int height, \r
-                    int width, float* block_hists, float* coefs, float free_coef, \r
-                    float threshold, unsigned char* labels);\r
-\r
-void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
-                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
-                            cv::gpu::DevMem2Df descriptors);\r
-void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
-                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
-                            cv::gpu::DevMem2Df descriptors);\r
-\r
-void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, \r
-                            float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);\r
-void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, \r
-                            float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);\r
-\r
-void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);\r
-void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);\r
+namespace hog \r
+{\r
+    void set_up_constants(int nbins, int block_stride_x, int block_stride_y, \r
+                          int nblocks_win_x, int nblocks_win_y);\r
+\r
+    void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,\r
+                       int height, int width, const cv::gpu::DevMem2Df& grad, \r
+                       const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);\r
+\r
+    void normalize_hists(int nbins, int block_stride_x, int block_stride_y, \r
+                         int height, int width, float* block_hists, float threshold);\r
+\r
+    void classify_hists(int win_height, int win_width, int block_stride_y, \r
+                        int block_stride_x, int win_stride_y, int win_stride_x, int height, \r
+                        int width, float* block_hists, float* coefs, float free_coef, \r
+                        float threshold, unsigned char* labels);\r
+\r
+    void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
+                                int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
+                                cv::gpu::DevMem2Df descriptors);\r
+    void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, \r
+                                int win_stride_y, int win_stride_x, int height, int width, float* block_hists, \r
+                                cv::gpu::DevMem2Df descriptors);\r
+\r
+    void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, \r
+                                float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);\r
+    void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, \r
+                                float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);\r
+\r
+    void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);\r
+    void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);\r
+}\r
  \r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
+using namespace OPENCV_DEVICE_NAMESPACE;\r
      \r
  cv::gpu::HOGDescriptor::HOGDescriptor(Size win_size, Size block_size, Size block_stride, Size cell_size, \r
                                                                           int nbins, double win_sigma, double threshold_L2hys, bool gamma_correction, int nlevels)\r
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp

index 3b47447..8973280 100644 (file)
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -107,15 +107,20 @@ void cv::gpu::CannyBuf::release() { throw_nogpu(); }
  ////////////////////////////////////////////////////////////////////////\r
  // remap\r
  \r
-namespace cv { namespace gpu {  namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace remap \r
  {\r
-    template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, \r
-                                         int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
-}}}\r
+    template <typename T> \r
+    void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, \r
+                   int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue, Stream& stream)\r
  {\r
-    using namespace cv::gpu::imgproc;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ remap;\r
  \r
      typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, \r
          int borderMode, const float* borderValue, cudaStream_t stream, int cc);\r
@@ -155,13 +160,19 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
  ////////////////////////////////////////////////////////////////////////\r
  // meanShiftFiltering_GPU\r
  \r
-namespace cv { namespace gpu {  namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      if( src.empty() )\r
          CV_Error( CV_StsBadArg, "The input image is empty" );\r
  \r
@@ -180,19 +191,25 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
          eps = 1.f;\r
      eps = (float)std::max(criteria.epsilon, 0.0);\r
  \r
-    imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));\r
+    meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));\r
  }\r
  \r
  ////////////////////////////////////////////////////////////////////////\r
  // meanShiftProc_GPU\r
  \r
-namespace cv { namespace gpu {  namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      if( src.empty() )\r
          CV_Error( CV_StsBadArg, "The input image is empty" );\r
  \r
@@ -212,26 +229,32 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
          eps = 1.f;\r
      eps = (float)std::max(criteria.epsilon, 0.0);\r
  \r
-    imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));\r
+    meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));\r
  }\r
  \r
  ////////////////////////////////////////////////////////////////////////\r
  // drawColorDisp\r
  \r
-namespace cv { namespace gpu {  namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);\r
      void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
      template <typename T>\r
      void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)\r
      {\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
          dst.create(src.size(), CV_8UC4);\r
  \r
-        imgproc::drawColorDisp_gpu((DevMem2D_<T>)src, dst, ndisp, stream);\r
+        drawColorDisp_gpu((DevMem2D_<T>)src, dst, ndisp, stream);\r
      }\r
  \r
      typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);\r
@@ -249,19 +272,26 @@ void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& s
  ////////////////////////////////////////////////////////////////////////\r
  // reprojectImageTo3D\r
  \r
-namespace cv { namespace gpu {  namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);\r
      void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
      template <typename T>\r
      void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)\r
      {\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
          xyzw.create(disp.rows, disp.cols, CV_32FC4);\r
-        imgproc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);\r
+\r
+        reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);\r
      }\r
  \r
      typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream);\r
@@ -279,10 +309,14 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q,
  ////////////////////////////////////////////////////////////////////////\r
  // resize\r
  \r
-namespace cv { namespace gpu {  namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace resize \r
  {\r
      template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)\r
  {\r
@@ -346,7 +380,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
      }\r
      else\r
      {\r
-        using namespace cv::gpu::imgproc;\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ resize;\r
  \r
          typedef void (*caller_t)(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);\r
          static const caller_t callers[6][4] = \r
@@ -366,18 +400,24 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
  ////////////////////////////////////////////////////////////////////////\r
  // copyMakeBorder\r
  \r
-namespace cv { namespace gpu {  namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace copy_make_border \r
  {\r
      template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
      template <typename T, int cn> void copyMakeBorder_caller(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)\r
      {\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ copy_make_border;\r
+\r
          Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));\r
  \r
-        imgproc::copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);\r
+        copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);\r
      }\r
  }\r
  \r
@@ -626,16 +666,22 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
  //////////////////////////////////////////////////////////////////////////////\r
  // buildWarpPlaneMaps\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
                              const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,\r
                              cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, \r
                                   float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);\r
      CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);\r
      CV_Assert((T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32F && T.isContinuous());\r
@@ -647,23 +693,29 @@ void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, cons
  \r
      map_x.create(dst_roi.size(), CV_32F);\r
      map_y.create(dst_roi.size(), CV_32F);\r
-    imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), \r
-                                T.ptr<float>(), scale, StreamAccessor::getStream(stream));\r
+    buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), \r
+                       T.ptr<float>(), scale, StreamAccessor::getStream(stream));\r
  }\r
  \r
  //////////////////////////////////////////////////////////////////////////////\r
  // buildWarpCylyndricalMaps\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
                                    const float k_rinv[9], const float r_kinv[9], float scale,\r
                                    cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,\r
                                         GpuMat& map_x, GpuMat& map_y, Stream& stream)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);\r
      CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);\r
  \r
@@ -674,24 +726,29 @@ void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K
  \r
      map_x.create(dst_roi.size(), CV_32F);\r
      map_y.create(dst_roi.size(), CV_32F);\r
-    imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),\r
-                                      scale, StreamAccessor::getStream(stream));\r
+    buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));\r
  }\r
  \r
  \r
  //////////////////////////////////////////////////////////////////////////////\r
  // buildWarpSphericalMaps\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,\r
                                  const float k_rinv[9], const float r_kinv[9], float scale,\r
                                  cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,\r
                                       GpuMat& map_x, GpuMat& map_y, Stream& stream)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);\r
      CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);\r
  \r
@@ -702,8 +759,7 @@ void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K,
  \r
      map_x.create(dst_roi.size(), CV_32F);\r
      map_y.create(dst_roi.size(), CV_32F);\r
-    imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),\r
-                                    scale, StreamAccessor::getStream(stream));\r
+    buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));\r
  }\r
  \r
  ////////////////////////////////////////////////////////////////////////\r
@@ -843,17 +899,24 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
  //////////////////////////////////////////////////////////////////////////////\r
  // columnSum\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc\r
  {\r
      void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      CV_Assert(src.type() == CV_32F);\r
  \r
      dst.create(src.size(), CV_32F);\r
-    imgproc::columnSum_32F(src, dst);\r
+\r
+    columnSum_32F(src, dst);\r
  }\r
  \r
  void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)\r
@@ -1140,7 +1203,6 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, S
      histRange(src, hist, levels, buf, stream);\r
  }\r
  \r
-\r
  void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream)\r
  {\r
      CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1);\r
@@ -1183,13 +1245,19 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
      hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));\r
  }\r
  \r
-namespace cv { namespace gpu { namespace histograms\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace hist\r
  {\r
      void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);\r
  \r
      const int PARTIAL_HISTOGRAM256_COUNT = 240;\r
      const int HISTOGRAM256_BIN_COUNT     = 256;\r
-}}}\r
+\r
+    void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)\r
  {\r
@@ -1199,7 +1267,7 @@ void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
  \r
  void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)\r
  {\r
-    using namespace cv::gpu::histograms;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ hist;\r
  \r
      CV_Assert(src.type() == CV_8UC1);\r
  \r
@@ -1223,14 +1291,9 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream&
      equalizeHist(src, dst, hist, buf, stream);\r
  }\r
  \r
-namespace cv { namespace gpu { namespace histograms\r
-{\r
-    void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);\r
-}}}\r
-\r
  void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)\r
  {\r
-    using namespace cv::gpu::histograms;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ hist;\r
  \r
      CV_Assert(src.type() == CV_8UC1);\r
  \r
@@ -1264,13 +1327,16 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
  ////////////////////////////////////////////////////////////////////////\r
  // cornerHarris & minEgenVal\r
  \r
-namespace cv { namespace gpu { namespace imgproc {\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
+namespace imgproc \r
+{\r
      void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);\r
      void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);\r
      void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);\r
+}\r
  \r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace \r
  {\r
@@ -1316,7 +1382,6 @@ namespace
  \r
  } // Anonymous namespace\r
  \r
-\r
  bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)\r
  {\r
      switch (cpuBorderType)\r
@@ -1356,6 +1421,8 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D
  \r
  void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      CV_Assert(borderType == cv::BORDER_REFLECT101 ||\r
                borderType == cv::BORDER_REPLICATE);\r
  \r
@@ -1364,7 +1431,7 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D
  \r
      extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);\r
      dst.create(src.size(), CV_32F);\r
-    imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));\r
+    cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));\r
  }\r
  \r
  void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)\r
@@ -1381,6 +1448,8 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
  \r
  void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)\r
  {  \r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      CV_Assert(borderType == cv::BORDER_REFLECT101 ||\r
                borderType == cv::BORDER_REPLICATE);\r
  \r
@@ -1389,24 +1458,30 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
  \r
      extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);    \r
      dst.create(src.size(), CV_32F);\r
-    imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));\r
+    cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));\r
  }\r
  \r
  //////////////////////////////////////////////////////////////////////////////\r
  // mulSpectrums\r
  \r
-namespace cv { namespace gpu { namespace imgproc \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
  \r
      void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
-}}}\r
+}\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) \r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);\r
-    static Caller callers[] = { imgproc::mulSpectrums, imgproc::mulSpectrums_CONJ };\r
+\r
+    static Caller callers[] = { mulSpectrums, mulSpectrums_CONJ };\r
  \r
      CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);\r
      CV_Assert(a.size() == b.size());\r
@@ -1420,18 +1495,23 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flag
  //////////////////////////////////////////////////////////////////////////////\r
  // mulAndScaleSpectrums\r
  \r
-namespace cv { namespace gpu { namespace imgproc \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc \r
  {\r
      void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
  \r
      void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);\r
-}}}\r
+}\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) \r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
      typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);\r
-    static Caller callers[] = { imgproc::mulAndScaleSpectrums, imgproc::mulAndScaleSpectrums_CONJ };\r
+    static Caller callers[] = { mulAndScaleSpectrums, mulAndScaleSpectrums_CONJ };\r
  \r
      CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);\r
      CV_Assert(a.size() == b.size());\r
@@ -1593,13 +1673,19 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
      convolve(image, templ, result, ccorr, buf);\r
  }\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace imgproc\r
  {\r
      void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)\r
  {\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;\r
+\r
  #ifndef HAVE_CUFFT\r
  \r
      CV_Assert(image.type() == CV_32F);\r
@@ -1622,7 +1708,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
              templ.copyTo(contKernel);\r
      }\r
  \r
-    imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));\r
+    convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));\r
  \r
  #else\r
  \r
@@ -1650,7 +1736,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
                  templ.copyTo(contKernel);\r
          }\r
  \r
-        imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));\r
+        convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));\r
      }\r
      else\r
      {\r
@@ -1725,14 +1811,18 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
  //////////////////////////////////////////////////////////////////////////////\r
  // pyrDown\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace pyr_down \r
  {\r
      template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)\r
  {\r
-    using namespace cv::gpu::imgproc;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ pyr_down;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
  \r
@@ -1761,14 +1851,18 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& st
  //////////////////////////////////////////////////////////////////////////////\r
  // pyrUp\r
  \r
-namespace cv { namespace gpu { namespace imgproc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace pyr_up \r
  {\r
      template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)\r
  {\r
-    using namespace cv::gpu::imgproc;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ pyr_up;\r
  \r
      typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);\r
  \r
@@ -1839,8 +1933,10 @@ void cv::gpu::CannyBuf::release()
      trackBuf2.release();\r
  }\r
  \r
-namespace cv { namespace gpu { namespace canny\r
-{    \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace canny \r
+{\r
      void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);\r
  \r
      void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);\r
@@ -1853,13 +1949,15 @@ namespace cv { namespace gpu { namespace canny
      void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);\r
  \r
      void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  namespace\r
  {\r
      void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)\r
      {\r
-        using namespace cv::gpu::canny;\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ canny;\r
  \r
          calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);\r
          \r
@@ -1879,7 +1977,7 @@ void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double hi
  \r
  void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)\r
  {\r
-    using namespace cv::gpu::canny;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ canny;\r
  \r
      CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));\r
      CV_Assert(src.type() == CV_8UC1);\r
@@ -1918,7 +2016,7 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_
  \r
  void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)\r
  {\r
-    using namespace cv::gpu::canny;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ canny;\r
  \r
      CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));\r
      CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());\r
diff --git a/modules/gpu/src/initialization.cpp b/modules/gpu/src/initialization.cpp

index b13c173..e93c899 100644 (file)
--- a/modules/gpu/src/initialization.cpp
+++ b/modules/gpu/src/initialization.cpp
@@ -271,5 +271,380 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)
          setDevice(prev_device_id);\r
  }\r
  \r
+////////////////////////////////////////////////////////////////////\r
+// GpuFuncTable\r
+\r
+// Forward declarations of the device-side helpers that the host code further\r
+// down in this file calls. Only the prototypes are visible here; the\r
+// definitions are expected to live in another translation unit (presumably the\r
+// CUDA sources of the module - TODO confirm).\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+// Masked copy of src into dst; stream defaults to 0.\r
+void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);\r
+\r
+// Fills mat from the array pointed to by scalar (unmasked overload).\r
+template <typename T>\r
+void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);\r
+// Same as above, with an additional mask argument.\r
+template <typename T>\r
+void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);\r
+\r
+// Depth conversion from sdepth to ddepth with the alpha / beta parameters; stream defaults to 0.\r
+void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace\r
+{\r
+    //////////////////////////////////////////////////////////////////////////\r
+    // Convert\r
+\r
+    // Maps an OpenCV depth constant to the matching NPP element type.\r
+    // The primary template is only declared, so a depth without a specialization\r
+    // below fails at compile time.\r
+    template<int n> struct NPPTypeTraits;\r
+    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };\r
+    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };\r
+    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };\r
+    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };\r
+    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };\r
+\r
+    // Names the function-pointer type of an NPP conversion routine from depth\r
+    // SDEPTH to depth DDEPTH: (source pointer, source step, destination pointer,\r
+    // destination step, ROI size).\r
+    template<int SDEPTH, int DDEPTH> struct NppConvertFunc\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;\r
+\r
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);\r
+    };\r
+    // Specialization for a CV_32F source: the routine takes one extra trailing\r
+    // NppRoundMode argument.\r
+    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>\r
+    {\r
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;\r
+\r
+        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);\r
+    };\r
+\r
+    // Adapter that exposes the NPP conversion routine 'func' through the uniform\r
+    // signature void(const GpuMat&, GpuMat&) used by the conversion dispatch table.\r
+    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;\r
+\r
+        static void cvt(const GpuMat& src, GpuMat& dst)\r
+        {\r
+            // The region of interest is the whole source matrix.\r
+            NppiSize roi;\r
+            roi.width = src.cols;\r
+            roi.height = src.rows;\r
+\r
+            const int srcStep = static_cast<int>(src.step);\r
+            const int dstStep = static_cast<int>(dst.step);\r
+\r
+            nppSafeCall( func(src.ptr<src_t>(), srcStep, dst.ptr<dst_t>(), dstStep, roi) );\r
+\r
+            // Wait for the device before returning to the caller.\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+        }\r
+    };\r
+    // CV_32F source: same adapter, but the NPP routine additionally receives a\r
+    // rounding mode, which is always NPP_RND_NEAR here.\r
+    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>\r
+    {\r
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;\r
+\r
+        static void cvt(const GpuMat& src, GpuMat& dst)\r
+        {\r
+            NppiSize roi;\r
+            roi.width = src.cols;\r
+            roi.height = src.rows;\r
+\r
+            const int srcStep = static_cast<int>(src.step);\r
+            const int dstStep = static_cast<int>(dst.step);\r
+\r
+            nppSafeCall( func(src.ptr<Npp32f>(), srcStep, dst.ptr<dst_t>(), dstStep, roi, NPP_RND_NEAR) );\r
+\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+        }\r
+    };\r
+\r
+    // Generic conversion path: hands both matrices, passed through reshape(1),\r
+    // to the device routine convert_gpu with alpha = 1.0 and beta = 0.0.\r
+    void convertToKernelCaller(const GpuMat& src, GpuMat& dst)\r
+    {\r
+        GpuMat srcFlat = src.reshape(1);\r
+        GpuMat dstFlat = dst.reshape(1);\r
+\r
+        OPENCV_DEVICE_NAMESPACE_ convert_gpu(srcFlat, src.depth(), dstFlat, dst.depth(), 1.0, 0.0);\r
+    }\r
+\r
+    //////////////////////////////////////////////////////////////////////////\r
+    // Set\r
+    \r
+    // Names the function-pointer type of an unmasked NPP fill routine for depth\r
+    // SDEPTH and SCN channels; the fill value is passed as an array.\r
+    template<int SDEPTH, int SCN> struct NppSetFunc\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+\r
+        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);\r
+    };\r
+    // Single-channel specialization: the fill value is passed by value instead\r
+    // of as an array.\r
+    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+\r
+        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);\r
+    };\r
+\r
+    // Adapter that exposes the multi-channel NPP fill routine 'func' through the\r
+    // signature void(GpuMat&, Scalar). The scalar is converted to the NPP element\r
+    // type and handed over as an array of channel values.\r
+    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+\r
+        static void set(GpuMat& src, Scalar s)\r
+        {\r
+            Scalar_<src_t> converted = s;\r
+\r
+            // The fill covers the whole matrix.\r
+            NppiSize roi;\r
+            roi.width = src.cols;\r
+            roi.height = src.rows;\r
+\r
+            const int step = static_cast<int>(src.step);\r
+\r
+            nppSafeCall( func(converted.val, src.ptr<src_t>(), step, roi) );\r
+\r
+            // Wait for the device before returning to the caller.\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+        }\r
+    };\r
+    // Single-channel variant: only the first component of the scalar is used and\r
+    // it is passed by value.\r
+    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+\r
+        static void set(GpuMat& src, Scalar s)\r
+        {\r
+            Scalar_<src_t> converted = s;\r
+\r
+            NppiSize roi;\r
+            roi.width = src.cols;\r
+            roi.height = src.rows;\r
+\r
+            const int step = static_cast<int>(src.step);\r
+\r
+            nppSafeCall( func(converted[0], src.ptr<src_t>(), step, roi) );\r
+\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+        }\r
+    };\r
+\r
+    // Generic unmasked fill: converts the scalar to element type T and forwards\r
+    // it, together with the channel count and stream 0, to set_to_gpu.\r
+    template <typename T>\r
+    void kernelSet(GpuMat& src, Scalar s)\r
+    {\r
+        Scalar_<T> typedValue = s;\r
+        const int cn = src.channels();\r
+\r
+        OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, typedValue.val, cn, 0);\r
+    }\r
+\r
+    // Names the function-pointer type of a masked NPP fill routine for depth\r
+    // SDEPTH and SCN channels; compared with NppSetFunc it takes an additional\r
+    // 8-bit mask pointer and mask step.\r
+    template<int SDEPTH, int SCN> struct NppSetMaskFunc\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+\r
+        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);\r
+    };\r
+    // Single-channel specialization: the fill value is passed by value instead\r
+    // of as an array.\r
+    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+\r
+        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);\r
+    };\r
+\r
+    // Adapter that exposes the masked multi-channel NPP fill routine 'func'\r
+    // through the signature void(GpuMat&, Scalar, const GpuMat&). The scalar is\r
+    // converted to the NPP element type and handed over as an array.\r
+    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+\r
+        static void set(GpuMat& src, Scalar s, const GpuMat& mask)\r
+        {\r
+            Scalar_<src_t> converted = s;\r
+\r
+            // The fill covers the whole matrix.\r
+            NppiSize roi;\r
+            roi.width = src.cols;\r
+            roi.height = src.rows;\r
+\r
+            const int step = static_cast<int>(src.step);\r
+            const int maskStep = static_cast<int>(mask.step);\r
+\r
+            nppSafeCall( func(converted.val, src.ptr<src_t>(), step, roi, mask.ptr<Npp8u>(), maskStep) );\r
+\r
+            // Wait for the device before returning to the caller.\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+        }\r
+    };\r
+    // Single-channel variant: only the first component of the scalar is used and\r
+    // it is passed by value.\r
+    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>\r
+    {\r
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;\r
+\r
+        static void set(GpuMat& src, Scalar s, const GpuMat& mask)\r
+        {\r
+            Scalar_<src_t> converted = s;\r
+\r
+            NppiSize roi;\r
+            roi.width = src.cols;\r
+            roi.height = src.rows;\r
+\r
+            const int step = static_cast<int>(src.step);\r
+            const int maskStep = static_cast<int>(mask.step);\r
+\r
+            nppSafeCall( func(converted[0], src.ptr<src_t>(), step, roi, mask.ptr<Npp8u>(), maskStep) );\r
+\r
+            cudaSafeCall( cudaDeviceSynchronize() );\r
+        }\r
+    };\r
+\r
+    // Generic masked fill: converts the scalar to element type T and forwards it,\r
+    // together with the mask, the channel count and stream 0, to set_to_gpu.\r
+    template <typename T>\r
+    void kernelSetMask(GpuMat& src, Scalar s, const GpuMat& mask)\r
+    {\r
+        Scalar_<T> typedValue = s;\r
+        const int cn = src.channels();\r
+\r
+        OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, typedValue.val, mask, cn, 0);\r
+    }\r
+\r
+    class CudaFuncTable : public GpuFuncTable\r
+    {\r
+    public:\r
+        // Host -> device copy of src.rows rows, each src.cols * src.elemSize() bytes wide.\r
+        void copy(const Mat& src, GpuMat& dst) const\r
+        {\r
+            const size_t rowBytes = src.cols * src.elemSize();\r
+            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, rowBytes, src.rows, cudaMemcpyHostToDevice) );\r
+        }\r
+        // Device -> host copy with the same geometry, taken from src.\r
+        void copy(const GpuMat& src, Mat& dst) const\r
+        {\r
+            const size_t rowBytes = src.cols * src.elemSize();\r
+            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, rowBytes, src.rows, cudaMemcpyDeviceToHost) );\r
+        }\r
+        // Device -> device copy with the same geometry, taken from src.\r
+        void copy(const GpuMat& src, GpuMat& dst) const\r
+        {\r
+            const size_t rowBytes = src.cols * src.elemSize();\r
+            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, rowBytes, src.rows, cudaMemcpyDeviceToDevice) );\r
+        }\r
+\r
+        // Masked copy: forwards src, dst and mask, plus the depth and channel count\r
+        // of src, to the device routine copy_to_with_mask (default stream).\r
+        void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const\r
+        {\r
+            const int depth = src.depth();\r
+            const int cn = src.channels();\r
+\r
+            OPENCV_DEVICE_NAMESPACE_ copy_to_with_mask(src, dst, depth, mask, cn);\r
+        }\r
+\r
+        // Converts src to dst without scaling. The implementation is looked up in\r
+        // a table indexed as callers[src depth][dst depth][channel count - 1];\r
+        // each entry is either an NPP adapter (NppCvt<...>::cvt) or the generic\r
+        // convertToKernelCaller.\r
+        // NOTE(review): same-depth rows hold null pointers and are guarded only by\r
+        // CV_DbgAssert, which is expected to be a debug-only check - presumably\r
+        // callers never request a same-depth conversion here; confirm.\r
+        // NOTE(review): the innermost extent is declared as 7 although every row\r
+        // lists only 4 entries; the remaining slots are zero-initialized.\r
+        void convert(const GpuMat& src, GpuMat& dst) const \r
+        { \r
+            typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);\r
+            static const caller_t callers[7][7][7] =\r
+            {\r
+                {                \r
+                    /*  8U ->  8U */ {0, 0, 0, 0},\r
+                    /*  8U ->  8S */ {convertToKernelCaller, convertToKernelCaller, convertToKernelCaller, convertToKernelCaller},\r
+                    /*  8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},\r
+                    /*  8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},\r
+                    /*  8U -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /*  8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /*  8U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
+                },\r
+                {\r
+                    /*  8S ->  8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /*  8S ->  8S */ {0,0,0,0},\r
+                    /*  8S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /*  8S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /*  8S -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /*  8S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /*  8S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
+                },\r
+                {\r
+                    /* 16U ->  8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},\r
+                    /* 16U ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 16U -> 16U */ {0,0,0,0},\r
+                    /* 16U -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 16U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
+                },\r
+                {\r
+                    /* 16S ->  8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},\r
+                    /* 16S ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 16S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 16S -> 16S */ {0,0,0,0},\r
+                    /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 16S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
+                },\r
+                {\r
+                    /* 32S ->  8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32S ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32S -> 32S */ {0,0,0,0},\r
+                    /* 32S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
+                },\r
+                {\r
+                    /* 32F ->  8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32F ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 32F -> 32F */ {0,0,0,0},\r
+                    /* 32F -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}\r
+                },\r
+                {\r
+                    /* 64F ->  8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 64F ->  8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 64F -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 64F -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 64F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 64F -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},\r
+                    /* 64F -> 64F */ {0,0,0,0}\r
+                }\r
+            };\r
+\r
+            // Select the implementation for this (source depth, destination depth, channels) triple.\r
+            caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];\r
+            CV_DbgAssert(func != 0);\r
+\r
+            func(src, dst);\r
+        }\r
+\r
+        // Converts src to dst with scaling: alpha and beta are forwarded unchanged\r
+        // to the device routine convert_gpu (default stream), and both matrices are\r
+        // passed through reshape(1) first.\r
+        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const\r
+        {\r
+            // Qualify the call with the same namespace macro that\r
+            // convertToKernelCaller uses, rather than a hard-coded 'device::',\r
+            // so both call sites of convert_gpu resolve the same way.\r
+            OPENCV_DEVICE_NAMESPACE_ convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);\r
+        }\r
+\r
+        // Fills m with the scalar s. An empty mask selects the unmasked path, a\r
+        // non-empty mask the masked one.\r
+        //\r
+        // Unmasked fast paths, both implemented with cudaMemset2D:\r
+        //   - all four components of s are zero;\r
+        //   - m is CV_8U and every channel in use gets the same value.\r
+        // Everything else is dispatched through a table indexed as\r
+        // callers[depth][channel count - 1], whose entries are either NPP adapters\r
+        // or the generic kernelSet / kernelSetMask.\r
+        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const\r
+        {\r
+            if (mask.empty())\r
+            {\r
+                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)\r
+                {\r
+                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );\r
+                    return;\r
+                }\r
+\r
+                if (m.depth() == CV_8U)\r
+                {\r
+                    int cn = m.channels();\r
+\r
+                    // A byte-wise memset is only valid when all used channels share one value.\r
+                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))\r
+                    {\r
+                        int val = saturate_cast<uchar>(s[0]);\r
+                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );\r
+                        return;\r
+                    }\r
+                }\r
+\r
+                typedef void (*caller_t)(GpuMat& src, Scalar s);\r
+                static const caller_t callers[7][4] =\r
+                {\r
+                    {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},\r
+                    {kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},\r
+                    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},\r
+                    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},\r
+                    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},\r
+                    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},\r
+                    {kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>}\r
+                };\r
+\r
+                callers[m.depth()][m.channels() - 1](m, s);\r
+            }\r
+            else\r
+            {\r
+                typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask);\r
+\r
+                static const caller_t callers[7][4] =\r
+                {\r
+                    {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},\r
+                    {kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},\r
+                    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},\r
+                    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},\r
+                    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},\r
+                    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},\r
+                    {kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>}\r
+                };\r
+\r
+                callers[m.depth()][m.channels() - 1](m, s, mask);\r
+            }\r
+        }\r
+\r
+        // Forwards to cudaMallocPitch; the resulting pointer and pitch are written\r
+        // to *devPtr and *step, and the call is wrapped in cudaSafeCall.\r
+        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const\r
+        {\r
+            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );\r
+        }\r
+\r
+        // Forwards to cudaFree.\r
+        // NOTE(review): unlike the other CUDA calls in this class, the returned\r
+        // status is not passed through cudaSafeCall. Presumably deliberate so that\r
+        // releasing memory never raises an error - confirm.\r
+        void free(void* devPtr) const\r
+        {\r
+            cudaFree(devPtr);\r
+        }\r
+    };\r
+\r
+    // Registers the CUDA-backed function table via setGpuFuncTable. The\r
+    // registration happens in the constructor, and the file-scope object 'init'\r
+    // below triggers it when this translation unit's statics are initialized.\r
+    class Initializer\r
+    {\r
+    public:\r
+        Initializer()\r
+        {\r
+            // Function-local static: the table object outlives the constructor call,\r
+            // so the registered pointer stays valid afterwards.\r
+            static CudaFuncTable funcTable;\r
+            setGpuFuncTable(&funcTable);\r
+        }\r
+    };\r
+\r
+    // Sole instance; exists only for the side effect of its constructor.\r
+    Initializer init;\r
+}\r
+\r
  #endif\r
  \r
diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp

index e74d0fd..4e0b4bf 100644 (file)
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@@ -44,6 +44,7 @@
  \r
  using namespace cv;\r
  using namespace cv::gpu;\r
+using namespace std;\r
  \r
  #if !defined (HAVE_CUDA)\r
  \r
@@ -51,8 +52,10 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&)
  \r
  #else\r
  \r
-namespace cv { namespace gpu { namespace imgproc \r
-{  \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace match_template \r
+{\r
      void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
      void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);\r
  \r
@@ -132,8 +135,11 @@ namespace cv { namespace gpu { namespace imgproc
                        unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);\r
  \r
      void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
+using namespace OPENCV_DEVICE_NAMESPACE_ match_template;\r
  \r
  namespace \r
  {\r
@@ -177,14 +183,14 @@ namespace
          result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);\r
          if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F))\r
          {\r
-            imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
+            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
              return;\r
          }\r
  \r
          GpuMat result_;\r
          ConvolveBuf buf;\r
          convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream);\r
-        imgproc::extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));\r
+        extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));\r
      }\r
  \r
  \r
@@ -193,7 +199,7 @@ namespace
          if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U))\r
          {\r
              result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);\r
-            imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
+            matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
              return;\r
          }\r
  \r
@@ -220,15 +226,14 @@ namespace
          sqrIntegral(image.reshape(1), img_sqsum, stream);\r
  \r
          unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];\r
-        imgproc::normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, \r
-                              result, image.channels(), StreamAccessor::getStream(stream));\r
+        normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));\r
      }\r
  \r
      \r
      void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)\r
      {\r
          result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);\r
-        imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
+        matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
      }\r
  \r
  \r
@@ -237,7 +242,7 @@ namespace
          if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U))\r
          {\r
              result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);\r
-            imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
+            matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));\r
              return;\r
          }\r
  \r
@@ -247,8 +252,7 @@ namespace
          unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];\r
  \r
          matchTemplate_CCORR_8U(image, templ, result, stream);\r
-        imgproc::matchTemplatePrepared_SQDIFF_8U(\r
-                templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));\r
+        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));\r
      }\r
  \r
  \r
@@ -260,8 +264,7 @@ namespace
          unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];\r
  \r
          matchTemplate_CCORR_8U(image, templ, result, stream);\r
-        imgproc::matchTemplatePrepared_SQDIFF_NORMED_8U(\r
-                templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));\r
+        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));\r
      }\r
  \r
  \r
@@ -275,13 +278,12 @@ namespace
              integral(image, image_sum, stream);\r
  \r
              unsigned int templ_sum = (unsigned int)sum(templ)[0];\r
-            imgproc::matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, \r
-                                                    image_sum, templ_sum, result, StreamAccessor::getStream(stream));\r
+            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sum, templ_sum, result, StreamAccessor::getStream(stream));\r
          }\r
          else\r
          {\r
-            std::vector<GpuMat> images;\r
-            std::vector<GpuMat> image_sums(image.channels());\r
+            vector<GpuMat> images;\r
+            vector<GpuMat> image_sums(image.channels());\r
  \r
              split(image, images);\r
              for (int i = 0; i < image.channels(); ++i)\r
@@ -292,19 +294,19 @@ namespace
              switch (image.channels())\r
              {\r
              case 2:\r
-                imgproc::matchTemplatePrepared_CCOFF_8UC2(\r
+                matchTemplatePrepared_CCOFF_8UC2(\r
                          templ.cols, templ.rows, image_sums[0], image_sums[1],\r
                          (unsigned int)templ_sum[0], (unsigned int)templ_sum[1],\r
                          result, StreamAccessor::getStream(stream));\r
                  break;\r
              case 3:\r
-                imgproc::matchTemplatePrepared_CCOFF_8UC3(\r
+                matchTemplatePrepared_CCOFF_8UC3(\r
                          templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2],\r
                          (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],\r
                          result, StreamAccessor::getStream(stream));\r
                  break;\r
              case 4:\r
-                imgproc::matchTemplatePrepared_CCOFF_8UC4(\r
+                matchTemplatePrepared_CCOFF_8UC4(\r
                          templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3],\r
                          (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],\r
                          (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));\r
@@ -341,15 +343,15 @@ namespace
              unsigned int templ_sum = (unsigned int)sum(templ)[0];\r
              unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0];\r
  \r
-            imgproc::matchTemplatePrepared_CCOFF_NORMED_8U(\r
+            matchTemplatePrepared_CCOFF_NORMED_8U(\r
                      templ.cols, templ.rows, image_sum, image_sqsum, \r
                      templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));\r
          }\r
          else\r
          {\r
-            std::vector<GpuMat> images;\r
-            std::vector<GpuMat> image_sums(image.channels());\r
-            std::vector<GpuMat> image_sqsums(image.channels());\r
+            vector<GpuMat> images;\r
+            vector<GpuMat> image_sums(image.channels());\r
+            vector<GpuMat> image_sqsums(image.channels());\r
  \r
              split(image, images);\r
              for (int i = 0; i < image.channels(); ++i)\r
@@ -364,7 +366,7 @@ namespace
              switch (image.channels())\r
              {\r
              case 2:\r
-                imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
+                matchTemplatePrepared_CCOFF_NORMED_8UC2(\r
                          templ.cols, templ.rows, \r
                          image_sums[0], image_sqsums[0],\r
                          image_sums[1], image_sqsums[1],\r
@@ -373,7 +375,7 @@ namespace
                          result, StreamAccessor::getStream(stream));\r
                  break;\r
              case 3:\r
-                imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
+                matchTemplatePrepared_CCOFF_NORMED_8UC3(\r
                          templ.cols, templ.rows, \r
                          image_sums[0], image_sqsums[0],\r
                          image_sums[1], image_sqsums[1],\r
@@ -384,7 +386,7 @@ namespace
                          result, StreamAccessor::getStream(stream));\r
                  break;\r
              case 4:\r
-                imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
+                matchTemplatePrepared_CCOFF_NORMED_8UC4(\r
                          templ.cols, templ.rows, \r
                          image_sums[0], image_sqsums[0],\r
                          image_sums[1], image_sqsums[1],\r
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp

index 874e6e5..9dd832c 100644 (file)
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -45,6 +45,139 @@
  using namespace cv;\r
  using namespace cv::gpu;\r
  \r
+cv::gpu::CudaMem::CudaMem() \r
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) \r
+{\r
+}\r
+\r
+cv::gpu::CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type) \r
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)\r
+{\r
+    if( _rows > 0 && _cols > 0 )\r
+        create( _rows, _cols, _type, _alloc_type);\r
+}\r
+\r
+cv::gpu::CudaMem::CudaMem(Size _size, int _type, int _alloc_type) \r
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)\r
+{\r
+    if( _size.height > 0 && _size.width > 0 )\r
+        create( _size.height, _size.width, _type, _alloc_type);\r
+}\r
+\r
+cv::gpu::CudaMem::CudaMem(const CudaMem& m) \r
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)\r
+{\r
+    if( refcount )\r
+        CV_XADD(refcount, 1);\r
+}\r
+\r
+cv::gpu::CudaMem::CudaMem(const Mat& m, int _alloc_type) \r
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)\r
+{\r
+    if( m.rows > 0 && m.cols > 0 )\r
+        create( m.size(), m.type(), _alloc_type);\r
+\r
+    Mat tmp = createMatHeader();\r
+    m.copyTo(tmp);\r
+}\r
+\r
+cv::gpu::CudaMem::~CudaMem()\r
+{\r
+    release();\r
+}\r
+\r
+CudaMem& cv::gpu::CudaMem::operator = (const CudaMem& m)\r
+{\r
+    if( this != &m )\r
+    {\r
+        if( m.refcount )\r
+            CV_XADD(m.refcount, 1);\r
+        release();\r
+        flags = m.flags;\r
+        rows = m.rows; cols = m.cols;\r
+        step = m.step; data = m.data;\r
+        datastart = m.datastart;\r
+        dataend = m.dataend;\r
+        refcount = m.refcount;\r
+        alloc_type = m.alloc_type;\r
+    }\r
+    return *this;\r
+}\r
+\r
+CudaMem cv::gpu::CudaMem::clone() const\r
+{\r
+    CudaMem m(size(), type(), alloc_type);\r
+    Mat to = m;\r
+    Mat from = *this;\r
+    from.copyTo(to);\r
+    return m;\r
+}\r
+\r
+void cv::gpu::CudaMem::create(Size _size, int _type, int _alloc_type) \r
+{ \r
+    create(_size.height, _size.width, _type, _alloc_type); \r
+}\r
+\r
+Mat cv::gpu::CudaMem::createMatHeader() const \r
+{ \r
+    return Mat(size(), type(), data, step); \r
+}\r
+\r
+cv::gpu::CudaMem::operator Mat() const \r
+{ \r
+    return createMatHeader(); \r
+}\r
+\r
+cv::gpu::CudaMem::operator GpuMat() const \r
+{ \r
+    return createGpuMatHeader(); \r
+}\r
+\r
+bool cv::gpu::CudaMem::isContinuous() const \r
+{ \r
+    return (flags & Mat::CONTINUOUS_FLAG) != 0; \r
+}\r
+\r
+size_t cv::gpu::CudaMem::elemSize() const \r
+{ \r
+    return CV_ELEM_SIZE(flags); \r
+}\r
+\r
+size_t cv::gpu::CudaMem::elemSize1() const \r
+{ \r
+    return CV_ELEM_SIZE1(flags); \r
+}\r
+\r
+int cv::gpu::CudaMem::type() const \r
+{ \r
+    return CV_MAT_TYPE(flags); \r
+}\r
+\r
+int cv::gpu::CudaMem::depth() const \r
+{ \r
+    return CV_MAT_DEPTH(flags); \r
+}\r
+\r
+int cv::gpu::CudaMem::channels() const \r
+{ \r
+    return CV_MAT_CN(flags); \r
+}\r
+\r
+size_t cv::gpu::CudaMem::step1() const \r
+{ \r
+    return step/elemSize1(); \r
+}\r
+\r
+Size cv::gpu::CudaMem::size() const \r
+{ \r
+    return Size(cols, rows); \r
+}\r
+\r
+bool cv::gpu::CudaMem::empty() const \r
+{ \r
+    return data == 0; \r
+}\r
+\r
  #if !defined (HAVE_CUDA)\r
  \r
  void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }\r
diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp

index 60c3ed9..3450bd8 100644 (file)
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -190,32 +190,35 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
  ////////////////////////////////////////////////////////////////////////\r
  // Sum\r
  \r
-namespace cv { namespace gpu { namespace mathfunc\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace matrix_reductions \r
  {\r
-    template <typename T>\r
-    void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+    namespace sum\r
+    {\r
+        template <typename T>\r
+        void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
  \r
-    template <typename T>\r
-    void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+        template <typename T>\r
+        void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
  \r
-    template <typename T>\r
-    void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+        template <typename T>\r
+        void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
  \r
-    template <typename T>\r
-    void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+        template <typename T>\r
+        void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
  \r
-    template <typename T>\r
-    void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+        template <typename T>\r
+        void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
  \r
-    template <typename T>\r
-    void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
+        template <typename T>\r
+        void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);\r
  \r
-    namespace sums\r
-    {\r
          void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);\r
      }\r
-}}}\r
+}\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  Scalar cv::gpu::sum(const GpuMat& src) \r
  {\r
@@ -226,23 +229,25 @@ Scalar cv::gpu::sum(const GpuMat& src)
  \r
  Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) \r
  {\r
-    using namespace mathfunc;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;\r
  \r
      typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);\r
  \r
-    static Caller multipass_callers[7] = { \r
-            sumMultipassCaller<unsigned char>, sumMultipassCaller<char>, \r
-            sumMultipassCaller<unsigned short>, sumMultipassCaller<short>, \r
-            sumMultipassCaller<int>, sumMultipassCaller<float>, 0 };\r
+    static Caller multipass_callers[7] = \r
+    { \r
+        sumMultipassCaller<unsigned char>, sumMultipassCaller<char>, \r
+        sumMultipassCaller<unsigned short>, sumMultipassCaller<short>, \r
+        sumMultipassCaller<int>, sumMultipassCaller<float>, 0 \r
+    };\r
  \r
      static Caller singlepass_callers[7] = { \r
-            sumCaller<unsigned char>, sumCaller<char>, \r
-            sumCaller<unsigned short>, sumCaller<short>, \r
-            sumCaller<int>, sumCaller<float>, 0 };\r
+        sumCaller<unsigned char>, sumCaller<char>, \r
+        sumCaller<unsigned short>, sumCaller<short>, \r
+        sumCaller<int>, sumCaller<float>, 0 \r
+    };\r
  \r
      Size buf_size;\r
-    sums::getBufSizeRequired(src.cols, src.rows, src.channels(), \r
-                             buf_size.width, buf_size.height); \r
+    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height); \r
      ensureSizeIsEnough(buf_size, CV_8U, buf);\r
  \r
      Caller* callers = multipass_callers;\r
@@ -267,23 +272,26 @@ Scalar cv::gpu::absSum(const GpuMat& src)
  \r
  Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf) \r
  {\r
-    using namespace mathfunc;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;\r
  \r
      typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);\r
  \r
-    static Caller multipass_callers[7] = { \r
-            absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>, \r
-            absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>, \r
-            absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0 };\r
+    static Caller multipass_callers[7] = \r
+    { \r
+        absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>, \r
+        absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>, \r
+        absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0 \r
+    };\r
  \r
-    static Caller singlepass_callers[7] = { \r
-            absSumCaller<unsigned char>, absSumCaller<char>, \r
-            absSumCaller<unsigned short>, absSumCaller<short>, \r
-            absSumCaller<int>, absSumCaller<float>, 0 };\r
+    static Caller singlepass_callers[7] = \r
+    {        \r
+        absSumCaller<unsigned char>, absSumCaller<char>, \r
+        absSumCaller<unsigned short>, absSumCaller<short>, \r
+        absSumCaller<int>, absSumCaller<float>, 0 \r
+    };\r
  \r
      Size buf_size;\r
-    sums::getBufSizeRequired(src.cols, src.rows, src.channels(), \r
-                             buf_size.width, buf_size.height); \r
+    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height); \r
      ensureSizeIsEnough(buf_size, CV_8U, buf);\r
  \r
      Caller* callers = multipass_callers;\r
@@ -308,27 +316,30 @@ Scalar cv::gpu::sqrSum(const GpuMat& src)
  \r
  Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) \r
  {\r
-    using namespace mathfunc;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;\r
  \r
      typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);\r
  \r
-    static Caller multipass_callers[7] = { \r
-            sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>, \r
-            sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>, \r
-            sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 };\r
+    static Caller multipass_callers[7] = \r
+    { \r
+        sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>, \r
+        sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>, \r
+        sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 \r
+    };\r
  \r
-    static Caller singlepass_callers[7] = { \r
-            sqrSumCaller<unsigned char>, sqrSumCaller<char>, \r
-            sqrSumCaller<unsigned short>, sqrSumCaller<short>, \r
-            sqrSumCaller<int>, sqrSumCaller<float>, 0 };\r
+    static Caller singlepass_callers[7] = \r
+    { \r
+        sqrSumCaller<unsigned char>, sqrSumCaller<char>, \r
+        sqrSumCaller<unsigned short>, sqrSumCaller<short>, \r
+        sqrSumCaller<int>, sqrSumCaller<float>, 0 \r
+    };\r
  \r
      Caller* callers = multipass_callers;\r
      if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))\r
          callers = singlepass_callers;\r
  \r
      Size buf_size;\r
-    sums::getBufSizeRequired(src.cols, src.rows, src.channels(), \r
-                             buf_size.width, buf_size.height); \r
+    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height); \r
      ensureSizeIsEnough(buf_size, CV_8U, buf);\r
  \r
      Caller caller = callers[src.depth()];\r
@@ -339,29 +350,32 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
      return Scalar(result[0], result[1], result[2], result[3]);\r
  }\r
  \r
-\r
-\r
-\r
  ////////////////////////////////////////////////////////////////////////\r
  // Find min or max\r
  \r
-namespace cv { namespace gpu { namespace mathfunc { namespace minmax {\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);\r
-    \r
-    template <typename T> \r
-    void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);\r
+namespace matrix_reductions \r
+{\r
+    namespace minmax \r
+    {\r
+        void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);\r
+        \r
+        template <typename T> \r
+        void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);\r
  \r
-    template <typename T> \r
-    void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);\r
+        template <typename T> \r
+        void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);\r
  \r
-    template <typename T> \r
-    void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);\r
+        template <typename T> \r
+        void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);\r
  \r
-    template <typename T> \r
-    void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);\r
+        template <typename T> \r
+        void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);\r
+    }\r
+}\r
  \r
-}}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  \r
  void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)\r
@@ -373,39 +387,43 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
  \r
  void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)\r
  {\r
-    using namespace mathfunc::minmax;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmax;\r
  \r
      typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);\r
      typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);\r
  \r
-    static Caller multipass_callers[7] = { \r
-            minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, \r
-            minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, \r
-            minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 };\r
+    static Caller multipass_callers[7] = \r
+    { \r
+        minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, \r
+        minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, \r
+        minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 \r
+    };\r
  \r
-    static Caller singlepass_callers[7] = { \r
-            minMaxCaller<unsigned char>, minMaxCaller<char>, \r
-            minMaxCaller<unsigned short>, minMaxCaller<short>, \r
-            minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> };\r
+    static Caller singlepass_callers[7] = \r
+    { \r
+        minMaxCaller<unsigned char>, minMaxCaller<char>, \r
+        minMaxCaller<unsigned short>, minMaxCaller<short>, \r
+        minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> \r
+    };\r
  \r
-    static MaskedCaller masked_multipass_callers[7] = { \r
-            minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, \r
-            minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,\r
-            minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 };\r
+    static MaskedCaller masked_multipass_callers[7] = \r
+    { \r
+        minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, \r
+        minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,\r
+        minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0\r
+    };\r
  \r
-    static MaskedCaller masked_singlepass_callers[7] = { \r
-            minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, \r
-            minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, \r
-            minMaxMaskCaller<int>, minMaxMaskCaller<float>, \r
-            minMaxMaskCaller<double> };\r
+    static MaskedCaller masked_singlepass_callers[7] =\r
+    { \r
+        minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, \r
+        minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, \r
+        minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double> \r
+    };\r
  \r
      CV_Assert(src.channels() == 1);\r
  \r
      CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));\r
  \r
-    CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) && \r
-                                       DeviceInfo().supports(NATIVE_DOUBLE)));\r
-\r
      double minVal_; if (!minVal) minVal = &minVal_;\r
      double maxVal_; if (!maxVal) maxVal = &maxVal_;\r
      \r
@@ -439,28 +457,34 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
  ////////////////////////////////////////////////////////////////////////\r
  // Locate min and max\r
  \r
-namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, \r
-                               int& b1rows, int& b2cols, int& b2rows);\r
+namespace matrix_reductions \r
+{\r
+    namespace minmaxloc \r
+    {\r
+        void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, \r
+                                int& b1rows, int& b2cols, int& b2rows);\r
  \r
-    template <typename T> \r
-    void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, \r
-                            int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
+        template <typename T> \r
+        void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, \r
+                             int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
  \r
-    template <typename T> \r
-    void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+        template <typename T> \r
+        void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
                                   int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
  \r
-    template <typename T> \r
-    void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, \r
-                                     int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
+        template <typename T> \r
+        void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, \r
+                                      int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
  \r
-    template <typename T> \r
-    void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
-                                           int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
-}}}}\r
+        template <typename T> \r
+        void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, \r
+                                          int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);\r
+    }\r
+}\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)\r
  {    \r
@@ -468,43 +492,46 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
      minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);\r
  }\r
  \r
-\r
  void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,\r
                          const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)\r
  {\r
-    using namespace mathfunc::minmaxloc;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmaxloc;\r
  \r
      typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
      typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);\r
  \r
-    static Caller multipass_callers[7] = { \r
-            minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>, \r
-            minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>, \r
-            minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 };\r
+    static Caller multipass_callers[7] = \r
+    {\r
+        minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>, \r
+        minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>, \r
+        minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 \r
+    };\r
  \r
-    static Caller singlepass_callers[7] = { \r
-            minMaxLocCaller<unsigned char>, minMaxLocCaller<char>, \r
-            minMaxLocCaller<unsigned short>, minMaxLocCaller<short>, \r
-            minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> };\r
+    static Caller singlepass_callers[7] = \r
+    {\r
+        minMaxLocCaller<unsigned char>, minMaxLocCaller<char>, \r
+        minMaxLocCaller<unsigned short>, minMaxLocCaller<short>, \r
+        minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> \r
+    };\r
  \r
-    static MaskedCaller masked_multipass_callers[7] = { \r
-            minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>, \r
-            minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>, \r
-            minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 };\r
+    static MaskedCaller masked_multipass_callers[7] = \r
+    {\r
+        minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,\r
+        minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>, \r
+        minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 \r
+    };\r
  \r
-    static MaskedCaller masked_singlepass_callers[7] = { \r
-            minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>, \r
-            minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>, \r
-            minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, \r
-            minMaxLocMaskCaller<double> };\r
+    static MaskedCaller masked_singlepass_callers[7] = \r
+    { \r
+        minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>, \r
+        minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>, \r
+        minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double> \r
+    };\r
  \r
      CV_Assert(src.channels() == 1);\r
  \r
      CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));\r
  \r
-    CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) && \r
-                                       DeviceInfo().supports(NATIVE_DOUBLE)));\r
-\r
      double minVal_; if (!minVal) minVal = &minVal_;\r
      double maxVal_; if (!maxVal) maxVal = &maxVal_;\r
      int minLoc_[2];\r
@@ -544,18 +571,23 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
  //////////////////////////////////////////////////////////////////////////////\r
  // Count non-zero elements\r
  \r
-namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {\r
-\r
-    void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    template <typename T> \r
-    int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);\r
+namespace matrix_reductions \r
+{\r
+    namespace countnonzero \r
+    {\r
+        void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);\r
  \r
-    template <typename T> \r
-    int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);\r
+        template <typename T> \r
+        int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);\r
  \r
-}}}}\r
+        template <typename T> \r
+        int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);\r
+    }\r
+}\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  int cv::gpu::countNonZero(const GpuMat& src)\r
  {\r
@@ -566,26 +598,25 @@ int cv::gpu::countNonZero(const GpuMat& src)
  \r
  int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)\r
  {\r
-    using namespace mathfunc::countnonzero;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::countnonzero;\r
  \r
      typedef int (*Caller)(const DevMem2Db src, PtrStepb buf);\r
  \r
-    static Caller multipass_callers[7] = { \r
-            countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,\r
-            countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,\r
-            countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 };\r
+    static Caller multipass_callers[7] = \r
+    {\r
+        countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,\r
+        countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,\r
+        countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 \r
+    };\r
  \r
-    static Caller singlepass_callers[7] = { \r
-            countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,\r
-            countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,\r
-            countNonZeroCaller<int>, countNonZeroCaller<float>, \r
-            countNonZeroCaller<double> };\r
+    static Caller singlepass_callers[7] = \r
+    {\r
+        countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,\r
+        countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,\r
+        countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };\r
  \r
      CV_Assert(src.channels() == 1);\r
  \r
-    CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) && \r
-                                       DeviceInfo().supports(NATIVE_DOUBLE)));\r
-\r
      Size buf_size;\r
      getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);\r
      ensureSizeIsEnough(buf_size, CV_8U, buf);\r
@@ -601,15 +632,20 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
  \r
  //////////////////////////////////////////////////////////////////////////////\r
  // reduce\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-namespace cv { namespace gpu { namespace mathfunc {\r
+namespace matrix_reductions \r
+{\r
      template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
      template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)\r
  {\r
-    using namespace cv::gpu::mathfunc;\r
+    using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions;\r
+\r
      CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);\r
      CV_Assert(dim == 0 || dim == 1);\r
      CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);\r
diff --git a/modules/gpu/src/mssegmentation.cpp b/modules/gpu/src/mssegmentation.cpp

index 609fdda..b5b7bac 100644 (file)
--- a/modules/gpu/src/mssegmentation.cpp
+++ b/modules/gpu/src/mssegmentation.cpp
@@ -234,10 +234,10 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
      const int hsp = sp;\r
  \r
      // Perform mean shift procedure and obtain region and spatial maps\r
-    GpuMat h_rmap, h_spmap;\r
-    meanShiftProc(src, h_rmap, h_spmap, sp, sr, criteria);\r
-    Mat rmap = h_rmap;\r
-    Mat spmap = h_spmap;\r
+    GpuMat d_rmap, d_spmap;\r
+    meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);\r
+    Mat rmap(d_rmap);\r
+    Mat spmap(d_spmap);\r
  \r
      Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)\r
                                          + (nrows - 1) + (ncols - 1));\r
@@ -352,7 +352,7 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
      }\r
  \r
      // Compute sum of the pixel's colors which are in the same segment\r
-    Mat h_src = src;\r
+    Mat h_src(src);\r
      vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));\r
      for (int y = 0; y < nrows; ++y)\r
      {\r
diff --git a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp

index 2888a52..bf952d8 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
@@ -43,672 +43,674 @@
  #ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__\r
  #define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__\r
  \r
+#include "internal_shared.hpp"\r
  #include "saturate_cast.hpp"\r
  #include "vec_traits.hpp"\r
  #include "vec_math.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+//////////////////////////////////////////////////////////////\r
+// BrdConstant\r
+\r
+template <typename D> struct BrdRowConstant\r
+{\r
+    typedef D result_type;\r
+\r
+    explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}\r
+\r
+    template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+    {\r
+        return x >= 0 ? saturate_cast<D>(data[x]) : val;\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+    {\r
+        return x < width ? saturate_cast<D>(data[x]) : val;\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+    {\r
+        return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;\r
+    }\r
+\r
+    const int width;\r
+    const D val;\r
+};\r
+\r
+template <typename D> struct BrdColConstant\r
+{\r
+    typedef D result_type;\r
+\r
+    explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}\r
+\r
+    template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+    {\r
+        return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+    {\r
+        return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+    {\r
+        return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
+    }\r
+\r
+    const int height;\r
+    const D val;\r
+};\r
+\r
+template <typename D> struct BrdConstant\r
+{\r
+    typedef D result_type;\r
+\r
+    __host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_) \r
+    {\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const\r
+    {\r
+        return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;\r
+    }\r
+\r
+    template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const\r
+    {\r
+        return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;\r
+    }\r
+\r
+    const int height;\r
+    const int width;\r
+    const D val;\r
+};\r
+\r
+//////////////////////////////////////////////////////////////\r
+// BrdReplicate\r
+\r
+template <typename D> struct BrdRowReplicate\r
+{\r
+    typedef D result_type;\r
+\r
+    explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {}\r
+\r
+    __device__ __forceinline__ int idx_col_low(int x) const\r
+    {\r
+        return ::max(x, 0);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_col_high(int x) const \r
+    {\r
+        return ::min(x, last_col);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_col(int x) const\r
+    {\r
+        return idx_col_low(idx_col_high(x));\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col_low(x)]);\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col_high(x)]);\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col(x)]);\r
+    }\r
+\r
+    const int last_col;\r
+};\r
+\r
+template <typename D> struct BrdColReplicate\r
+{\r
+    typedef D result_type;\r
+\r
+    explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {}\r
+\r
+    __device__ __forceinline__ int idx_row_low(int y) const\r
+    {\r
+        return ::max(y, 0);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_row_high(int y) const \r
+    {\r
+        return ::min(y, last_row);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_row(int y) const\r
+    {\r
+        return idx_row_low(idx_row_high(y));\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));\r
+    }\r
+\r
+    const int last_row;\r
+};\r
+\r
+template <typename D> struct BrdReplicate\r
+{\r
+    typedef D result_type;\r
+\r
+    __host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
+\r
+    __device__ __forceinline__ int idx_row_low(int y) const\r
+    {\r
+        return ::max(y, 0);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_row_high(int y) const \r
+    {\r
+        return ::min(y, last_row);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_row(int y) const\r
+    {\r
+        return idx_row_low(idx_row_high(y));\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_col_low(int x) const\r
+    {\r
+        return ::max(x, 0);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_col_high(int x) const \r
+    {\r
+        return ::min(x, last_col);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_col(int x) const\r
+    {\r
+        return idx_col_low(idx_col_high(x));\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
+    }\r
+\r
+    template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
+    {\r
+        return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
+    }\r
+\r
+    const int last_row;\r
+    const int last_col;\r
+};\r
+\r
+//////////////////////////////////////////////////////////////\r
+// BrdReflect101\r
+\r
+template <typename D> struct BrdRowReflect101\r
  {\r
-    //////////////////////////////////////////////////////////////\r
-    // BrdConstant\r
+    typedef D result_type;\r
+\r
+    explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {}\r
  \r
-    template <typename D> struct BrdRowConstant\r
+    __device__ __forceinline__ int idx_col_low(int x) const\r
      {\r
-        typedef D result_type;\r
+        return ::abs(x) % (last_col + 1);\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}\r
+    __device__ __forceinline__ int idx_col_high(int x) const \r
+    {\r
+        return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
-        {\r
-            return x >= 0 ? saturate_cast<D>(data[x]) : val;\r
-        }\r
+    __device__ __forceinline__ int idx_col(int x) const\r
+    {\r
+        return idx_col_low(idx_col_high(x));\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
-        {\r
-            return x < width ? saturate_cast<D>(data[x]) : val;\r
-        }\r
+    template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col_low(x)]);\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
-        {\r
-            return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;\r
-        }\r
+    template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col_high(x)]);\r
+    }\r
+\r
+    template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col(x)]);\r
+    }\r
  \r
-        const int width;\r
-        const D val;\r
-    };\r
+    const int last_col;\r
+};\r
+\r
+template <typename D> struct BrdColReflect101\r
+{\r
+    typedef D result_type;\r
  \r
-    template <typename D> struct BrdColConstant\r
+    explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {}\r
+\r
+    __device__ __forceinline__ int idx_row_low(int y) const\r
+    {\r
+        return ::abs(y) % (last_row + 1);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_row_high(int y) const \r
+    {\r
+        return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_row(int y) const\r
      {\r
-        typedef D result_type;\r
+        return idx_row_low(idx_row_high(y));\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}\r
+    template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
-        {\r
-            return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
-        }\r
+    template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
-        {\r
-            return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
-        }\r
+    template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
-        {\r
-            return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;\r
-        }\r
+    const int last_row;\r
+};\r
+\r
+template <typename D> struct BrdReflect101\r
+{\r
+    typedef D result_type;\r
  \r
-        const int height;\r
-        const D val;\r
-    };\r
+    __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
  \r
-    template <typename D> struct BrdConstant\r
+    __device__ __forceinline__ int idx_row_low(int y) const\r
      {\r
-        typedef D result_type;\r
+        return ::abs(y) % (last_row + 1);\r
+    }\r
  \r
-        __host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_) \r
-        {\r
-        }\r
+    __device__ __forceinline__ int idx_row_high(int y) const \r
+    {\r
+        return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const\r
-        {\r
-            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;\r
-        }\r
+    __device__ __forceinline__ int idx_row(int y) const\r
+    {\r
+        return idx_row_low(idx_row_high(y));\r
+    }\r
  \r
-        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const\r
-        {\r
-            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;\r
-        }\r
+    __device__ __forceinline__ int idx_col_low(int x) const\r
+    {\r
+        return ::abs(x) % (last_col + 1);\r
+    }\r
  \r
-        const int height;\r
-        const int width;\r
-        const D val;\r
-    };\r
+    __device__ __forceinline__ int idx_col_high(int x) const \r
+    {\r
+        return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);\r
+    }\r
  \r
-    //////////////////////////////////////////////////////////////\r
-    // BrdReplicate\r
+    __device__ __forceinline__ int idx_col(int x) const\r
+    {\r
+        return idx_col_low(idx_col_high(x));\r
+    }\r
  \r
-    template <typename D> struct BrdRowReplicate\r
+    template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
      {\r
-        typedef D result_type;\r
+        return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {}\r
+    template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
+    {\r
+        return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_col_low(int x) const\r
-        {\r
-            return ::max(x, 0);\r
-        }\r
+    const int last_row;\r
+    const int last_col;\r
+};\r
  \r
-        __device__ __forceinline__ int idx_col_high(int x) const \r
-        {\r
-            return ::min(x, last_col);\r
-        }\r
+//////////////////////////////////////////////////////////////\r
+// BrdReflect\r
  \r
-        __device__ __forceinline__ int idx_col(int x) const\r
-        {\r
-            return idx_col_low(idx_col_high(x));\r
-        }\r
+template <typename D> struct BrdRowReflect\r
+{\r
+    typedef D result_type;\r
  \r
-        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col_low(x)]);\r
-        }\r
+    explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {}\r
  \r
-        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col_high(x)]);\r
-        }\r
+    __device__ __forceinline__ int idx_col_low(int x) const\r
+    {\r
+        return (::abs(x) - (x < 0)) % (last_col + 1);\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col(x)]);\r
-        }\r
+    __device__ __forceinline__ int idx_col_high(int x) const \r
+    {\r
+        return ::abs(last_col - ::abs(last_col - x) + (x > last_col)) % (last_col + 1);\r
+    }\r
  \r
-        const int last_col;\r
-    };\r
+    __device__ __forceinline__ int idx_col(int x) const\r
+    {\r
+        return idx_col_low(idx_col_high(x));\r
+    }\r
  \r
-    template <typename D> struct BrdColReplicate\r
+    template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
      {\r
-        typedef D result_type;\r
+        return saturate_cast<D>(data[idx_col_low(x)]);\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {}\r
+    template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col_high(x)]);\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_row_low(int y) const\r
-        {\r
-            return ::max(y, 0);\r
-        }\r
+    template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col(x)]);\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_row_high(int y) const \r
-        {\r
-            return ::min(y, last_row);\r
-        }\r
+    const int last_col;\r
+};\r
  \r
-        __device__ __forceinline__ int idx_row(int y) const\r
-        {\r
-            return idx_row_low(idx_row_high(y));\r
-        }\r
+template <typename D> struct BrdColReflect\r
+{\r
+    typedef D result_type;\r
  \r
-        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));\r
-        }\r
+    explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {}\r
  \r
-        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));\r
-        }\r
+    __device__ __forceinline__ int idx_row_low(int y) const\r
+    {\r
+        return (::abs(y) - (y < 0)) % (last_row + 1);\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));\r
-        }\r
+    __device__ __forceinline__ int idx_row_high(int y) const \r
+    {\r
+        return ::abs(last_row - ::abs(last_row - y) + (y > last_row)) % (last_row + 1);\r
+    }\r
  \r
-        const int last_row;\r
-    };\r
+    __device__ __forceinline__ int idx_row(int y) const\r
+    {\r
+        return idx_row_low(idx_row_high(y));\r
+    }\r
  \r
-    template <typename D> struct BrdReplicate\r
+    template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
      {\r
-        typedef D result_type;\r
+        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
+    }\r
  \r
-        __host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
+    template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_row_low(int y) const\r
-        {\r
-            return ::max(y, 0);\r
-        }\r
+    template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_row_high(int y) const \r
-        {\r
-            return ::min(y, last_row);\r
-        }\r
+    const int last_row;\r
+};\r
  \r
-        __device__ __forceinline__ int idx_row(int y) const\r
-        {\r
-            return idx_row_low(idx_row_high(y));\r
-        }\r
+template <typename D> struct BrdReflect\r
+{\r
+    typedef D result_type;\r
  \r
-        __device__ __forceinline__ int idx_col_low(int x) const\r
-        {\r
-            return ::max(x, 0);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col_high(int x) const \r
-        {\r
-            return ::min(x, last_col);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col(int x) const\r
-        {\r
-            return idx_col_low(idx_col_high(x));\r
-        }\r
+    __host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdReflect(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
  \r
-        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
-        }\r
+    __device__ __forceinline__ int idx_row_low(int y) const\r
+    {\r
+        return (::abs(y) - (y < 0)) % (last_row + 1);\r
+    }\r
  \r
-        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
-        {\r
-            return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
-        }\r
+    __device__ __forceinline__ int idx_row_high(int y) const \r
+    {\r
+        return /*::abs*/(last_row - ::abs(last_row - y) + (y > last_row)) /*% (last_row + 1)*/;\r
+    }\r
  \r
-        const int last_row;\r
-        const int last_col;\r
-    };\r
+    __device__ __forceinline__ int idx_row(int y) const\r
+    {\r
+        return idx_row_low(idx_row_high(y));\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_col_low(int x) const\r
+    {\r
+        return (::abs(x) - (x < 0)) % (last_col + 1);\r
+    }\r
  \r
-    //////////////////////////////////////////////////////////////\r
-    // BrdReflect101\r
+    __device__ __forceinline__ int idx_col_high(int x) const \r
+    {\r
+        return /*::abs*/(last_col - ::abs(last_col - x) + (x > last_col)) /*% (last_col + 1)*/;\r
+    }\r
  \r
-    template <typename D> struct BrdRowReflect101\r
+    __device__ __forceinline__ int idx_col(int x) const\r
      {\r
-        typedef D result_type;\r
+        return idx_col_low(idx_col_high(x));\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {}\r
+    template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_col_low(int x) const\r
-        {\r
-            return ::abs(x) % (last_col + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col_high(int x) const \r
-        {\r
-            return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col(int x) const\r
-        {\r
-            return idx_col_low(idx_col_high(x));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col_low(x)]);\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col_high(x)]);\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col(x)]);\r
-        }\r
+    template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
+    {\r
+        return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
+    }\r
  \r
-        const int last_col;\r
-    };\r
+    const int last_row;\r
+    const int last_col;\r
+};\r
  \r
-    template <typename D> struct BrdColReflect101\r
+//////////////////////////////////////////////////////////////\r
+// BrdWrap\r
+\r
+template <typename D> struct BrdRowWrap\r
+{\r
+    typedef D result_type;\r
+\r
+    explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {}\r
+\r
+    __device__ __forceinline__ int idx_col_low(int x) const\r
      {\r
-        typedef D result_type;\r
+        return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {}\r
+    __device__ __forceinline__ int idx_col_high(int x) const \r
+    {\r
+        return (x < width) * x + (x >= width) * (x % width);\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_row_low(int y) const\r
-        {\r
-            return ::abs(y) % (last_row + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row_high(int y) const \r
-        {\r
-            return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row(int y) const\r
-        {\r
-            return idx_row_low(idx_row_high(y));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
-        }\r
+    __device__ __forceinline__ int idx_col(int x) const\r
+    {\r
+        return idx_col_high(idx_col_low(x));\r
+    }\r
  \r
-        const int last_row;\r
-    };\r
+    template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col_low(x)]);\r
+    }\r
  \r
-    template <typename D> struct BrdReflect101\r
+    template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
      {\r
-        typedef D result_type;\r
+        return saturate_cast<D>(data[idx_col_high(x)]);\r
+    }\r
  \r
-        __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
+    template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
+    {\r
+        return saturate_cast<D>(data[idx_col(x)]);\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_row_low(int y) const\r
-        {\r
-            return ::abs(y) % (last_row + 1);\r
-        }\r
+    const int width;\r
+};\r
  \r
-        __device__ __forceinline__ int idx_row_high(int y) const \r
-        {\r
-            return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);\r
-        }\r
+template <typename D> struct BrdColWrap\r
+{\r
+    typedef D result_type;\r
  \r
-        __device__ __forceinline__ int idx_row(int y) const\r
-        {\r
-            return idx_row_low(idx_row_high(y));\r
-        }\r
+    explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}\r
+    template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {}\r
  \r
-        __device__ __forceinline__ int idx_col_low(int x) const\r
-        {\r
-            return ::abs(x) % (last_col + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col_high(int x) const \r
-        {\r
-            return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col(int x) const\r
-        {\r
-            return idx_col_low(idx_col_high(x));\r
-        }\r
+    __device__ __forceinline__ int idx_row_low(int y) const\r
+    {\r
+        return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
-        }\r
+    __device__ __forceinline__ int idx_row_high(int y) const \r
+    {\r
+        return (y < height) * y + (y >= height) * (y % height);\r
+    }\r
  \r
-        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
-        {\r
-            return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
-        }\r
+    __device__ __forceinline__ int idx_row(int y) const\r
+    {\r
+        return idx_row_high(idx_row_low(y));\r
+    }\r
  \r
-        const int last_row;\r
-        const int last_col;\r
-    };\r
+    template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
+    }\r
  \r
-    //////////////////////////////////////////////////////////////\r
-    // BrdReflect\r
+    template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
+    }\r
  \r
-    template <typename D> struct BrdRowReflect\r
+    template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
      {\r
-        typedef D result_type;\r
+        return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step)); // element is stored as T: read it as T, then saturate to D\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {}\r
+    const int height;\r
+};\r
  \r
-        __device__ __forceinline__ int idx_col_low(int x) const\r
-        {\r
-            return (::abs(x) - (x < 0)) % (last_col + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col_high(int x) const \r
-        {\r
-            return ::abs(last_col - ::abs(last_col - x) + (x > last_col)) % (last_col + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col(int x) const\r
-        {\r
-            return idx_col_low(idx_col_high(x));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col_low(x)]);\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col_high(x)]);\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col(x)]);\r
-        }\r
+template <typename D> struct BrdWrap\r
+{\r
+    typedef D result_type;\r
  \r
-        const int last_col;\r
-    };\r
+    __host__ __device__ __forceinline__ BrdWrap(int height_, int width_) : \r
+        height(height_), width(width_) \r
+    {\r
+    }\r
+    template <typename U> \r
+    __host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) : \r
+        height(height_), width(width_) \r
+    {\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_row_low(int y) const\r
+    {\r
+        return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_row_high(int y) const \r
+    {\r
+        return (y < height) * y + (y >= height) * (y % height);\r
+    }\r
  \r
-    template <typename D> struct BrdColReflect\r
+    __device__ __forceinline__ int idx_row(int y) const\r
      {\r
-        typedef D result_type;\r
+        return idx_row_high(idx_row_low(y));\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {}\r
+    __device__ __forceinline__ int idx_col_low(int x) const\r
+    {\r
+        return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_row_low(int y) const\r
-        {\r
-            return (::abs(y) - (y < 0)) % (last_row + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row_high(int y) const \r
-        {\r
-            return ::abs(last_row - ::abs(last_row - y) + (y > last_row)) % (last_row + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row(int y) const\r
-        {\r
-            return idx_row_low(idx_row_high(y));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
-        }\r
+    __device__ __forceinline__ int idx_col_high(int x) const \r
+    {\r
+        return (x < width) * x + (x >= width) * (x % width);\r
+    }\r
+\r
+    __device__ __forceinline__ int idx_col(int x) const\r
+    {\r
+        return idx_col_high(idx_col_low(x));\r
+    }\r
  \r
-        const int last_row;\r
-    };\r
+    template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
+    {\r
+        return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
+    }\r
  \r
-    template <typename D> struct BrdReflect\r
+    template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
      {\r
-        typedef D result_type;\r
+        return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
+    }\r
+\r
+    const int height;\r
+    const int width;\r
+};\r
+\r
+//////////////////////////////////////////////////////////////\r
+// BorderReader\r
+\r
+template <typename Ptr2D, typename B> struct BorderReader\r
+{\r
+    typedef typename B::result_type elem_type;\r
+    typedef typename Ptr2D::index_type index_type;\r
  \r
-        __host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdReflect(int height, int width, U) : last_row(height - 1), last_col(width - 1) {}\r
+    __host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {}\r
  \r
-        __device__ __forceinline__ int idx_row_low(int y) const\r
-        {\r
-            return (::abs(y) - (y < 0)) % (last_row + 1);\r
-        }\r
+    __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const\r
+    {\r
+        return b.at(y, x, ptr);\r
+    }\r
  \r
-        __device__ __forceinline__ int idx_row_high(int y) const \r
-        {\r
-            return /*::abs*/(last_row - ::abs(last_row - y) + (y > last_row)) /*% (last_row + 1)*/;\r
-        }\r
+    const Ptr2D ptr;\r
+    const B b;\r
+};\r
  \r
-        __device__ __forceinline__ int idx_row(int y) const\r
-        {\r
-            return idx_row_low(idx_row_high(y));\r
-        }\r
+// Under win32 there is some bug with templated types that are passed as kernel parameters;\r
+// with this specialization everything works fine.\r
+template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> >\r
+{\r
+    typedef typename BrdConstant<D>::result_type elem_type;\r
+    typedef typename Ptr2D::index_type index_type;\r
  \r
-        __device__ __forceinline__ int idx_col_low(int x) const\r
-        {\r
-            return (::abs(x) - (x < 0)) % (last_col + 1);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col_high(int x) const \r
-        {\r
-            return /*::abs*/(last_col - ::abs(last_col - x) + (x > last_col)) /*% (last_col + 1)*/;\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col(int x) const\r
-        {\r
-            return idx_col_low(idx_col_high(x));\r
-        }\r
+    __host__ __device__ __forceinline__ BorderReader(const Ptr2D& src_, const BrdConstant<D>& b) : \r
+        src(src_), height(b.height), width(b.width), val(b.val) \r
+    {\r
+    }\r
+\r
+    __device__ __forceinline__ D operator ()(index_type y, index_type x) const\r
+    {\r
+        return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;\r
+    }\r
  \r
-        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
-        }\r
+    const Ptr2D src;\r
+    const int height;\r
+    const int width;\r
+    const D val;\r
+};\r
  \r
-        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
-        {\r
-            return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
-        }\r
-\r
-        const int last_row;\r
-        const int last_col;\r
-    };\r
-\r
-    //////////////////////////////////////////////////////////////\r
-    // BrdWrap\r
-\r
-    template <typename D> struct BrdRowWrap\r
-    {\r
-        typedef D result_type;\r
-\r
-        explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {}\r
-\r
-        __device__ __forceinline__ int idx_col_low(int x) const\r
-        {\r
-            return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col_high(int x) const \r
-        {\r
-            return (x < width) * x + (x >= width) * (x % width);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col(int x) const\r
-        {\r
-            return idx_col_high(idx_col_low(x));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col_low(x)]);\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col_high(x)]);\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const \r
-        {\r
-            return saturate_cast<D>(data[idx_col(x)]);\r
-        }\r
-\r
-        const int width;\r
-    };\r
-\r
-    template <typename D> struct BrdColWrap\r
-    {\r
-        typedef D result_type;\r
-\r
-        explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}\r
-        template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {}\r
-\r
-        __device__ __forceinline__ int idx_row_low(int y) const\r
-        {\r
-            return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row_high(int y) const \r
-        {\r
-            return (y < height) * y + (y >= height) * (y % height);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row(int y) const\r
-        {\r
-            return idx_row_high(idx_row_low(y));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));\r
-        }\r
-\r
-        const int height;\r
-    };\r
-\r
-    template <typename D> struct BrdWrap\r
-    {\r
-        typedef D result_type;\r
-\r
-        __host__ __device__ __forceinline__ BrdWrap(int height_, int width_) : \r
-            height(height_), width(width_) \r
-        {\r
-        }\r
-        template <typename U> \r
-        __host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) : \r
-            height(height_), width(width_) \r
-        {\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row_low(int y) const\r
-        {\r
-            return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row_high(int y) const \r
-        {\r
-            return (y < height) * y + (y >= height) * (y % height);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_row(int y) const\r
-        {\r
-            return idx_row_high(idx_row_low(y));\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col_low(int x) const\r
-        {\r
-            return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col_high(int x) const \r
-        {\r
-            return (x < width) * x + (x >= width) * (x % width);\r
-        }\r
-\r
-        __device__ __forceinline__ int idx_col(int x) const\r
-        {\r
-            return idx_col_high(idx_col_low(x));\r
-        }\r
-\r
-        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const \r
-        {\r
-            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);\r
-        }\r
-\r
-        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const \r
-        {\r
-            return saturate_cast<D>(src(idx_row(y), idx_col(x)));\r
-        }\r
-\r
-        const int height;\r
-        const int width;\r
-    };\r
-\r
-    //////////////////////////////////////////////////////////////\r
-    // BorderReader\r
-\r
-    template <typename Ptr2D, typename B> struct BorderReader\r
-    {\r
-        typedef typename B::result_type elem_type;\r
-        typedef typename Ptr2D::index_type index_type;\r
-\r
-        __host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {}\r
-\r
-        __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const\r
-        {\r
-            return b.at(y, x, ptr);\r
-        }\r
-\r
-        const Ptr2D ptr;\r
-        const B b;\r
-    };\r
-\r
-    // under win32 there is some bug with templated types that passed as kernel parameters\r
-    // with this specialization all works fine\r
-    template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> >\r
-    {\r
-        typedef typename BrdConstant<D>::result_type elem_type;\r
-        typedef typename Ptr2D::index_type index_type;\r
-\r
-        __host__ __device__ __forceinline__ BorderReader(const Ptr2D& src_, const BrdConstant<D>& b) : \r
-            src(src_), height(b.height), width(b.width), val(b.val) \r
-        {\r
-        }\r
-\r
-        __device__ __forceinline__ D operator ()(index_type y, index_type x) const\r
-        {\r
-            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;\r
-        }\r
-\r
-        const Ptr2D src;\r
-        const int height;\r
-        const int width;\r
-        const D val;\r
-    };\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/color.hpp b/modules/gpu/src/opencv2/gpu/device/color.hpp

index f6bdde9..16b108a 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/color.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/color.hpp
@@ -43,179 +43,181 @@
  #ifndef __OPENCV_GPU_COLOR_HPP__\r
  #define __OPENCV_GPU_COLOR_HPP__\r
  \r
+#include "internal_shared.hpp"\r
  #include "detail/color_detail.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    // All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements\r
-    // template <typename T> class ColorSpace1_to_ColorSpace2_traits\r
-    // {\r
-    //     typedef ... functor_type;\r
-    //     static __host__ __device__ functor_type create_functor();\r
-    // };\r
-\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)\r
-    \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+// All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implement\r
+// template <typename T> class ColorSpace1_to_ColorSpace2_traits\r
+// {\r
+//     typedef ... functor_type;\r
+//     static __host__ __device__ functor_type create_functor();\r
+// };\r
+\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)\r
+\r
  #undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)\r
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)\r
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)\r
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)\r
-    OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)\r
-    \r
+OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)\r
+OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)\r
+\r
  #undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)\r
-    OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)\r
+OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)\r
+OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)\r
-    OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)\r
+OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS\r
  \r
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)\r
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)\r
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)\r
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)\r
-    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)\r
+OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)\r
+OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)\r
+OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)\r
+OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS\r
-}}}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp

index c8937c1..e29f003 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
@@ -45,6 +45,8 @@
  \r
  #include "internal_shared.hpp"\r
  \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
  #if defined(_WIN64) || defined(__LP64__)               \r
      // 64-bit register modifier for inlined asm\r
      #define OPENCV_GPU_ASM_PTR "l"\r
@@ -53,8 +55,6 @@
      #define OPENCV_GPU_ASM_PTR "r"\r
  #endif\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
      #if __CUDA_ARCH__ >= 200\r
  \r
          // for Fermi memory space is detected automatically\r
@@ -99,6 +99,7 @@ namespace cv { namespace gpu { namespace device
      #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B\r
          \r
      #endif // __CUDA_ARCH__ >= 200\r
-}}}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp

index ad5e815..6ccc45c 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp
@@ -43,93 +43,94 @@
  #ifndef __OPENCV_GPU_COLOR_DETAIL_HPP__\r
  #define __OPENCV_GPU_COLOR_DETAIL_HPP__\r
  \r
+#include "internal_shared.hpp"\r
  #include "../vec_traits.hpp"\r
  #include "../saturate_cast.hpp"\r
  #include "../limits.hpp"\r
  #include "../functional.hpp"\r
  \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
  #ifndef CV_DESCALE\r
      #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))\r
  #endif\r
  \r
-namespace cv { namespace gpu { namespace device\r
+namespace detail\r
  {\r
-    namespace detail\r
+    template<typename T> struct ColorChannel\r
      {\r
-        template<typename T> struct ColorChannel\r
-        {\r
-            typedef float worktype_f;\r
-            static __device__ __forceinline__ T max() { return numeric_limits<T>::max(); }\r
-            static __device__ __forceinline__ T half() { return (T)(max()/2 + 1); }\r
-        };\r
-        template<> struct ColorChannel<float>\r
-        {\r
-            typedef float worktype_f;\r
-            static __device__ __forceinline__ float max() { return 1.f; }\r
-            static __device__ __forceinline__ float half() { return 0.5f; }\r
-        };\r
-\r
-        template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_type& vec, T val)\r
-        {\r
-        }\r
-        template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_type& vec, T val)\r
-        {\r
-            vec.w = val;\r
-        }\r
-        template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_type& vec)\r
-        {\r
-            return ColorChannel<T>::max();\r
-        }\r
-        template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_type& vec)\r
-        {\r
-            return vec.w;\r
-        }\r
+        typedef float worktype_f;\r
+        static __device__ __forceinline__ T max() { return numeric_limits<T>::max(); }\r
+        static __device__ __forceinline__ T half() { return (T)(max()/2 + 1); }\r
+    };\r
+    template<> struct ColorChannel<float>\r
+    {\r
+        typedef float worktype_f;\r
+        static __device__ __forceinline__ float max() { return 1.f; }\r
+        static __device__ __forceinline__ float half() { return 0.5f; }\r
+    };\r
  \r
-        enum\r
-        {\r
-            yuv_shift  = 14,\r
-            xyz_shift  = 12,\r
-            R2Y        = 4899,\r
-            G2Y        = 9617,\r
-            B2Y        = 1868,\r
-            BLOCK_SIZE = 256\r
-        };\r
+    template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_type& vec, T val)\r
+    {\r
+    }\r
+    template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_type& vec, T val)\r
+    {\r
+        vec.w = val;\r
+    }\r
+    template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_type& vec)\r
+    {\r
+        return ColorChannel<T>::max();\r
+    }\r
+    template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_type& vec)\r
+    {\r
+        return vec.w;\r
      }\r
  \r
+    enum\r
+    {\r
+        yuv_shift  = 14,\r
+        xyz_shift  = 12,\r
+        R2Y        = 4899,\r
+        G2Y        = 9617,\r
+        B2Y        = 1868,\r
+        BLOCK_SIZE = 256\r
+    };\r
+}\r
+\r
  ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    template <typename T, int scn, int dcn, int bidx> struct RGB2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
      {\r
-        template <typename T, int scn, int dcn, int bidx> struct RGB2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+        __device__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                dst.x = (&src.x)[bidx];\r
-                dst.y = src.y;\r
-                dst.z = (&src.x)[bidx^2];\r
-                setAlpha(dst, getAlpha<T>(src));\r
+            dst.x = (&src.x)[bidx];\r
+            dst.y = src.y;\r
+            dst.z = (&src.x)[bidx^2];\r
+            setAlpha(dst, getAlpha<T>(src));\r
  \r
-                return dst;\r
-            }\r
-        };\r
+            return dst;\r
+        }\r
+    };\r
  \r
-        template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>\r
+    template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>\r
+    {\r
+        __device__ uint operator()(uint src) const\r
          {\r
-            __device__ uint operator()(uint src) const\r
-            {\r
-                uint dst = 0;\r
-\r
-                dst |= (0xffu & (src >> 16));\r
-                dst |= (0xffu & (src >> 8)) << 8;\r
-                dst |= (0xffu & (src)) << 16;\r
-                dst |= (0xffu & (src >> 24)) << 24;\r
-\r
-                return dst;\r
-            }\r
-        };\r
-    }\r
+            uint dst = 0;\r
+\r
+            dst |= (0xffu & (src >> 16));\r
+            dst |= (0xffu & (src >> 8)) << 8;\r
+            dst |= (0xffu & (src)) << 16;\r
+            dst |= (0xffu & (src >> 24)) << 24;\r
+\r
+            return dst;\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -143,55 +144,55 @@ namespace cv { namespace gpu { namespace device
  \r
  /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    template <int green_bits, int bidx> struct RGB2RGB5x5Converter;\r
+    template<int bidx> struct RGB2RGB5x5Converter<6, bidx> \r
+    {\r
+        static __device__ __forceinline__ ushort cvt(const uchar3& src)\r
+        {\r
+            return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8));\r
+        }\r
+        static __device__ __forceinline__ ushort cvt(uint src)\r
+        {\r
+            uint b = 0xffu & (src >> (bidx * 8));\r
+            uint g = 0xffu & (src >> 8);\r
+            uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
+            return (ushort)((b >> 3) | ((g & ~3) << 3) | ((r & ~7) << 8));\r
+        }\r
+    };\r
+    template<int bidx> struct RGB2RGB5x5Converter<5, bidx> \r
      {\r
-        template <int green_bits, int bidx> struct RGB2RGB5x5Converter;\r
-        template<int bidx> struct RGB2RGB5x5Converter<6, bidx> \r
+        static __device__ __forceinline__ ushort cvt(const uchar3& src)\r
          {\r
-            static __device__ __forceinline__ ushort cvt(const uchar3& src)\r
-            {\r
-                return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8));\r
-            }\r
-            static __device__ __forceinline__ ushort cvt(uint src)\r
-            {\r
-                uint b = 0xffu & (src >> (bidx * 8));\r
-                uint g = 0xffu & (src >> 8);\r
-                uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
-                return (ushort)((b >> 3) | ((g & ~3) << 3) | ((r & ~7) << 8));\r
-            }\r
-        };\r
-        template<int bidx> struct RGB2RGB5x5Converter<5, bidx> \r
+            return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));\r
+        }\r
+        static __device__ __forceinline__ ushort cvt(uint src)\r
          {\r
-            static __device__ __forceinline__ ushort cvt(const uchar3& src)\r
-            {\r
-                return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));\r
-            }\r
-            static __device__ __forceinline__ ushort cvt(uint src)\r
-            {\r
-                uint b = 0xffu & (src >> (bidx * 8));\r
-                uint g = 0xffu & (src >> 8);\r
-                uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
-                uint a = 0xffu & (src >> 24);\r
-                return (ushort)((b >> 3) | ((g & ~7) << 2) | ((r & ~7) << 7) | (a * 0x8000));\r
-            }\r
-        };\r
-\r
-        template<int scn, int bidx, int green_bits> struct RGB2RGB5x5;\r
-        template<int bidx, int green_bits> struct RGB2RGB5x5<3, bidx,green_bits> : unary_function<uchar3, ushort>\r
+            uint b = 0xffu & (src >> (bidx * 8));\r
+            uint g = 0xffu & (src >> 8);\r
+            uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
+            uint a = 0xffu & (src >> 24);\r
+            return (ushort)((b >> 3) | ((g & ~7) << 2) | ((r & ~7) << 7) | (a * 0x8000));\r
+        }\r
+    };\r
+\r
+    template<int scn, int bidx, int green_bits> struct RGB2RGB5x5;\r
+    template<int bidx, int green_bits> struct RGB2RGB5x5<3, bidx,green_bits> : unary_function<uchar3, ushort>\r
+    {\r
+        __device__ __forceinline__ ushort operator()(const uchar3& src) const\r
          {\r
-            __device__ __forceinline__ ushort operator()(const uchar3& src) const\r
-            {\r
-                return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);\r
-            }\r
-        };\r
-        template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort>\r
+            return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);\r
+        }\r
+    };\r
+    template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort>\r
+    {\r
+        __device__ __forceinline__ ushort operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ ushort operator()(uint src) const\r
-            {\r
-                return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);\r
-            }\r
-        };\r
-    }\r
+            return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) \\r
      struct name ## _traits \\r
@@ -203,65 +204,65 @@ namespace cv { namespace gpu { namespace device
          } \\r
      };\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    template <int green_bits, int bidx> struct RGB5x52RGBConverter;    \r
+    template <int bidx> struct RGB5x52RGBConverter<5, bidx>\r
      {\r
-        template <int green_bits, int bidx> struct RGB5x52RGBConverter;    \r
-        template <int bidx> struct RGB5x52RGBConverter<5, bidx>\r
-        {\r
-            static __device__ __forceinline__ void cvt(uint src, uchar3& dst)\r
-            {            \r
-                (&dst.x)[bidx] = src << 3;\r
-                dst.y = (src >> 2) & ~7;\r
-                (&dst.x)[bidx ^ 2] = (src >> 7) & ~7;\r
-            }\r
-            static __device__ __forceinline__ void cvt(uint src, uint& dst)\r
-            {   \r
-                dst = 0;\r
-\r
-                dst |= (0xffu & (src << 3)) << (bidx * 8);\r
-                dst |= (0xffu & ((src >> 2) & ~7)) << 8;\r
-                dst |= (0xffu & ((src >> 7) & ~7)) << ((bidx ^ 2) * 8);\r
-                dst |= ((src & 0x8000) * 0xffu) << 24;\r
-            }\r
-        };\r
-        template <int bidx> struct RGB5x52RGBConverter<6, bidx>\r
-        {\r
-            static __device__ __forceinline__ void cvt(uint src, uchar3& dst)\r
-            {            \r
-                (&dst.x)[bidx] = src << 3;\r
-                dst.y = (src >> 3) & ~3;\r
-                (&dst.x)[bidx ^ 2] = (src >> 8) & ~7;\r
-            }\r
-            static __device__ __forceinline__ void cvt(uint src, uint& dst)\r
-            {           \r
-                dst = 0xffu << 24;\r
-\r
-                dst |= (0xffu & (src << 3)) << (bidx * 8);\r
-                dst |= (0xffu &((src >> 3) & ~3)) << 8;\r
-                dst |= (0xffu & ((src >> 8) & ~7)) << ((bidx ^ 2) * 8);\r
-            }\r
-        };\r
-\r
-        template <int dcn, int bidx, int green_bits> struct RGB5x52RGB;\r
-        template <int bidx, int green_bits> struct RGB5x52RGB<3, bidx, green_bits> : unary_function<ushort, uchar3>\r
+        static __device__ __forceinline__ void cvt(uint src, uchar3& dst)\r
+        {            \r
+            (&dst.x)[bidx] = src << 3;\r
+            dst.y = (src >> 2) & ~7;\r
+            (&dst.x)[bidx ^ 2] = (src >> 7) & ~7;\r
+        }\r
+        static __device__ __forceinline__ void cvt(uint src, uint& dst)\r
+        {   \r
+            dst = 0;\r
+\r
+            dst |= (0xffu & (src << 3)) << (bidx * 8);\r
+            dst |= (0xffu & ((src >> 2) & ~7)) << 8;\r
+            dst |= (0xffu & ((src >> 7) & ~7)) << ((bidx ^ 2) * 8);\r
+            dst |= ((src & 0x8000) * 0xffu) << 24;\r
+        }\r
+    };\r
+    template <int bidx> struct RGB5x52RGBConverter<6, bidx>\r
+    {\r
+        static __device__ __forceinline__ void cvt(uint src, uchar3& dst)\r
+        {            \r
+            (&dst.x)[bidx] = src << 3;\r
+            dst.y = (src >> 3) & ~3;\r
+            (&dst.x)[bidx ^ 2] = (src >> 8) & ~7;\r
+        }\r
+        static __device__ __forceinline__ void cvt(uint src, uint& dst)\r
+        {           \r
+            dst = 0xffu << 24;\r
+\r
+            dst |= (0xffu & (src << 3)) << (bidx * 8);\r
+            dst |= (0xffu &((src >> 3) & ~3)) << 8;\r
+            dst |= (0xffu & ((src >> 8) & ~7)) << ((bidx ^ 2) * 8);\r
+        }\r
+    };\r
+\r
+    template <int dcn, int bidx, int green_bits> struct RGB5x52RGB;\r
+    template <int bidx, int green_bits> struct RGB5x52RGB<3, bidx, green_bits> : unary_function<ushort, uchar3>\r
+    {\r
+        __device__ __forceinline__ uchar3 operator()(ushort src) const\r
          {\r
-            __device__ __forceinline__ uchar3 operator()(ushort src) const\r
-            {\r
-                uchar3 dst;\r
-                RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);\r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx, int green_bits> struct RGB5x52RGB<4, bidx, green_bits> : unary_function<ushort, uint>\r
+            uchar3 dst;\r
+            RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);\r
+            return dst;\r
+        }\r
+    };\r
+    template <int bidx, int green_bits> struct RGB5x52RGB<4, bidx, green_bits> : unary_function<ushort, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator()(ushort src) const\r
          {\r
-            __device__ __forceinline__ uint operator()(ushort src) const\r
-            {\r
-                uint dst;\r
-                RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);\r
-                return dst;\r
-            }\r
-        };\r
-    }\r
+            uint dst;\r
+            RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);\r
+            return dst;\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) \\r
      struct name ## _traits \\r
@@ -275,34 +276,34 @@ namespace cv { namespace gpu { namespace device
  \r
  ///////////////////////////////// Grayscale to Color ////////////////////////////////\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    template <typename T, int dcn> struct Gray2RGB : unary_function<T, typename TypeVec<T, dcn>::vec_type>\r
      {\r
-        template <typename T, int dcn> struct Gray2RGB : unary_function<T, typename TypeVec<T, dcn>::vec_type>\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(T src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(T src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                dst.z = dst.y = dst.x = src;            \r
-                setAlpha(dst, ColorChannel<T>::max());\r
+            dst.z = dst.y = dst.x = src;            \r
+            setAlpha(dst, ColorChannel<T>::max());\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator()(uint src) const\r
-            {\r
-                uint dst = 0xffu << 24;\r
+            uint dst = 0xffu << 24;\r
  \r
-                dst |= src;\r
-                dst |= src << 8;\r
-                dst |= src << 16;\r
+            dst |= src;\r
+            dst |= src << 8;\r
+            dst |= src << 16;\r
  \r
-                return dst;\r
-            }\r
-        };\r
-    }\r
+            return dst;\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) \\r
      template <typename T> struct name ## _traits \\r
@@ -314,33 +315,33 @@ namespace cv { namespace gpu { namespace device
          } \\r
      };\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    template <int green_bits> struct Gray2RGB5x5Converter;\r
+    template<> struct Gray2RGB5x5Converter<6> \r
      {\r
-        template <int green_bits> struct Gray2RGB5x5Converter;\r
-        template<> struct Gray2RGB5x5Converter<6> \r
+        static __device__ __forceinline__ ushort cvt(uint t)\r
          {\r
-            static __device__ __forceinline__ ushort cvt(uint t)\r
-            {\r
-                return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));\r
-            }\r
-        };\r
-        template<> struct Gray2RGB5x5Converter<5> \r
+            return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));\r
+        }\r
+    };\r
+    template<> struct Gray2RGB5x5Converter<5> \r
+    {\r
+        static __device__ __forceinline__ ushort cvt(uint t)\r
          {\r
-            static __device__ __forceinline__ ushort cvt(uint t)\r
-            {\r
-                t >>= 3;\r
-                return (ushort)(t | (t << 5) | (t << 10));\r
-            }\r
-        };\r
-\r
-        template<int green_bits> struct Gray2RGB5x5 : unary_function<uchar, ushort>\r
+            t >>= 3;\r
+            return (ushort)(t | (t << 5) | (t << 10));\r
+        }\r
+    };\r
+\r
+    template<int green_bits> struct Gray2RGB5x5 : unary_function<uchar, ushort>\r
+    {\r
+        __device__ __forceinline__ ushort operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ ushort operator()(uint src) const\r
-            {\r
-                return Gray2RGB5x5Converter<green_bits>::cvt(src);\r
-            }\r
-        };\r
-    }\r
+            return Gray2RGB5x5Converter<green_bits>::cvt(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) \\r
      struct name ## _traits \\r
@@ -354,32 +355,32 @@ namespace cv { namespace gpu { namespace device
  \r
  ///////////////////////////////// Color to Grayscale ////////////////////////////////\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    template <int green_bits> struct RGB5x52GrayConverter;\r
+    template <> struct RGB5x52GrayConverter<6> \r
      {\r
-        template <int green_bits> struct RGB5x52GrayConverter;\r
-        template <> struct RGB5x52GrayConverter<6> \r
+        static __device__ __forceinline__ uchar cvt(uint t)\r
          {\r
-            static __device__ __forceinline__ uchar cvt(uint t)\r
-            {\r
-                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);\r
-            }\r
-        };\r
-        template <> struct RGB5x52GrayConverter<5> \r
+            return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);\r
+        }\r
+    };\r
+    template <> struct RGB5x52GrayConverter<5> \r
+    {\r
+        static __device__ __forceinline__ uchar cvt(uint t)\r
          {\r
-            static __device__ __forceinline__ uchar cvt(uint t)\r
-            {\r
-                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);\r
-            }\r
-        };   \r
+            return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);\r
+        }\r
+    };   \r
  \r
-        template<int green_bits> struct RGB5x52Gray : unary_function<ushort, uchar>\r
+    template<int green_bits> struct RGB5x52Gray : unary_function<ushort, uchar>\r
+    {\r
+        __device__ __forceinline__ uchar operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ uchar operator()(uint src) const\r
-            {\r
-                return RGB5x52GrayConverter<green_bits>::cvt(src);\r
-            }\r
-        };\r
-    }\r
+            return RGB5x52GrayConverter<green_bits>::cvt(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) \\r
      struct name ## _traits \\r
@@ -391,39 +392,39 @@ namespace cv { namespace gpu { namespace device
          } \\r
      };\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src)\r
      {\r
-        template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src)\r
-        {\r
-            return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);\r
-        }\r
-        template <int bidx> static __device__ __forceinline__ uchar RGB2GrayConvert(uint src)\r
+        return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);\r
+    }\r
+    template <int bidx> static __device__ __forceinline__ uchar RGB2GrayConvert(uint src)\r
+    {\r
+        uint b = 0xffu & (src >> (bidx * 8));\r
+        uint g = 0xffu & (src >> 8);\r
+        uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
+        return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);\r
+    }\r
+    template <int bidx> static __device__ __forceinline__ float RGB2GrayConvert(const float* src)\r
+    {\r
+        return src[bidx] * 0.114f + src[1] * 0.587f + src[bidx^2] * 0.299f;\r
+    }\r
+\r
+    template <typename T, int scn, int bidx> struct RGB2Gray : unary_function<typename TypeVec<T, scn>::vec_type, T>\r
+    {\r
+        __device__ __forceinline__ T operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            uint b = 0xffu & (src >> (bidx * 8));\r
-            uint g = 0xffu & (src >> 8);\r
-            uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
-            return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);\r
+            return RGB2GrayConvert<bidx>(&src.x);\r
          }\r
-        template <int bidx> static __device__ __forceinline__ float RGB2GrayConvert(const float* src)\r
+    };\r
+    template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>\r
+    {\r
+        __device__ __forceinline__ uchar operator()(uint src) const\r
          {\r
-            return src[bidx] * 0.114f + src[1] * 0.587f + src[bidx^2] * 0.299f;\r
+            return RGB2GrayConvert<bidx>(src);\r
          }\r
-\r
-        template <typename T, int scn, int bidx> struct RGB2Gray : unary_function<typename TypeVec<T, scn>::vec_type, T>\r
-        {\r
-            __device__ __forceinline__ T operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                return RGB2GrayConvert<bidx>(&src.x);\r
-            }\r
-        };\r
-        template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>\r
-        {\r
-            __device__ __forceinline__ uchar operator()(uint src) const\r
-            {\r
-                return RGB2GrayConvert<bidx>(src);\r
-            }\r
-        };\r
-    }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -437,63 +438,63 @@ namespace cv { namespace gpu { namespace device
  \r
  ///////////////////////////////////// RGB <-> YUV //////////////////////////////////////\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    __constant__ float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };\r
+    __constant__ int   c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };\r
+\r
+    template <int bidx, typename T, typename D> static __device__ void RGB2YUVConvert(const T* src, D& dst)\r
      {\r
-        __constant__ float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };\r
-        __constant__ int   c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };\r
+        const int delta = ColorChannel<T>::half() * (1 << yuv_shift);\r
  \r
-        template <int bidx, typename T, typename D> static __device__ void RGB2YUVConvert(const T* src, D& dst)\r
-        {\r
-            const int delta = ColorChannel<T>::half() * (1 << yuv_shift);\r
+        const int Y = CV_DESCALE(src[0] * c_RGB2YUVCoeffs_i[bidx^2] + src[1] * c_RGB2YUVCoeffs_i[1] + src[2] * c_RGB2YUVCoeffs_i[bidx], yuv_shift);\r
+        const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);\r
+        const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);\r
  \r
-            const int Y = CV_DESCALE(src[0] * c_RGB2YUVCoeffs_i[bidx^2] + src[1] * c_RGB2YUVCoeffs_i[1] + src[2] * c_RGB2YUVCoeffs_i[bidx], yuv_shift);\r
-            const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);\r
-            const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);\r
+        dst.x = saturate_cast<T>(Y);\r
+        dst.y = saturate_cast<T>(Cr);\r
+        dst.z = saturate_cast<T>(Cb);\r
+    }\r
+    template <int bidx> static __device__ uint RGB2YUVConvert(uint src)\r
+    {\r
+        const uint delta = ColorChannel<uchar>::half() * (1 << yuv_shift);\r
  \r
-            dst.x = saturate_cast<T>(Y);\r
-            dst.y = saturate_cast<T>(Cr);\r
-            dst.z = saturate_cast<T>(Cb);\r
-        }\r
-        template <int bidx> static __device__ uint RGB2YUVConvert(uint src)\r
-        {\r
-            const uint delta = ColorChannel<uchar>::half() * (1 << yuv_shift);\r
+        const uint Y = CV_DESCALE((0xffu & src) * c_RGB2YUVCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YUVCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YUVCoeffs_i[bidx], yuv_shift);\r
+        const uint Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);\r
+        const uint Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);\r
  \r
-            const uint Y = CV_DESCALE((0xffu & src) * c_RGB2YUVCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YUVCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YUVCoeffs_i[bidx], yuv_shift);\r
-            const uint Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);\r
-            const uint Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);\r
+        uint dst = 0;\r
  \r
-            uint dst = 0;\r
+        dst |= saturate_cast<uchar>(Y);\r
+        dst |= saturate_cast<uchar>(Cr) << 8;\r
+        dst |= saturate_cast<uchar>(Cb) << 16;\r
  \r
-            dst |= saturate_cast<uchar>(Y);\r
-            dst |= saturate_cast<uchar>(Cr) << 8;\r
-            dst |= saturate_cast<uchar>(Cb) << 16;\r
+        return dst;\r
+    }\r
+    template <int bidx, typename D> static __device__ __forceinline__ void RGB2YUVConvert(const float* src, D& dst)\r
+    {\r
+        dst.x = src[0] * c_RGB2YUVCoeffs_f[bidx^2] + src[1] * c_RGB2YUVCoeffs_f[1] + src[2] * c_RGB2YUVCoeffs_f[bidx];\r
+        dst.y = (src[bidx^2] - dst.x) * c_RGB2YUVCoeffs_f[3] + ColorChannel<float>::half();\r
+        dst.z = (src[bidx] - dst.x) * c_RGB2YUVCoeffs_f[4] + ColorChannel<float>::half();\r
+    }\r
  \r
+    template <typename T, int scn, int dcn, int bidx> struct RGB2YUV : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
+        {\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
+            RGB2YUVConvert<bidx>(&src.x, dst);\r
              return dst;\r
          }\r
-        template <int bidx, typename D> static __device__ __forceinline__ void RGB2YUVConvert(const float* src, D& dst)\r
+    };\r
+    template <int bidx> struct RGB2YUV<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator ()(uint src) const\r
          {\r
-            dst.x = src[0] * c_RGB2YUVCoeffs_f[bidx^2] + src[1] * c_RGB2YUVCoeffs_f[1] + src[2] * c_RGB2YUVCoeffs_f[bidx];\r
-            dst.y = (src[bidx^2] - dst.x) * c_RGB2YUVCoeffs_f[3] + ColorChannel<float>::half();\r
-            dst.z = (src[bidx] - dst.x) * c_RGB2YUVCoeffs_f[4] + ColorChannel<float>::half();\r
+            return RGB2YUVConvert<bidx>(src);\r
          }\r
-\r
-        template <typename T, int scn, int dcn, int bidx> struct RGB2YUV : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
-        {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
-                RGB2YUVConvert<bidx>(&src.x, dst);\r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx> struct RGB2YUV<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
-        {\r
-            __device__ __forceinline__ uint operator ()(uint src) const\r
-            {\r
-                return RGB2YUVConvert<bidx>(src);\r
-            }\r
-        };\r
-    }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -505,66 +506,66 @@ namespace cv { namespace gpu { namespace device
          } \\r
      };\r
  \r
-    namespace detail\r
-    {\r
-        __constant__ float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };\r
-        __constant__ int   c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; \r
+namespace detail\r
+{\r
+    /* Float coefficients read at indices 0..3 by the float YUV2RGBConvert overload; the array is declared with 5 slots but has only 4 initialisers. */ __constant__ float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };\r
+    /* Integer coefficients used with CV_DESCALE(..., yuv_shift) by the integer overloads; presumably the float values in fixed point - confirm against yuv_shift. */ __constant__ int   c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; \r
  \r
-        template <int bidx, typename T, typename D> static __device__ void YUV2RGBConvert(const T& src, D* dst)\r
-        {\r
-            const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);\r
-            const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);\r
-            const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);\r
+    /* Integer path: derives b, g, r from src.x/.y/.z with the c_YUV2RGBCoeffs_i table and stores them, saturated to D, at dst[bidx], dst[1] and dst[bidx^2]. */ template <int bidx, typename T, typename D> static __device__ void YUV2RGBConvert(const T& src, D* dst)\r
+    {\r
+        /* intermediates are signed int so that a negative sum reaches saturate_cast unchanged */ const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);\r
+        const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);\r
+        const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);\r
  \r
-            dst[bidx] = saturate_cast<D>(b);\r
-            dst[1] = saturate_cast<D>(g);\r
-            dst[bidx^2] = saturate_cast<D>(r);\r
-        }\r
-        template <int bidx> static __device__ uint YUV2RGBConvert(uint src)\r
-        {\r
-            const int x = 0xff & (src);\r
-            const int y = 0xff & (src >> 8);\r
-            const int z = 0xff & (src >> 16);\r
-            \r
-            const uint b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);\r
-            const uint g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);\r
-            const uint r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);\r
+        /* bidx^2 turns 0 into 2 and 2 into 0, so b and r land at opposite ends of the output */ dst[bidx] = saturate_cast<D>(b);\r
+        dst[1] = saturate_cast<D>(g);\r
+        dst[bidx^2] = saturate_cast<D>(r);\r
+    }\r
+    template <int bidx> static __device__ uint YUV2RGBConvert(uint src)\r
+    {\r
+        const int x = 0xff & (src);\r
+        const int y = 0xff & (src >> 8);\r
+        const int z = 0xff & (src >> 16);\r
+        \r
+        const uint b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);\r
+        const uint g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);\r
+        const uint r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);\r
  \r
-            uint dst = 0xffu << 24;\r
+        uint dst = 0xffu << 24;\r
  \r
-            dst |= saturate_cast<uchar>(b) << (bidx * 8);\r
-            dst |= saturate_cast<uchar>(g) << 8;\r
-            dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);\r
+        dst |= saturate_cast<uchar>(b) << (bidx * 8);\r
+        dst |= saturate_cast<uchar>(g) << 8;\r
+        dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);\r
  \r
-            return dst;\r
-        }\r
-        template <int bidx, typename T> static __device__ __forceinline__ void YUV2RGBConvert(const T& src, float* dst)\r
-        {\r
-            dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[3];\r
-            dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[1];\r
-            dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[0];\r
-        }\r
+        return dst;\r
+    }\r
+    /* Float path: same formulas as the integer overload but with the c_YUV2RGBCoeffs_f table; results are written to dst without descaling or saturation. */ template <int bidx, typename T> static __device__ __forceinline__ void YUV2RGBConvert(const T& src, float* dst)\r
+    {\r
+        dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[3];\r
+        dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[1];\r
+        dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[0];\r
+    }\r
  \r
-        template <typename T, int scn, int dcn, int bidx> struct YUV2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    /* Unary functor: maps a TypeVec<T, scn>::vec_type value to a TypeVec<T, dcn>::vec_type value via YUV2RGBConvert<bidx>, then calls setAlpha on the result. */ template <typename T, int scn, int dcn, int bidx> struct YUV2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                YUV2RGBConvert<bidx>(src, &dst.x);\r
-                setAlpha(dst, ColorChannel<T>::max());\r
+            /* the output is written through a pointer to dst's first member (.x) */ YUV2RGBConvert<bidx>(src, &dst.x);\r
+            /* presumably fills the fourth channel when dst has one - confirm against setAlpha's definition */ setAlpha(dst, ColorChannel<T>::max());\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    /* Partial specialisation selected for T = uchar, scn = dcn = 4: takes and returns a single uint and returns YUV2RGBConvert<bidx>(src). */ template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator ()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator ()(uint src) const\r
-            {\r
-                return YUV2RGBConvert<bidx>(src);\r
-            }\r
-        };\r
-    }\r
+            return YUV2RGBConvert<bidx>(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -578,63 +579,63 @@ namespace cv { namespace gpu { namespace device
  \r
  ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////\r
      \r
-    namespace detail\r
+namespace detail\r
+{\r
+    /* Float coefficients: indices bidx^2, 1 and bidx weight the three inputs for Y, indices 3 and 4 scale the Cr and Cb differences. */ __constant__ float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};\r
+    /* Integer counterparts used with CV_DESCALE(..., yuv_shift); R2Y, G2Y and B2Y are defined outside this view. */ __constant__ int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};\r
+\r
+    /* Integer path: computes Y, Cr and Cb from src[0..2] with the c_RGB2YCrCbCoeffs_i table and stores them, saturated to T, in dst.x, dst.y and dst.z. */ template <int bidx, typename T, typename D> static __device__ void RGB2YCrCbConvert(const T* src, D& dst)\r
      {\r
-        __constant__ float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};\r
-        __constant__ int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};\r
+        /* half() pre-multiplied by 2^yuv_shift so it can be added before CV_DESCALE */ const int delta = ColorChannel<T>::half() * (1 << yuv_shift);\r
  \r
-        template <int bidx, typename T, typename D> static __device__ void RGB2YCrCbConvert(const T* src, D& dst)\r
-        {\r
-            const int delta = ColorChannel<T>::half() * (1 << yuv_shift);\r
+        const int Y = CV_DESCALE(src[0] * c_RGB2YCrCbCoeffs_i[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_i[1] + src[2] * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);\r
+        /* Cr is taken from the channel at index bidx^2, Cb from the channel at index bidx */ const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);\r
+        const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);\r
  \r
-            const int Y = CV_DESCALE(src[0] * c_RGB2YCrCbCoeffs_i[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_i[1] + src[2] * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);\r
-            const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);\r
-            const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);\r
+        dst.x = saturate_cast<T>(Y);\r
+        dst.y = saturate_cast<T>(Cr);\r
+        dst.z = saturate_cast<T>(Cb);\r
+    }\r
+    /* Packed overload: reads three 8-bit channels from the low bytes of src and returns Y | Cr << 8 | Cb << 16; the top byte of the result stays 0. */ template <int bidx> static __device__ uint RGB2YCrCbConvert(uint src)\r
+    {\r
+        const int delta = ColorChannel<uchar>::half() * (1 << yuv_shift);\r
  \r
-            dst.x = saturate_cast<T>(Y);\r
-            dst.y = saturate_cast<T>(Cr);\r
-            dst.z = saturate_cast<T>(Cb);\r
-        }\r
-        template <int bidx> static __device__ uint RGB2YCrCbConvert(uint src)\r
-        {\r
-            const int delta = ColorChannel<uchar>::half() * (1 << yuv_shift);\r
+        const uint Y = CV_DESCALE((0xffu & src) * c_RGB2YCrCbCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YCrCbCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);\r
+        /* NOTE(review): (channel - Y) is evaluated in unsigned arithmetic and may wrap; this relies on the sum with delta being non-negative - confirm */ const uint Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);\r
+        const uint Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);\r
  \r
-            const uint Y = CV_DESCALE((0xffu & src) * c_RGB2YCrCbCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YCrCbCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);\r
-            const uint Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);\r
-            const uint Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);\r
+        uint dst = 0;\r
  \r
-            uint dst = 0;\r
+        dst |= saturate_cast<uchar>(Y);\r
+        dst |= saturate_cast<uchar>(Cr) << 8;\r
+        dst |= saturate_cast<uchar>(Cb) << 16;\r
  \r
-            dst |= saturate_cast<uchar>(Y);\r
-            dst |= saturate_cast<uchar>(Cr) << 8;\r
-            dst |= saturate_cast<uchar>(Cb) << 16;\r
+        return dst;\r
+    }\r
+    /* Float path: same formulas with the c_RGB2YCrCbCoeffs_f table; ColorChannel<float>::half() is added directly and nothing is descaled or saturated. */ template <int bidx, typename D> static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst)\r
+    {\r
+        dst.x = src[0] * c_RGB2YCrCbCoeffs_f[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_f[1] + src[2] * c_RGB2YCrCbCoeffs_f[bidx];\r
+        /* dst.y and dst.z reuse the dst.x value just computed */ dst.y = (src[bidx^2] - dst.x) * c_RGB2YCrCbCoeffs_f[3] + ColorChannel<float>::half();\r
+        dst.z = (src[bidx] - dst.x) * c_RGB2YCrCbCoeffs_f[4] + ColorChannel<float>::half();\r
+    }\r
  \r
+    /* Unary functor: maps a TypeVec<T, scn>::vec_type value to a TypeVec<T, dcn>::vec_type value by calling RGB2YCrCbConvert<bidx>. */ template <typename T, int scn, int dcn, int bidx> struct RGB2YCrCb : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
+        {\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
+            /* the source is handed over as a pointer to its first member (.x); the converter reads src[0..2] */ RGB2YCrCbConvert<bidx>(&src.x, dst);\r
              return dst;\r
          }\r
-        template <int bidx, typename D> static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst)\r
+    };\r
+    /* Partial specialisation selected for T = uchar, scn = dcn = 4: takes and returns a single uint and forwards to the uint overload of RGB2YCrCbConvert<bidx>. */ template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator ()(uint src) const\r
          {\r
-            dst.x = src[0] * c_RGB2YCrCbCoeffs_f[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_f[1] + src[2] * c_RGB2YCrCbCoeffs_f[bidx];\r
-            dst.y = (src[bidx^2] - dst.x) * c_RGB2YCrCbCoeffs_f[3] + ColorChannel<float>::half();\r
-            dst.z = (src[bidx] - dst.x) * c_RGB2YCrCbCoeffs_f[4] + ColorChannel<float>::half();\r
+            return RGB2YCrCbConvert<bidx>(src);\r
          }\r
-\r
-        template <typename T, int scn, int dcn, int bidx> struct RGB2YCrCb : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
-        {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
-                RGB2YCrCbConvert<bidx>(&src.x, dst);\r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
-        {\r
-            __device__ __forceinline__ uint operator ()(uint src) const\r
-            {\r
-                return RGB2YCrCbConvert<bidx>(src);\r
-            }\r
-        };\r
-    }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -646,66 +647,66 @@ namespace cv { namespace gpu { namespace device
          } \\r
      };\r
  \r
-    namespace detail\r
-    {\r
-        __constant__ float c_YCrCb2RGBCoeffs_f[5] = {1.403f, -0.714f, -0.344f, 1.773f};\r
-        __constant__ int   c_YCrCb2RGBCoeffs_i[5] = {22987, -11698, -5636, 29049};\r
+namespace detail\r
+{\r
+    /* Float coefficients read at indices 0..3 by the float YCrCb2RGBConvert overload; the array is declared with 5 slots but has only 4 initialisers. */ __constant__ float c_YCrCb2RGBCoeffs_f[5] = {1.403f, -0.714f, -0.344f, 1.773f};\r
+    /* Integer coefficients used with CV_DESCALE(..., yuv_shift) by the integer overloads; presumably the float values in fixed point - confirm against yuv_shift. */ __constant__ int   c_YCrCb2RGBCoeffs_i[5] = {22987, -11698, -5636, 29049};\r
  \r
-        template <int bidx, typename T, typename D> static __device__ void YCrCb2RGBConvert(const T& src, D* dst)\r
-        {\r
-            const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);\r
-            const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);\r
-            const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);\r
+    /* Integer path: derives b, g, r from src.x/.y/.z with the c_YCrCb2RGBCoeffs_i table and stores them, saturated to D, at dst[bidx], dst[1] and dst[bidx^2]. */ template <int bidx, typename T, typename D> static __device__ void YCrCb2RGBConvert(const T& src, D* dst)\r
+    {\r
+        /* intermediates are signed int so that a negative sum reaches saturate_cast unchanged */ const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);\r
+        const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);\r
+        const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);\r
  \r
-            dst[bidx] = saturate_cast<D>(b);\r
-            dst[1] = saturate_cast<D>(g);\r
-            dst[bidx^2] = saturate_cast<D>(r);\r
-        }\r
-        template <int bidx> static __device__ uint YCrCb2RGBConvert(uint src)\r
-        {\r
-            const int x = 0xff & (src);\r
-            const int y = 0xff & (src >> 8);\r
-            const int z = 0xff & (src >> 16);\r
-            \r
-            const uint b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);\r
-            const uint g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);\r
-            const uint r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);\r
+        /* bidx^2 turns 0 into 2 and 2 into 0, so b and r land at opposite ends of the output */ dst[bidx] = saturate_cast<D>(b);\r
+        dst[1] = saturate_cast<D>(g);\r
+        dst[bidx^2] = saturate_cast<D>(r);\r
+    }\r
+    template <int bidx> static __device__ uint YCrCb2RGBConvert(uint src)\r
+    {\r
+        const int x = 0xff & (src);\r
+        const int y = 0xff & (src >> 8);\r
+        const int z = 0xff & (src >> 16);\r
+        \r
+        const uint b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);\r
+        const uint g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);\r
+        const uint r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);\r
  \r
-            uint dst = 0xffu << 24;\r
+        uint dst = 0xffu << 24;\r
  \r
-            dst |= saturate_cast<uchar>(b) << (bidx * 8);\r
-            dst |= saturate_cast<uchar>(g) << 8;\r
-            dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);\r
+        dst |= saturate_cast<uchar>(b) << (bidx * 8);\r
+        dst |= saturate_cast<uchar>(g) << 8;\r
+        dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);\r
  \r
-            return dst;\r
-        }\r
-        template <int bidx, typename T> __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst)\r
-        {\r
-            dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[3];\r
-            dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[1];\r
-            dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[0];\r
-        }\r
+        return dst;\r
+    }\r
+    /* Float path: same formulas as the integer overload but with the c_YCrCb2RGBCoeffs_f table, no descaling or saturation. NOTE(review): unlike the sibling overloads this one is not declared static. */ template <int bidx, typename T> __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst)\r
+    {\r
+        dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[3];\r
+        dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[1];\r
+        dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[0];\r
+    }\r
  \r
-        template <typename T, int scn, int dcn, int bidx> struct YCrCb2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    /* Unary functor: maps a TypeVec<T, scn>::vec_type value to a TypeVec<T, dcn>::vec_type value via YCrCb2RGBConvert<bidx>, then calls setAlpha on the result. */ template <typename T, int scn, int dcn, int bidx> struct YCrCb2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                YCrCb2RGBConvert<bidx>(src, &dst.x);\r
-                setAlpha(dst, ColorChannel<T>::max());\r
+            /* the output is written through a pointer to dst's first member (.x) */ YCrCb2RGBConvert<bidx>(src, &dst.x);\r
+            /* presumably fills the fourth channel when dst has one - confirm against setAlpha's definition */ setAlpha(dst, ColorChannel<T>::max());\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    /* Partial specialisation selected for T = uchar, scn = dcn = 4: takes and returns a single uint and forwards to the uint overload of YCrCb2RGBConvert<bidx>. */ template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator ()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator ()(uint src) const\r
-            {\r
-                return YCrCb2RGBConvert<bidx>(src);\r
-            }\r
-        };\r
-    }\r
+            return YCrCb2RGBConvert<bidx>(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -719,61 +720,61 @@ namespace cv { namespace gpu { namespace device
  \r
  ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////\r
  \r
-    namespace detail\r
-    {\r
-        __constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f };\r
-        __constant__ int   c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 };\r
+namespace detail\r
+{\r
+    /* Nine float coefficients used by RGB2XYZConvert as three rows of three: [0..2] produce x, [3..5] produce y, [6..8] produce z. */ __constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f };\r
+    /* Integer counterparts in the same layout, used with CV_DESCALE(..., xyz_shift). */ __constant__ int   c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 };\r
  \r
-        template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst)\r
-        {\r
-            dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift));\r
-            dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift));\r
-            dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift));\r
-        }\r
-        template <int bidx> static __device__ __forceinline__ uint RGB2XYZConvert(uint src)\r
-        {\r
-            const uint b = 0xffu & (src >> (bidx * 8));\r
-            const uint g = 0xffu & (src >> 8);\r
-            const uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
+    /* Integer path: each output is a weighted sum of src[bidx^2], src[1] and src[bidx] using one row of c_RGB2XYZ_D65i, descaled by xyz_shift and saturated to T. */ template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst)\r
+    {\r
+        dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift));\r
+        dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift));\r
+        dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift));\r
+    }\r
+    /* Packed overload: extracts b, g, r from the low three bytes of src (positions chosen by bidx) and returns x | y << 8 | z << 16; the top byte of the result stays 0. */ template <int bidx> static __device__ __forceinline__ uint RGB2XYZConvert(uint src)\r
+    {\r
+        const uint b = 0xffu & (src >> (bidx * 8));\r
+        const uint g = 0xffu & (src >> 8);\r
+        const uint r = 0xffu & (src >> ((bidx ^ 2) * 8));\r
  \r
-            const uint x = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[0] + g * c_RGB2XYZ_D65i[1] + b * c_RGB2XYZ_D65i[2], xyz_shift));\r
-            const uint y = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[3] + g * c_RGB2XYZ_D65i[4] + b * c_RGB2XYZ_D65i[5], xyz_shift));\r
-            const uint z = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[6] + g * c_RGB2XYZ_D65i[7] + b * c_RGB2XYZ_D65i[8], xyz_shift));\r
+        /* each component is saturated to uchar before being widened back to uint for packing */ const uint x = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[0] + g * c_RGB2XYZ_D65i[1] + b * c_RGB2XYZ_D65i[2], xyz_shift));\r
+        const uint y = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[3] + g * c_RGB2XYZ_D65i[4] + b * c_RGB2XYZ_D65i[5], xyz_shift));\r
+        const uint z = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[6] + g * c_RGB2XYZ_D65i[7] + b * c_RGB2XYZ_D65i[8], xyz_shift));\r
  \r
-            uint dst = 0;\r
+        uint dst = 0;\r
  \r
-            dst |= x;\r
-            dst |= y << 8;\r
-            dst |= z << 16;\r
+        dst |= x;\r
+        dst |= y << 8;\r
+        dst |= z << 16;\r
  \r
-            return dst;\r
-        }\r
-        template <int bidx, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst)\r
-        {\r
-            dst.x = src[bidx^2] * c_RGB2XYZ_D65f[0] + src[1] * c_RGB2XYZ_D65f[1] + src[bidx] * c_RGB2XYZ_D65f[2];\r
-            dst.y = src[bidx^2] * c_RGB2XYZ_D65f[3] + src[1] * c_RGB2XYZ_D65f[4] + src[bidx] * c_RGB2XYZ_D65f[5];\r
-            dst.z = src[bidx^2] * c_RGB2XYZ_D65f[6] + src[1] * c_RGB2XYZ_D65f[7] + src[bidx] * c_RGB2XYZ_D65f[8];\r
-        }\r
+        return dst;\r
+    }\r
+    /* Float path: the same three weighted sums using c_RGB2XYZ_D65f, written to dst.x/.y/.z without descaling or saturation. */ template <int bidx, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst)\r
+    {\r
+        dst.x = src[bidx^2] * c_RGB2XYZ_D65f[0] + src[1] * c_RGB2XYZ_D65f[1] + src[bidx] * c_RGB2XYZ_D65f[2];\r
+        dst.y = src[bidx^2] * c_RGB2XYZ_D65f[3] + src[1] * c_RGB2XYZ_D65f[4] + src[bidx] * c_RGB2XYZ_D65f[5];\r
+        dst.z = src[bidx^2] * c_RGB2XYZ_D65f[6] + src[1] * c_RGB2XYZ_D65f[7] + src[bidx] * c_RGB2XYZ_D65f[8];\r
+    }\r
  \r
-        template <typename T, int scn, int dcn, int bidx> struct RGB2XYZ : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    /* Unary functor: maps a TypeVec<T, scn>::vec_type value to a TypeVec<T, dcn>::vec_type value by calling RGB2XYZConvert<bidx>. */ template <typename T, int scn, int dcn, int bidx> struct RGB2XYZ : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                RGB2XYZConvert<bidx>(&src.x, dst);\r
+            /* the source is handed over as a pointer to its first member (.x); dst is filled by reference */ RGB2XYZConvert<bidx>(&src.x, dst);\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    /* Partial specialisation selected for T = uchar, scn = dcn = 4: takes and returns a single uint and forwards to the uint overload of RGB2XYZConvert<bidx>. */ template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator()(uint src) const\r
-            {\r
-                return RGB2XYZConvert<bidx>(src);\r
-            }\r
-        };\r
-    }\r
+            return RGB2XYZConvert<bidx>(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -785,62 +786,62 @@ namespace cv { namespace gpu { namespace device
          } \\r
      };\r
  \r
-    namespace detail\r
-    {\r
-        __constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f };\r
-        __constant__ int   c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 };\r
+namespace detail\r
+{\r
+    /* Nine float coefficients used by XYZ2RGBConvert as three rows of three: [0..2] feed dst[bidx^2], [3..5] feed dst[1], [6..8] feed dst[bidx]. */ __constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f };\r
+    /* Integer counterparts in the same layout, used with CV_DESCALE(..., xyz_shift). */ __constant__ int   c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 };\r
  \r
-        template <int bidx, typename T, typename D> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst)\r
-        {\r
-            dst[bidx^2] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[0] + src.y * c_XYZ2sRGB_D65i[1] + src.z * c_XYZ2sRGB_D65i[2], xyz_shift));\r
-               dst[1]      = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[3] + src.y * c_XYZ2sRGB_D65i[4] + src.z * c_XYZ2sRGB_D65i[5], xyz_shift));\r
-               dst[bidx]   = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[6] + src.y * c_XYZ2sRGB_D65i[7] + src.z * c_XYZ2sRGB_D65i[8], xyz_shift));\r
-        }\r
-        template <int bidx> static __device__ __forceinline__ uint XYZ2RGBConvert(uint src)\r
-        {\r
-            const int x = 0xff & src;\r
-            const int y = 0xff & (src >> 8);\r
-            const int z = 0xff & (src >> 16);\r
+    /* Integer path: each output is a weighted sum of src.x, src.y and src.z using one row of c_XYZ2sRGB_D65i, descaled by xyz_shift and saturated to D. */ template <int bidx, typename T, typename D> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst)\r
+    {\r
+        dst[bidx^2] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[0] + src.y * c_XYZ2sRGB_D65i[1] + src.z * c_XYZ2sRGB_D65i[2], xyz_shift));\r
+        dst[1]      = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[3] + src.y * c_XYZ2sRGB_D65i[4] + src.z * c_XYZ2sRGB_D65i[5], xyz_shift));\r
+        dst[bidx]   = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[6] + src.y * c_XYZ2sRGB_D65i[7] + src.z * c_XYZ2sRGB_D65i[8], xyz_shift));\r
+    }\r
+    /* Packed overload: reads x, y, z from the low three bytes of src and returns r, g, b packed by bidx with the top byte set to 0xff. */ template <int bidx> static __device__ __forceinline__ uint XYZ2RGBConvert(uint src)\r
+    {\r
+        /* components are held as int because the coefficient table contains negative entries */ const int x = 0xff & src;\r
+        const int y = 0xff & (src >> 8);\r
+        const int z = 0xff & (src >> 16);\r
  \r
-            const uint r = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[0] + y * c_XYZ2sRGB_D65i[1] + z * c_XYZ2sRGB_D65i[2], xyz_shift));\r
-               const uint g = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[3] + y * c_XYZ2sRGB_D65i[4] + z * c_XYZ2sRGB_D65i[5], xyz_shift));\r
-               const uint b = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[6] + y * c_XYZ2sRGB_D65i[7] + z * c_XYZ2sRGB_D65i[8], xyz_shift));\r
+        /* saturation is applied to the signed sum before the value is stored as uint */ const uint r = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[0] + y * c_XYZ2sRGB_D65i[1] + z * c_XYZ2sRGB_D65i[2], xyz_shift));\r
+        const uint g = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[3] + y * c_XYZ2sRGB_D65i[4] + z * c_XYZ2sRGB_D65i[5], xyz_shift));\r
+        const uint b = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[6] + y * c_XYZ2sRGB_D65i[7] + z * c_XYZ2sRGB_D65i[8], xyz_shift));\r
  \r
-            uint dst = 0xffu << 24;\r
+        uint dst = 0xffu << 24;\r
  \r
-            dst |= b << (bidx * 8);\r
-            dst |= g << 8;\r
-            dst |= r << ((bidx ^ 2) * 8);\r
+        dst |= b << (bidx * 8);\r
+        dst |= g << 8;\r
+        dst |= r << ((bidx ^ 2) * 8);\r
  \r
-            return dst;\r
-        }\r
-        template <int bidx, typename T> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst)\r
-        {\r
-            dst[bidx^2] = src.x * c_XYZ2sRGB_D65f[0] + src.y * c_XYZ2sRGB_D65f[1] + src.z * c_XYZ2sRGB_D65f[2];\r
-               dst[1]      = src.x * c_XYZ2sRGB_D65f[3] + src.y * c_XYZ2sRGB_D65f[4] + src.z * c_XYZ2sRGB_D65f[5];\r
-               dst[bidx]   = src.x * c_XYZ2sRGB_D65f[6] + src.y * c_XYZ2sRGB_D65f[7] + src.z * c_XYZ2sRGB_D65f[8];\r
-        }\r
+        return dst;\r
+    }\r
+    /* Float path: the same three weighted sums using c_XYZ2sRGB_D65f, written to dst without descaling or saturation. */ template <int bidx, typename T> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst)\r
+    {\r
+        dst[bidx^2] = src.x * c_XYZ2sRGB_D65f[0] + src.y * c_XYZ2sRGB_D65f[1] + src.z * c_XYZ2sRGB_D65f[2];\r
+        dst[1]      = src.x * c_XYZ2sRGB_D65f[3] + src.y * c_XYZ2sRGB_D65f[4] + src.z * c_XYZ2sRGB_D65f[5];\r
+        dst[bidx]   = src.x * c_XYZ2sRGB_D65f[6] + src.y * c_XYZ2sRGB_D65f[7] + src.z * c_XYZ2sRGB_D65f[8];\r
+    }\r
  \r
-        template <typename T, int scn, int dcn, int bidx> struct XYZ2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    /* Unary functor: maps a TypeVec<T, scn>::vec_type value to a TypeVec<T, dcn>::vec_type value via XYZ2RGBConvert<bidx>, then calls setAlpha on the result. */ template <typename T, int scn, int dcn, int bidx> struct XYZ2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                XYZ2RGBConvert<bidx>(src, &dst.x);\r
-                setAlpha(dst, ColorChannel<T>::max());\r
+            /* the output is written through a pointer to dst's first member (.x) */ XYZ2RGBConvert<bidx>(src, &dst.x);\r
+            /* presumably fills the fourth channel when dst has one - confirm against setAlpha's definition */ setAlpha(dst, ColorChannel<T>::max());\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    /* Partial specialisation selected for T = uchar, scn = dcn = 4: takes and returns a single uint and forwards to the uint overload of XYZ2RGBConvert<bidx>. */ template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator()(uint src) const\r
-            {\r
-                return XYZ2RGBConvert<bidx>(src);\r
-            }\r
-        };\r
-    }\r
+            return XYZ2RGBConvert<bidx>(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -854,123 +855,123 @@ namespace cv { namespace gpu { namespace device
  \r
  ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////\r
  \r
-    namespace detail\r
-    {\r
-        __constant__ int c_HsvDivTable   [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};\r
-        __constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};\r
-        __constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};\r
-\r
-        template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const uchar* src, D& dst)\r
-        {\r
-            const int hsv_shift = 12;\r
-            const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;\r
-\r
-            int b = src[bidx], g = src[1], r = src[bidx^2];\r
-            int h, s, v = b;\r
-            int vmin = b, diff;\r
-            int vr, vg;\r
-\r
-            v = ::max(v, g);\r
-            v = ::max(v, r);\r
-            vmin = ::min(vmin, g);\r
-            vmin = ::min(vmin, r);\r
-\r
-            diff = v - vmin;\r
-            vr = (v == r) * -1;\r
-            vg = (v == g) * -1;\r
-\r
-            s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;\r
-            h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
-            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;\r
-            h += (h < 0) * hr;\r
-\r
-            dst.x = saturate_cast<uchar>(h);\r
-            dst.y = (uchar)s;\r
-            dst.z = (uchar)v;\r
-        }\r
-        template <int bidx, int hr> static __device__ uint RGB2HSVConvert(uint src)\r
-        {\r
-            const int hsv_shift = 12;\r
-            const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;\r
-\r
-            const int b = 0xff & (src >> (bidx * 8));\r
-            const int g = 0xff & (src >> 8);\r
-            const int r = 0xff & (src >> ((bidx ^ 2) * 8));\r
-            \r
-            int h, s, v = b;\r
-            int vmin = b, diff;\r
-            int vr, vg;\r
-\r
-            v = ::max(v, g);\r
-            v = ::max(v, r);\r
-            vmin = ::min(vmin, g);\r
-            vmin = ::min(vmin, r);\r
-\r
-            diff = v - vmin;\r
-            vr = (v == r) * -1;\r
-            vg = (v == g) * -1;\r
-\r
-            s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;\r
-            h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
-            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;\r
-            h += (h < 0) * hr;\r
-\r
-            uint dst = 0;\r
-\r
-            dst |= saturate_cast<uchar>(h);\r
-            dst |= (0xffu & s) << 8;\r
-            dst |= (0xffu & v) << 16;\r
+namespace detail\r
+{\r
+    __constant__ int c_HsvDivTable   [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};\r
+    __constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};\r
+    __constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};\r
  \r
-            return dst;\r
-        }\r
-        template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const float* src, D& dst)\r
-        {\r
-            const float hscale = hr * (1.f / 360.f);\r
+    template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const uchar* src, D& dst)\r
+    {\r
+        const int hsv_shift = 12;\r
+        const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;\r
+\r
+        int b = src[bidx], g = src[1], r = src[bidx^2];\r
+        int h, s, v = b;\r
+        int vmin = b, diff;\r
+        int vr, vg;\r
+\r
+        v = ::max(v, g);\r
+        v = ::max(v, r);\r
+        vmin = ::min(vmin, g);\r
+        vmin = ::min(vmin, r);\r
+\r
+        diff = v - vmin;\r
+        vr = (v == r) * -1;\r
+        vg = (v == g) * -1;\r
+\r
+        s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;\r
+        h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
+        h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;\r
+        h += (h < 0) * hr;\r
+\r
+        dst.x = saturate_cast<uchar>(h);\r
+        dst.y = (uchar)s;\r
+        dst.z = (uchar)v;\r
+    }\r
+    template <int bidx, int hr> static __device__ uint RGB2HSVConvert(uint src)\r
+    {\r
+        const int hsv_shift = 12;\r
+        const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;\r
+\r
+        const int b = 0xff & (src >> (bidx * 8));\r
+        const int g = 0xff & (src >> 8);\r
+        const int r = 0xff & (src >> ((bidx ^ 2) * 8));\r
+        \r
+        int h, s, v = b;\r
+        int vmin = b, diff;\r
+        int vr, vg;\r
+\r
+        v = ::max(v, g);\r
+        v = ::max(v, r);\r
+        vmin = ::min(vmin, g);\r
+        vmin = ::min(vmin, r);\r
+\r
+        diff = v - vmin;\r
+        vr = (v == r) * -1;\r
+        vg = (v == g) * -1;\r
+\r
+        s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;\r
+        h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));\r
+        h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;\r
+        h += (h < 0) * hr;\r
+\r
+        uint dst = 0;\r
+\r
+        dst |= saturate_cast<uchar>(h);\r
+        dst |= (0xffu & s) << 8;\r
+        dst |= (0xffu & v) << 16;\r
+\r
+        return dst;\r
+    }\r
+    template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const float* src, D& dst)\r
+    {\r
+        const float hscale = hr * (1.f / 360.f);\r
  \r
-            float b = src[bidx], g = src[1], r = src[bidx^2];\r
-            float h, s, v;\r
+        float b = src[bidx], g = src[1], r = src[bidx^2];\r
+        float h, s, v;\r
  \r
-            float vmin, diff;\r
+        float vmin, diff;\r
  \r
-            v = vmin = r;\r
-            v = fmax(v, g);\r
-            v = fmax(v, b);\r
-            vmin = fmin(vmin, g);\r
-            vmin = fmin(vmin, b);\r
+        v = vmin = r;\r
+        v = fmax(v, g);\r
+        v = fmax(v, b);\r
+        vmin = fmin(vmin, g);\r
+        vmin = fmin(vmin, b);\r
  \r
-            diff = v - vmin;\r
-            s = diff / (float)(::fabs(v) + numeric_limits<float>::epsilon());\r
-            diff = (float)(60. / (diff + numeric_limits<float>::epsilon()));\r
+        diff = v - vmin;\r
+        s = diff / (float)(::fabs(v) + numeric_limits<float>::epsilon());\r
+        diff = (float)(60. / (diff + numeric_limits<float>::epsilon()));\r
  \r
-            h  = (v == r) * (g - b) * diff;\r
-            h += (v != r && v == g) * ((b - r) * diff + 120.f);\r
-            h += (v != r && v != g) * ((r - g) * diff + 240.f);\r
-            h += (h < 0) * 360.f;\r
+        h  = (v == r) * (g - b) * diff;\r
+        h += (v != r && v == g) * ((b - r) * diff + 120.f);\r
+        h += (v != r && v != g) * ((r - g) * diff + 240.f);\r
+        h += (h < 0) * 360.f;\r
  \r
-            dst.x = h * hscale;\r
-            dst.y = s;\r
-            dst.z = v;\r
-        }\r
+        dst.x = h * hscale;\r
+        dst.y = s;\r
+        dst.z = v;\r
+    }\r
  \r
-        template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HSV : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HSV : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                RGB2HSVConvert<bidx, hr>(&src.x, dst);\r
+            RGB2HSVConvert<bidx, hr>(&src.x, dst);\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator()(uint src) const\r
-            {\r
-                return RGB2HSVConvert<bidx, hr>(src);\r
-            }\r
-        };\r
-    }\r
+            return RGB2HSVConvert<bidx, hr>(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -1006,97 +1007,97 @@ namespace cv { namespace gpu { namespace device
          } \\r
      };\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    __constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };\r
+\r
+    template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst)\r
      {\r
-        __constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };\r
+        const float hscale = 6.f / hr;\r
+        \r
+        float h = src.x, s = src.y, v = src.z;\r
+        float b = v, g = v, r = v;\r
  \r
-        template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst)\r
+        if (s != 0)\r
          {\r
-            const float hscale = 6.f / hr;\r
-            \r
-            float h = src.x, s = src.y, v = src.z;\r
-            float b = v, g = v, r = v;\r
-\r
-            if (s != 0)\r
-            {\r
-                h *= hscale;\r
-\r
-                if( h < 0 )\r
-                    do h += 6; while( h < 0 );\r
-                else if( h >= 6 )\r
-                    do h -= 6; while( h >= 6 );\r
-\r
-                int sector = __float2int_rd(h);\r
-                h -= sector;\r
-\r
-                float tab[4];\r
-                tab[0] = v;\r
-                tab[1] = v * (1.f - s);\r
-                tab[2] = v * (1.f - s * h);\r
-                tab[3] = v * (1.f - s * (1.f - h));\r
-\r
-                b = tab[c_HsvSectorData[sector][0]];\r
-                g = tab[c_HsvSectorData[sector][1]];\r
-                r = tab[c_HsvSectorData[sector][2]];\r
-            }\r
-\r
-            dst[bidx] = b;\r
-            dst[1] = g;\r
-            dst[bidx^2] = r;\r
-        }\r
-        template <int bidx, int HR, typename T> static __device__ void HSV2RGBConvert(const T& src, uchar* dst)\r
-        {\r
-            float3 buf;\r
+            h *= hscale;\r
+\r
+            if( h < 0 )\r
+                do h += 6; while( h < 0 );\r
+            else if( h >= 6 )\r
+                do h -= 6; while( h >= 6 );\r
  \r
-            buf.x = src.x;\r
-            buf.y = src.y * (1.f / 255.f);\r
-            buf.z = src.z * (1.f / 255.f);\r
+            int sector = __float2int_rd(h);\r
+            h -= sector;\r
  \r
-            HSV2RGBConvert<bidx, HR>(buf, &buf.x);\r
+            float tab[4];\r
+            tab[0] = v;\r
+            tab[1] = v * (1.f - s);\r
+            tab[2] = v * (1.f - s * h);\r
+            tab[3] = v * (1.f - s * (1.f - h));\r
  \r
-            dst[0] = saturate_cast<uchar>(buf.x * 255.f);\r
-            dst[1] = saturate_cast<uchar>(buf.y * 255.f);\r
-            dst[2] = saturate_cast<uchar>(buf.z * 255.f);\r
+            b = tab[c_HsvSectorData[sector][0]];\r
+            g = tab[c_HsvSectorData[sector][1]];\r
+            r = tab[c_HsvSectorData[sector][2]];\r
          }\r
-        template <int bidx, int hr> static __device__ uint HSV2RGBConvert(uint src)\r
-        {\r
-            float3 buf;\r
  \r
-            buf.x = src & 0xff;\r
-            buf.y = ((src >> 8) & 0xff) * (1.f/255.f);\r
-            buf.z = ((src >> 16) & 0xff) * (1.f/255.f);\r
+        dst[bidx] = b;\r
+        dst[1] = g;\r
+        dst[bidx^2] = r;\r
+    }\r
+    template <int bidx, int HR, typename T> static __device__ void HSV2RGBConvert(const T& src, uchar* dst)\r
+    {\r
+        float3 buf;\r
  \r
-            HSV2RGBConvert<bidx, hr>(buf, &buf.x);\r
+        buf.x = src.x;\r
+        buf.y = src.y * (1.f / 255.f);\r
+        buf.z = src.z * (1.f / 255.f);\r
  \r
-            uint dst = 0xffu << 24;\r
+        HSV2RGBConvert<bidx, HR>(buf, &buf.x);\r
+\r
+        dst[0] = saturate_cast<uchar>(buf.x * 255.f);\r
+        dst[1] = saturate_cast<uchar>(buf.y * 255.f);\r
+        dst[2] = saturate_cast<uchar>(buf.z * 255.f);\r
+    }\r
+    template <int bidx, int hr> static __device__ uint HSV2RGBConvert(uint src)\r
+    {\r
+        float3 buf;\r
  \r
-            dst |= saturate_cast<uchar>(buf.x * 255.f);\r
-            dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
-            dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
+        buf.x = src & 0xff;\r
+        buf.y = ((src >> 8) & 0xff) * (1.f/255.f);\r
+        buf.z = ((src >> 16) & 0xff) * (1.f/255.f);\r
  \r
-            return dst;\r
-        }\r
+        HSV2RGBConvert<bidx, hr>(buf, &buf.x);\r
  \r
-        template <typename T, int scn, int dcn, int bidx, int hr> struct HSV2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+        uint dst = 0xffu << 24;\r
+\r
+        dst |= saturate_cast<uchar>(buf.x * 255.f);\r
+        dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
+        dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
+\r
+        return dst;\r
+    }\r
+\r
+    template <typename T, int scn, int dcn, int bidx, int hr> struct HSV2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                HSV2RGBConvert<bidx, hr>(src, &dst.x);\r
-                setAlpha(dst, ColorChannel<T>::max());\r
+            HSV2RGBConvert<bidx, hr>(src, &dst.x);\r
+            setAlpha(dst, ColorChannel<T>::max());\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator()(uint src) const\r
-            {\r
-                return HSV2RGBConvert<bidx, hr>(src);\r
-            }\r
-        };\r
-    }\r
+            return HSV2RGBConvert<bidx, hr>(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -1134,94 +1135,94 @@ namespace cv { namespace gpu { namespace device
  \r
  /////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////\r
  \r
-    namespace detail\r
+namespace detail\r
+{\r
+    template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst)\r
      {\r
-        template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst)\r
-        {\r
-            const float hscale = hr * (1.f / 360.f);\r
-\r
-            float b = src[bidx], g = src[1], r = src[bidx^2];\r
-            float h = 0.f, s = 0.f, l;\r
-            float vmin, vmax, diff;\r
+        const float hscale = hr * (1.f / 360.f);\r
  \r
-            vmax = vmin = r;\r
-            vmax = fmax(vmax, g);\r
-            vmax = fmax(vmax, b);\r
-            vmin = fmin(vmin, g);\r
-            vmin = fmin(vmin, b);\r
+        float b = src[bidx], g = src[1], r = src[bidx^2];\r
+        float h = 0.f, s = 0.f, l;\r
+        float vmin, vmax, diff;\r
  \r
-            diff = vmax - vmin;\r
-            l = (vmax + vmin) * 0.5f;\r
+        vmax = vmin = r;\r
+        vmax = fmax(vmax, g);\r
+        vmax = fmax(vmax, b);\r
+        vmin = fmin(vmin, g);\r
+        vmin = fmin(vmin, b);\r
  \r
-            if (diff > numeric_limits<float>::epsilon())\r
-            {\r
-                s = (l < 0.5f) * diff / (vmax + vmin);\r
-                s += (l >= 0.5f) * diff / (2.0f - vmax - vmin);\r
+        diff = vmax - vmin;\r
+        l = (vmax + vmin) * 0.5f;\r
  \r
-                diff = 60.f / diff;\r
+        if (diff > numeric_limits<float>::epsilon())\r
+        {\r
+            s = (l < 0.5f) * diff / (vmax + vmin);\r
+            s += (l >= 0.5f) * diff / (2.0f - vmax - vmin);\r
  \r
-                h  = (vmax == r) * (g - b) * diff;\r
-                h += (vmax != r && vmax == g) * ((b - r) * diff + 120.f);\r
-                h += (vmax != r && vmax != g) * ((r - g) * diff + 240.f);\r
-                h += (h < 0.f) * 360.f;\r
-            }\r
+            diff = 60.f / diff;\r
  \r
-            dst.x = h * hscale;\r
-            dst.y = l;\r
-            dst.z = s;\r
+            h  = (vmax == r) * (g - b) * diff;\r
+            h += (vmax != r && vmax == g) * ((b - r) * diff + 120.f);\r
+            h += (vmax != r && vmax != g) * ((r - g) * diff + 240.f);\r
+            h += (h < 0.f) * 360.f;\r
          }\r
-        template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const uchar* src, D& dst)\r
-        {\r
-            float3 buf;\r
  \r
-            buf.x = src[0] * (1.f / 255.f);\r
-            buf.y = src[1] * (1.f / 255.f);\r
-            buf.z = src[2] * (1.f / 255.f);\r
+        dst.x = h * hscale;\r
+        dst.y = l;\r
+        dst.z = s;\r
+    }\r
+    template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const uchar* src, D& dst)\r
+    {\r
+        float3 buf;\r
  \r
-            RGB2HLSConvert<bidx, hr>(&buf.x, buf);\r
+        buf.x = src[0] * (1.f / 255.f);\r
+        buf.y = src[1] * (1.f / 255.f);\r
+        buf.z = src[2] * (1.f / 255.f);\r
  \r
-            dst.x = saturate_cast<uchar>(buf.x);\r
-            dst.y = saturate_cast<uchar>(buf.y*255.f);\r
-            dst.z = saturate_cast<uchar>(buf.z*255.f);\r
-        }\r
-        template <int bidx, int hr> static __device__ uint RGB2HLSConvert(uint src)\r
-        {\r
-            float3 buf;\r
+        RGB2HLSConvert<bidx, hr>(&buf.x, buf);\r
+\r
+        dst.x = saturate_cast<uchar>(buf.x);\r
+        dst.y = saturate_cast<uchar>(buf.y*255.f);\r
+        dst.z = saturate_cast<uchar>(buf.z*255.f);\r
+    }\r
+    template <int bidx, int hr> static __device__ uint RGB2HLSConvert(uint src)\r
+    {\r
+        float3 buf;\r
  \r
-            buf.x = (0xff & src) * (1.f / 255.f);\r
-            buf.y = (0xff & (src >> 8)) * (1.f / 255.f);\r
-            buf.z = (0xff & (src >> 16)) * (1.f / 255.f);\r
+        buf.x = (0xff & src) * (1.f / 255.f);\r
+        buf.y = (0xff & (src >> 8)) * (1.f / 255.f);\r
+        buf.z = (0xff & (src >> 16)) * (1.f / 255.f);\r
  \r
-            RGB2HLSConvert<bidx, hr>(&buf.x, buf);\r
+        RGB2HLSConvert<bidx, hr>(&buf.x, buf);\r
  \r
-            uint dst = 0xffu << 24;\r
+        uint dst = 0xffu << 24;\r
  \r
-            dst |= saturate_cast<uchar>(buf.x);\r
-            dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
-            dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
+        dst |= saturate_cast<uchar>(buf.x);\r
+        dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
+        dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
  \r
-            return dst;\r
-        }\r
+        return dst;\r
+    }\r
  \r
-        template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HLS : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HLS : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                RGB2HLSConvert<bidx, hr>(&src.x, dst);\r
+            RGB2HLSConvert<bidx, hr>(&src.x, dst);\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator()(uint src) const\r
-            {\r
-                return RGB2HLSConvert<bidx, hr>(src);\r
-            }\r
-        };\r
-    }\r
+            return RGB2HLSConvert<bidx, hr>(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -1257,103 +1258,103 @@ namespace cv { namespace gpu { namespace device
          } \\r
      };\r
  \r
-    namespace detail\r
-    {\r
-        __constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };\r
-\r
-        template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst)\r
-        {\r
-            const float hscale = 6.0f / hr;\r
+namespace detail\r
+{\r
+    __constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };\r
  \r
-            float h = src.x, l = src.y, s = src.z;\r
-            float b = l, g = l, r = l;\r
+    template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst)\r
+    {\r
+        const float hscale = 6.0f / hr;\r
  \r
-            if (s != 0)\r
-            {\r
-                float p2  = (l <= 0.5f) * l * (1 + s);\r
-                      p2 += (l > 0.5f) * (l + s - l * s);\r
-                float p1 = 2 * l - p2;\r
+        float h = src.x, l = src.y, s = src.z;\r
+        float b = l, g = l, r = l;\r
  \r
-                h *= hscale;\r
+        if (s != 0)\r
+        {\r
+            float p2  = (l <= 0.5f) * l * (1 + s);\r
+                  p2 += (l > 0.5f) * (l + s - l * s);\r
+            float p1 = 2 * l - p2;\r
  \r
-                if( h < 0 )\r
-                    do h += 6; while( h < 0 );\r
-                else if( h >= 6 )\r
-                    do h -= 6; while( h >= 6 );\r
+            h *= hscale;\r
  \r
-                int sector;\r
-                sector = __float2int_rd(h);\r
+            if( h < 0 )\r
+                do h += 6; while( h < 0 );\r
+            else if( h >= 6 )\r
+                do h -= 6; while( h >= 6 );\r
  \r
-                h -= sector;\r
+            int sector;\r
+            sector = __float2int_rd(h);\r
  \r
-                float tab[4];\r
-                tab[0] = p2;\r
-                tab[1] = p1;\r
-                tab[2] = p1 + (p2 - p1) * (1 - h);\r
-                tab[3] = p1 + (p2 - p1) * h;\r
+            h -= sector;\r
  \r
-                b = tab[c_HlsSectorData[sector][0]];\r
-                g = tab[c_HlsSectorData[sector][1]];\r
-                r = tab[c_HlsSectorData[sector][2]];\r
-            }\r
+            float tab[4];\r
+            tab[0] = p2;\r
+            tab[1] = p1;\r
+            tab[2] = p1 + (p2 - p1) * (1 - h);\r
+            tab[3] = p1 + (p2 - p1) * h;\r
  \r
-            dst[bidx] = b;\r
-            dst[1] = g;\r
-            dst[bidx^2] = r;\r
+            b = tab[c_HlsSectorData[sector][0]];\r
+            g = tab[c_HlsSectorData[sector][1]];\r
+            r = tab[c_HlsSectorData[sector][2]];\r
          }\r
-        template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, uchar* dst)\r
-        {\r
-            float3 buf;\r
  \r
-            buf.x = src.x;\r
-            buf.y = src.y * (1.f / 255.f);\r
-            buf.z = src.z * (1.f / 255.f);\r
+        dst[bidx] = b;\r
+        dst[1] = g;\r
+        dst[bidx^2] = r;\r
+    }\r
+    template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, uchar* dst)\r
+    {\r
+        float3 buf;\r
  \r
-            HLS2RGBConvert<bidx, hr>(buf, &buf.x);\r
+        buf.x = src.x;\r
+        buf.y = src.y * (1.f / 255.f);\r
+        buf.z = src.z * (1.f / 255.f);\r
  \r
-            dst[0] = saturate_cast<uchar>(buf.x * 255.f);\r
-            dst[1] = saturate_cast<uchar>(buf.y * 255.f);\r
-            dst[2] = saturate_cast<uchar>(buf.z * 255.f);\r
-        }\r
-        template <int bidx, int hr> static __device__ uint HLS2RGBConvert(uint src)\r
-        {\r
-            float3 buf;\r
+        HLS2RGBConvert<bidx, hr>(buf, &buf.x);\r
  \r
-            buf.x = 0xff & src;\r
-            buf.y = (0xff & (src >> 8)) * (1.f / 255.f);\r
-            buf.z = (0xff & (src >> 16)) * (1.f / 255.f);\r
+        dst[0] = saturate_cast<uchar>(buf.x * 255.f);\r
+        dst[1] = saturate_cast<uchar>(buf.y * 255.f);\r
+        dst[2] = saturate_cast<uchar>(buf.z * 255.f);\r
+    }\r
+    template <int bidx, int hr> static __device__ uint HLS2RGBConvert(uint src)\r
+    {\r
+        float3 buf;\r
  \r
-            HLS2RGBConvert<bidx, hr>(buf, &buf.x);\r
+        buf.x = 0xff & src;\r
+        buf.y = (0xff & (src >> 8)) * (1.f / 255.f);\r
+        buf.z = (0xff & (src >> 16)) * (1.f / 255.f);\r
  \r
-            uint dst = 0xffu << 24;\r
+        HLS2RGBConvert<bidx, hr>(buf, &buf.x);\r
  \r
-            dst |= saturate_cast<uchar>(buf.x * 255.f);\r
-            dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
-            dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
+        uint dst = 0xffu << 24;\r
  \r
-            return dst;\r
-        }\r
+        dst |= saturate_cast<uchar>(buf.x * 255.f);\r
+        dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;\r
+        dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;\r
  \r
-        template <typename T, int scn, int dcn, int bidx, int hr> struct HLS2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+        return dst;\r
+    }\r
+\r
+    template <typename T, int scn, int dcn, int bidx, int hr> struct HLS2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>\r
+    {\r
+        __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
          {\r
-            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const\r
-            {\r
-                typename TypeVec<T, dcn>::vec_type dst;\r
+            typename TypeVec<T, dcn>::vec_type dst;\r
  \r
-                HLS2RGBConvert<bidx, hr>(src, &dst.x);\r
-                setAlpha(dst, ColorChannel<T>::max());\r
+            HLS2RGBConvert<bidx, hr>(src, &dst.x);\r
+            setAlpha(dst, ColorChannel<T>::max());\r
  \r
-                return dst;\r
-            }\r
-        };\r
-        template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+            return dst;\r
+        }\r
+    };\r
+    template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>\r
+    {\r
+        __device__ __forceinline__ uint operator()(uint src) const\r
          {\r
-            __device__ __forceinline__ uint operator()(uint src) const\r
-            {\r
-                return HLS2RGBConvert<bidx, hr>(src);\r
-            }\r
-        };\r
-    }\r
+            return HLS2RGBConvert<bidx, hr>(src);\r
+        }\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) \\r
      template <typename T> struct name ## _traits \\r
@@ -1388,6 +1389,7 @@ namespace cv { namespace gpu { namespace device
              return functor_type(); \\r
          } \\r
      };\r
-}}}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_COLOR_DETAIL_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp

index 212dcbc..4e16be9 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp
@@ -47,364 +47,365 @@
  #include "../vec_traits.hpp"\r
  #include "../functional.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace detail\r
  {\r
-    namespace detail\r
-    {\r
-        //! Mask accessor\r
+    //! Mask accessor\r
  \r
-        struct MaskReader\r
-        {\r
-            explicit MaskReader(const PtrStepb& mask_): mask(mask_) {}\r
+    struct MaskReader\r
+    {\r
+        explicit MaskReader(const PtrStepb& mask_): mask(mask_) {}\r
  \r
-            __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }\r
+        __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }\r
  \r
-            const PtrStepb mask;\r
-        };\r
+        const PtrStepb mask;\r
+    };\r
  \r
-        struct NoMask \r
-        {\r
-            __device__ __forceinline__ bool operator()(int y, int x) const { return true; } \r
-        };\r
+    struct NoMask \r
+    {\r
+        __device__ __forceinline__ bool operator()(int y, int x) const { return true; } \r
+    };\r
  \r
-        //! Read Write Traits\r
+    //! Read Write Traits\r
  \r
-        template <typename T, typename D, int shift> struct UnaryReadWriteTraits\r
-        {\r
-            typedef typename TypeVec<T, shift>::vec_type read_type;\r
-            typedef typename TypeVec<D, shift>::vec_type write_type;\r
-        };\r
+    template <typename T, typename D, int shift> struct UnaryReadWriteTraits\r
+    {\r
+        typedef typename TypeVec<T, shift>::vec_type read_type;\r
+        typedef typename TypeVec<D, shift>::vec_type write_type;\r
+    };\r
  \r
-        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits\r
-        {\r
-            typedef typename TypeVec<T1, shift>::vec_type read_type1;\r
-            typedef typename TypeVec<T2, shift>::vec_type read_type2;\r
-            typedef typename TypeVec<D, shift>::vec_type write_type;\r
-        };\r
+    template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits\r
+    {\r
+        typedef typename TypeVec<T1, shift>::vec_type read_type1;\r
+        typedef typename TypeVec<T2, shift>::vec_type read_type2;\r
+        typedef typename TypeVec<D, shift>::vec_type write_type;\r
+    };\r
  \r
-        //! Transform kernels\r
+    //! Transform kernels\r
  \r
-        template <int shift> struct OpUnroller;\r
-        template <> struct OpUnroller<1>\r
+    template <int shift> struct OpUnroller;\r
+    template <> struct OpUnroller<1>\r
+    {\r
+        template <typename T, typename D, typename UnOp, typename Mask>\r
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)\r
          {\r
-            template <typename T, typename D, typename UnOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.x = op(src.x);\r
-            }\r
+            if (mask(y, x_shifted))\r
+                dst.x = op(src.x);\r
+        }\r
  \r
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.x = op(src1.x, src2.x);\r
-            }\r
-        };\r
-        template <> struct OpUnroller<2>\r
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)\r
          {\r
-            template <typename T, typename D, typename UnOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.x = op(src.x);\r
-                if (mask(y, x_shifted + 1))\r
-                    dst.y = op(src.y);\r
-            }\r
-\r
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.x = op(src1.x, src2.x);\r
-                if (mask(y, x_shifted + 1))\r
-                    dst.y = op(src1.y, src2.y);\r
-            }\r
-        };\r
-        template <> struct OpUnroller<3>\r
+            if (mask(y, x_shifted))\r
+                dst.x = op(src1.x, src2.x);\r
+        }\r
+    };\r
+    template <> struct OpUnroller<2>\r
+    {\r
+        template <typename T, typename D, typename UnOp, typename Mask>\r
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)\r
          {\r
-            template <typename T, typename D, typename UnOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.x = op(src.x);\r
-                if (mask(y, x_shifted + 1))\r
-                    dst.y = op(src.y);\r
-                if (mask(y, x_shifted + 2))\r
-                    dst.z = op(src.z);\r
-            }\r
+            if (mask(y, x_shifted))\r
+                dst.x = op(src.x);\r
+            if (mask(y, x_shifted + 1))\r
+                dst.y = op(src.y);\r
+        }\r
  \r
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.x = op(src1.x, src2.x);\r
-                if (mask(y, x_shifted + 1))\r
-                    dst.y = op(src1.y, src2.y);\r
-                if (mask(y, x_shifted + 2))\r
-                    dst.z = op(src1.z, src2.z);\r
-            }\r
-        };\r
-        template <> struct OpUnroller<4>\r
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)\r
          {\r
-            template <typename T, typename D, typename UnOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.x = op(src.x);\r
-                if (mask(y, x_shifted + 1))\r
-                    dst.y = op(src.y);\r
-                if (mask(y, x_shifted + 2))\r
-                    dst.z = op(src.z);\r
-                if (mask(y, x_shifted + 3))\r
-                    dst.w = op(src.w);\r
-            }\r
-\r
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.x = op(src1.x, src2.x);\r
-                if (mask(y, x_shifted + 1))\r
-                    dst.y = op(src1.y, src2.y);\r
-                if (mask(y, x_shifted + 2))\r
-                    dst.z = op(src1.z, src2.z);\r
-                if (mask(y, x_shifted + 3))\r
-                    dst.w = op(src1.w, src2.w);\r
-            }\r
-        };\r
-        template <> struct OpUnroller<8>\r
+            if (mask(y, x_shifted))\r
+                dst.x = op(src1.x, src2.x);\r
+            if (mask(y, x_shifted + 1))\r
+                dst.y = op(src1.y, src2.y);\r
+        }\r
+    };\r
+    template <> struct OpUnroller<3>\r
+    {\r
+        template <typename T, typename D, typename UnOp, typename Mask>\r
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
          {\r
-            template <typename T, typename D, typename UnOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.a0 = op(src.a0);\r
-                if (mask(y, x_shifted + 1))\r
-                    dst.a1 = op(src.a1);\r
-                if (mask(y, x_shifted + 2))\r
-                    dst.a2 = op(src.a2);\r
-                if (mask(y, x_shifted + 3))\r
-                    dst.a3 = op(src.a3);\r
-                if (mask(y, x_shifted + 4))\r
-                    dst.a4 = op(src.a4);\r
-                if (mask(y, x_shifted + 5))\r
-                    dst.a5 = op(src.a5);\r
-                if (mask(y, x_shifted + 6))\r
-                    dst.a6 = op(src.a6);\r
-                if (mask(y, x_shifted + 7))\r
-                    dst.a7 = op(src.a7);\r
-            }\r
-\r
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
-            {\r
-                if (mask(y, x_shifted))\r
-                    dst.a0 = op(src1.a0, src2.a0);\r
-                if (mask(y, x_shifted + 1))\r
-                    dst.a1 = op(src1.a1, src2.a1);\r
-                if (mask(y, x_shifted + 2))\r
-                    dst.a2 = op(src1.a2, src2.a2);\r
-                if (mask(y, x_shifted + 3))\r
-                    dst.a3 = op(src1.a3, src2.a3);\r
-                if (mask(y, x_shifted + 4))\r
-                    dst.a4 = op(src1.a4, src2.a4);\r
-                if (mask(y, x_shifted + 5))\r
-                    dst.a5 = op(src1.a5, src2.a5);\r
-                if (mask(y, x_shifted + 6))\r
-                    dst.a6 = op(src1.a6, src2.a6);\r
-                if (mask(y, x_shifted + 7))\r
-                    dst.a7 = op(src1.a7, src2.a7);\r
-            }\r
-        };\r
+            if (mask(y, x_shifted))\r
+                dst.x = op(src.x);\r
+            if (mask(y, x_shifted + 1))\r
+                dst.y = op(src.y);\r
+            if (mask(y, x_shifted + 2))\r
+                dst.z = op(src.z);\r
+        }\r
  \r
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
+        {\r
+            if (mask(y, x_shifted))\r
+                dst.x = op(src1.x, src2.x);\r
+            if (mask(y, x_shifted + 1))\r
+                dst.y = op(src1.y, src2.y);\r
+            if (mask(y, x_shifted + 2))\r
+                dst.z = op(src1.z, src2.z);\r
+        }\r
+    };\r
+    template <> struct OpUnroller<4>\r
+    {\r
          template <typename T, typename D, typename UnOp, typename Mask>\r
-        __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)\r
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
          {\r
-            typedef TransformFunctorTraits<UnOp> ft;\r
-            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;\r
-            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;\r
-\r
-            const int x = threadIdx.x + blockIdx.x * blockDim.x;\r
-            const int y = threadIdx.y + blockIdx.y * blockDim.y;\r
-            const int x_shifted = x * ft::smart_shift;\r
-\r
-            if (y < src_.rows)\r
-            {\r
-                const T* src = src_.ptr(y);\r
-                D* dst = dst_.ptr(y);\r
-\r
-                if (x_shifted + ft::smart_shift - 1 < src_.cols)\r
-                {\r
-                    const read_type src_n_el = ((const read_type*)src)[x];\r
-                    write_type dst_n_el;\r
-\r
-                    OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);\r
-\r
-                    ((write_type*)dst)[x] = dst_n_el;\r
-                }\r
-                else\r
-                {\r
-                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)\r
-                    {\r
-                        if (mask(y, real_x))\r
-                            dst[real_x] = op(src[real_x]);\r
-                    }\r
-                }\r
-            }\r
+            if (mask(y, x_shifted))\r
+                dst.x = op(src.x);\r
+            if (mask(y, x_shifted + 1))\r
+                dst.y = op(src.y);\r
+            if (mask(y, x_shifted + 2))\r
+                dst.z = op(src.z);\r
+            if (mask(y, x_shifted + 3))\r
+                dst.w = op(src.w);\r
          }\r
  \r
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
+        {\r
+            if (mask(y, x_shifted))\r
+                dst.x = op(src1.x, src2.x);\r
+            if (mask(y, x_shifted + 1))\r
+                dst.y = op(src1.y, src2.y);\r
+            if (mask(y, x_shifted + 2))\r
+                dst.z = op(src1.z, src2.z);\r
+            if (mask(y, x_shifted + 3))\r
+                dst.w = op(src1.w, src2.w);\r
+        }\r
+    };\r
+    template <> struct OpUnroller<8>\r
+    {\r
          template <typename T, typename D, typename UnOp, typename Mask>\r
-        static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)\r
+        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)\r
          {\r
-                   const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-                   const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
-            if (x < src.cols && y < src.rows && mask(y, x))\r
-            {\r
-                dst.ptr(y)[x] = op(src.ptr(y)[x]);\r
-            }\r
+            if (mask(y, x_shifted))\r
+                dst.a0 = op(src.a0);\r
+            if (mask(y, x_shifted + 1))\r
+                dst.a1 = op(src.a1);\r
+            if (mask(y, x_shifted + 2))\r
+                dst.a2 = op(src.a2);\r
+            if (mask(y, x_shifted + 3))\r
+                dst.a3 = op(src.a3);\r
+            if (mask(y, x_shifted + 4))\r
+                dst.a4 = op(src.a4);\r
+            if (mask(y, x_shifted + 5))\r
+                dst.a5 = op(src.a5);\r
+            if (mask(y, x_shifted + 6))\r
+                dst.a6 = op(src.a6);\r
+            if (mask(y, x_shifted + 7))\r
+                dst.a7 = op(src.a7);\r
          }\r
  \r
          template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-        __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_, \r
-            const Mask mask, const BinOp op)\r
+        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)\r
          {\r
-            typedef TransformFunctorTraits<BinOp> ft;\r
-            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;\r
-            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;\r
-            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;\r
+            if (mask(y, x_shifted))\r
+                dst.a0 = op(src1.a0, src2.a0);\r
+            if (mask(y, x_shifted + 1))\r
+                dst.a1 = op(src1.a1, src2.a1);\r
+            if (mask(y, x_shifted + 2))\r
+                dst.a2 = op(src1.a2, src2.a2);\r
+            if (mask(y, x_shifted + 3))\r
+                dst.a3 = op(src1.a3, src2.a3);\r
+            if (mask(y, x_shifted + 4))\r
+                dst.a4 = op(src1.a4, src2.a4);\r
+            if (mask(y, x_shifted + 5))\r
+                dst.a5 = op(src1.a5, src2.a5);\r
+            if (mask(y, x_shifted + 6))\r
+                dst.a6 = op(src1.a6, src2.a6);\r
+            if (mask(y, x_shifted + 7))\r
+                dst.a7 = op(src1.a7, src2.a7);\r
+        }\r
+    };\r
  \r
-            const int x = threadIdx.x + blockIdx.x * blockDim.x;\r
-            const int y = threadIdx.y + blockIdx.y * blockDim.y;\r
-            const int x_shifted = x * ft::smart_shift;\r
+    template <typename T, typename D, typename UnOp, typename Mask>\r
+    __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)\r
+    {\r
+        typedef TransformFunctorTraits<UnOp> ft;\r
+        typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;\r
+        typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;\r
+\r
+        const int x = threadIdx.x + blockIdx.x * blockDim.x;\r
+        const int y = threadIdx.y + blockIdx.y * blockDim.y;\r
+        const int x_shifted = x * ft::smart_shift;\r
+\r
+        if (y < src_.rows)\r
+        {\r
+            const T* src = src_.ptr(y);\r
+            D* dst = dst_.ptr(y);\r
  \r
-            if (y < src1_.rows)\r
+            if (x_shifted + ft::smart_shift - 1 < src_.cols)\r
              {\r
-                const T1* src1 = src1_.ptr(y);\r
-                const T2* src2 = src2_.ptr(y);\r
-                D* dst = dst_.ptr(y);\r
+                const read_type src_n_el = ((const read_type*)src)[x];\r
+                write_type dst_n_el;\r
  \r
-                if (x_shifted + ft::smart_shift - 1 < src1_.cols)\r
-                {\r
-                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];\r
-                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];\r
-                    write_type dst_n_el;\r
-                    \r
-                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);\r
+                OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);\r
  \r
-                    ((write_type*)dst)[x] = dst_n_el;\r
-                }\r
-                else\r
+                ((write_type*)dst)[x] = dst_n_el;\r
+            }\r
+            else\r
+            {\r
+                for (int real_x = x_shifted; real_x < src_.cols; ++real_x)\r
                  {\r
-                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)\r
-                    {\r
-                        if (mask(y, real_x))\r
-                            dst[real_x] = op(src1[real_x], src2[real_x]);\r
-                    }\r
+                    if (mask(y, real_x))\r
+                        dst[real_x] = op(src[real_x]);\r
                  }\r
              }\r
          }\r
+    }\r
  \r
-        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-        static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst, \r
-            const Mask mask, const BinOp op)\r
-        {\r
-                   const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
-                   const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
-\r
-            if (x < src1.cols && y < src1.rows && mask(y, x))\r
-            {\r
-                const T1 src1_data = src1.ptr(y)[x];\r
-                const T2 src2_data = src2.ptr(y)[x];\r
-                dst.ptr(y)[x] = op(src1_data, src2_data);\r
-            }\r
-        }\r
+    template <typename T, typename D, typename UnOp, typename Mask>\r
+    static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)\r
+    {\r
+           const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+           const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
  \r
-        template <bool UseSmart> struct TransformDispatcher;\r
-        template<> struct TransformDispatcher<false>\r
+        if (x < src.cols && y < src.rows && mask(y, x))\r
          {\r
-            template <typename T, typename D, typename UnOp, typename Mask>\r
-            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
-            {\r
-                typedef TransformFunctorTraits<UnOp> ft;\r
+            dst.ptr(y)[x] = op(src.ptr(y)[x]);\r
+        }\r
+    }\r
  \r
-                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);\r
-                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);     \r
+    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+    __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_, \r
+        const Mask mask, const BinOp op)\r
+    {\r
+        typedef TransformFunctorTraits<BinOp> ft;\r
+        typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;\r
+        typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;\r
+        typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;\r
  \r
-                transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
-                cudaSafeCall( cudaGetLastError() );\r
+        const int x = threadIdx.x + blockIdx.x * blockDim.x;\r
+        const int y = threadIdx.y + blockIdx.y * blockDim.y;\r
+        const int x_shifted = x * ft::smart_shift;\r
  \r
-                if (stream == 0)\r
-                    cudaSafeCall( cudaDeviceSynchronize() ); \r
-            }\r
+        if (y < src1_.rows)\r
+        {\r
+            const T1* src1 = src1_.ptr(y);\r
+            const T2* src2 = src2_.ptr(y);\r
+            D* dst = dst_.ptr(y);\r
  \r
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
+            if (x_shifted + ft::smart_shift - 1 < src1_.cols)\r
              {\r
-                typedef TransformFunctorTraits<BinOp> ft;\r
-\r
-                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);\r
-                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);     \r
-\r
-                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
-                cudaSafeCall( cudaGetLastError() );\r
+                const read_type1 src1_n_el = ((const read_type1*)src1)[x];\r
+                const read_type2 src2_n_el = ((const read_type2*)src2)[x];\r
+                write_type dst_n_el;\r
+                \r
+                OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);\r
  \r
-                if (stream == 0)\r
-                    cudaSafeCall( cudaDeviceSynchronize() );            \r
+                ((write_type*)dst)[x] = dst_n_el;\r
              }\r
-        };\r
-        template<> struct TransformDispatcher<true>\r
-        {\r
-            template <typename T, typename D, typename UnOp, typename Mask>\r
-            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
+            else\r
              {\r
-                typedef TransformFunctorTraits<UnOp> ft;\r
+                for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)\r
+                {\r
+                    if (mask(y, real_x))\r
+                        dst[real_x] = op(src1[real_x], src2[real_x]);\r
+                }\r
+            }\r
+        }\r
+    }\r
  \r
-                StaticAssert<ft::smart_shift != 1>::check();\r
+    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+    static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst, \r
+        const Mask mask, const BinOp op)\r
+    {\r
+           const int x = blockDim.x * blockIdx.x + threadIdx.x;\r
+           const int y = blockDim.y * blockIdx.y + threadIdx.y;\r
  \r
-                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);\r
-                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);      \r
+        if (x < src1.cols && y < src1.rows && mask(y, x))\r
+        {\r
+            const T1 src1_data = src1.ptr(y)[x];\r
+            const T2 src2_data = src2.ptr(y)[x];\r
+            dst.ptr(y)[x] = op(src1_data, src2_data);\r
+        }\r
+    }\r
  \r
-                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
-                cudaSafeCall( cudaGetLastError() );\r
+    template <bool UseSmart> struct TransformDispatcher;\r
+    template<> struct TransformDispatcher<false>\r
+    {\r
+        template <typename T, typename D, typename UnOp, typename Mask>\r
+        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
+        {\r
+            typedef TransformFunctorTraits<UnOp> ft;\r
  \r
-                if (stream == 0)\r
-                    cudaSafeCall( cudaDeviceSynchronize() );\r
-            }\r
+            const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);\r
+            const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);     \r
  \r
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
-            {\r
-                typedef TransformFunctorTraits<BinOp> ft;\r
+            transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
+            cudaSafeCall( cudaGetLastError() );\r
  \r
-                StaticAssert<ft::smart_shift != 1>::check();\r
+            if (stream == 0)\r
+                cudaSafeCall( cudaDeviceSynchronize() ); \r
+        }\r
  \r
-                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);\r
-                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);    \r
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
+        {\r
+            typedef TransformFunctorTraits<BinOp> ft;\r
  \r
-                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
-                cudaSafeCall( cudaGetLastError() );\r
+            const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);\r
+            const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);     \r
  \r
-                if (stream == 0)\r
-                    cudaSafeCall( cudaDeviceSynchronize() );            \r
-            }\r
-        };        \r
+            transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
+            cudaSafeCall( cudaGetLastError() );\r
  \r
+            if (stream == 0)\r
+                cudaSafeCall( cudaDeviceSynchronize() );            \r
+        }\r
+    };\r
+    template<> struct TransformDispatcher<true>\r
+    {\r
          template <typename T, typename D, typename UnOp, typename Mask>\r
-        static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
+        static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
          {\r
              typedef TransformFunctorTraits<UnOp> ft;\r
-            TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);\r
+\r
+            StaticAssert<ft::smart_shift != 1>::check();\r
+\r
+            const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);\r
+            const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);      \r
+\r
+            transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);\r
+            cudaSafeCall( cudaGetLastError() );\r
+\r
+            if (stream == 0)\r
+                cudaSafeCall( cudaDeviceSynchronize() );\r
          }\r
  \r
          template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
-        static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
+        static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
          {\r
              typedef TransformFunctorTraits<BinOp> ft;\r
-            TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);\r
+\r
+            StaticAssert<ft::smart_shift != 1>::check();\r
+\r
+            const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);\r
+            const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);    \r
+\r
+            transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);\r
+            cudaSafeCall( cudaGetLastError() );\r
+\r
+            if (stream == 0)\r
+                cudaSafeCall( cudaDeviceSynchronize() );            \r
          }\r
+    };        \r
+\r
+    template <typename T, typename D, typename UnOp, typename Mask>\r
+    static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)\r
+    {\r
+        typedef TransformFunctorTraits<UnOp> ft;\r
+        TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);\r
      }\r
-}}}\r
+\r
+    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>\r
+    static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)\r
+    {\r
+        typedef TransformFunctorTraits<BinOp> ft;\r
+        TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);\r
+    }\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp

index f6acce1..f2eb828 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp
@@ -43,144 +43,146 @@
  #ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__\r
  #define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__\r
  \r
+#include "internal_shared.hpp"\r
  #include "../vec_traits.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace detail\r
  {\r
-    namespace detail\r
-    {\r
-        template <bool, typename T1, typename T2> struct Select { typedef T1 type; };\r
-        template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };\r
+    template <bool, typename T1, typename T2> struct Select { typedef T1 type; };\r
+    template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };\r
  \r
-        template <typename T> struct IsSignedIntergral { enum {value = 0}; };\r
-        template <> struct IsSignedIntergral<schar> { enum {value = 1}; };\r
-        template <> struct IsSignedIntergral<char1> { enum {value = 1}; };\r
-        template <> struct IsSignedIntergral<short> { enum {value = 1}; };\r
-        template <> struct IsSignedIntergral<short1> { enum {value = 1}; };\r
-        template <> struct IsSignedIntergral<int> { enum {value = 1}; };\r
-        template <> struct IsSignedIntergral<int1> { enum {value = 1}; };\r
+    template <typename T> struct IsSignedIntergral { enum {value = 0}; };\r
+    template <> struct IsSignedIntergral<schar> { enum {value = 1}; };\r
+    template <> struct IsSignedIntergral<char1> { enum {value = 1}; };\r
+    template <> struct IsSignedIntergral<short> { enum {value = 1}; };\r
+    template <> struct IsSignedIntergral<short1> { enum {value = 1}; };\r
+    template <> struct IsSignedIntergral<int> { enum {value = 1}; };\r
+    template <> struct IsSignedIntergral<int1> { enum {value = 1}; };\r
  \r
-        template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };\r
-        template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };\r
-        template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };\r
-        template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };\r
-        template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };\r
-        template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };\r
-        template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };\r
+    template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };\r
+    template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };\r
+    template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };\r
+    template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };\r
+    template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };\r
+    template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };\r
+    template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };\r
  \r
-        template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };\r
-        template <> struct IsIntegral<char> { enum {value = 1}; };\r
-        template <> struct IsIntegral<bool> { enum {value = 1}; };\r
+    template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };\r
+    template <> struct IsIntegral<char> { enum {value = 1}; };\r
+    template <> struct IsIntegral<bool> { enum {value = 1}; };\r
  \r
-        template <typename T> struct IsFloat { enum {value = 0}; };\r
-        template <> struct IsFloat<float> { enum {value = 1}; };\r
-        template <> struct IsFloat<double> { enum {value = 1}; };\r
+    template <typename T> struct IsFloat { enum {value = 0}; };\r
+    template <> struct IsFloat<float> { enum {value = 1}; };\r
+    template <> struct IsFloat<double> { enum {value = 1}; };\r
  \r
-        template <typename T> struct IsVec { enum {value = 0}; };\r
-        template <> struct IsVec<uchar1> { enum {value = 1}; };\r
-        template <> struct IsVec<uchar2> { enum {value = 1}; };\r
-        template <> struct IsVec<uchar3> { enum {value = 1}; };\r
-        template <> struct IsVec<uchar4> { enum {value = 1}; };\r
-        template <> struct IsVec<uchar8> { enum {value = 1}; };\r
-        template <> struct IsVec<char1> { enum {value = 1}; };\r
-        template <> struct IsVec<char2> { enum {value = 1}; };\r
-        template <> struct IsVec<char3> { enum {value = 1}; };\r
-        template <> struct IsVec<char4> { enum {value = 1}; };\r
-        template <> struct IsVec<char8> { enum {value = 1}; };\r
-        template <> struct IsVec<ushort1> { enum {value = 1}; };\r
-        template <> struct IsVec<ushort2> { enum {value = 1}; };\r
-        template <> struct IsVec<ushort3> { enum {value = 1}; };\r
-        template <> struct IsVec<ushort4> { enum {value = 1}; };\r
-        template <> struct IsVec<ushort8> { enum {value = 1}; };\r
-        template <> struct IsVec<short1> { enum {value = 1}; };\r
-        template <> struct IsVec<short2> { enum {value = 1}; };\r
-        template <> struct IsVec<short3> { enum {value = 1}; };\r
-        template <> struct IsVec<short4> { enum {value = 1}; };\r
-        template <> struct IsVec<short8> { enum {value = 1}; };\r
-        template <> struct IsVec<uint1> { enum {value = 1}; };\r
-        template <> struct IsVec<uint2> { enum {value = 1}; };\r
-        template <> struct IsVec<uint3> { enum {value = 1}; };\r
-        template <> struct IsVec<uint4> { enum {value = 1}; };\r
-        template <> struct IsVec<uint8> { enum {value = 1}; };\r
-        template <> struct IsVec<int1> { enum {value = 1}; };\r
-        template <> struct IsVec<int2> { enum {value = 1}; };\r
-        template <> struct IsVec<int3> { enum {value = 1}; };\r
-        template <> struct IsVec<int4> { enum {value = 1}; };\r
-        template <> struct IsVec<int8> { enum {value = 1}; };\r
-        template <> struct IsVec<float1> { enum {value = 1}; };\r
-        template <> struct IsVec<float2> { enum {value = 1}; };\r
-        template <> struct IsVec<float3> { enum {value = 1}; };\r
-        template <> struct IsVec<float4> { enum {value = 1}; };\r
-        template <> struct IsVec<float8> { enum {value = 1}; };\r
-        template <> struct IsVec<double1> { enum {value = 1}; };\r
-        template <> struct IsVec<double2> { enum {value = 1}; };\r
-        template <> struct IsVec<double3> { enum {value = 1}; };\r
-        template <> struct IsVec<double4> { enum {value = 1}; };\r
-        template <> struct IsVec<double8> { enum {value = 1}; };\r
+    template <typename T> struct IsVec { enum {value = 0}; };\r
+    template <> struct IsVec<uchar1> { enum {value = 1}; };\r
+    template <> struct IsVec<uchar2> { enum {value = 1}; };\r
+    template <> struct IsVec<uchar3> { enum {value = 1}; };\r
+    template <> struct IsVec<uchar4> { enum {value = 1}; };\r
+    template <> struct IsVec<uchar8> { enum {value = 1}; };\r
+    template <> struct IsVec<char1> { enum {value = 1}; };\r
+    template <> struct IsVec<char2> { enum {value = 1}; };\r
+    template <> struct IsVec<char3> { enum {value = 1}; };\r
+    template <> struct IsVec<char4> { enum {value = 1}; };\r
+    template <> struct IsVec<char8> { enum {value = 1}; };\r
+    template <> struct IsVec<ushort1> { enum {value = 1}; };\r
+    template <> struct IsVec<ushort2> { enum {value = 1}; };\r
+    template <> struct IsVec<ushort3> { enum {value = 1}; };\r
+    template <> struct IsVec<ushort4> { enum {value = 1}; };\r
+    template <> struct IsVec<ushort8> { enum {value = 1}; };\r
+    template <> struct IsVec<short1> { enum {value = 1}; };\r
+    template <> struct IsVec<short2> { enum {value = 1}; };\r
+    template <> struct IsVec<short3> { enum {value = 1}; };\r
+    template <> struct IsVec<short4> { enum {value = 1}; };\r
+    template <> struct IsVec<short8> { enum {value = 1}; };\r
+    template <> struct IsVec<uint1> { enum {value = 1}; };\r
+    template <> struct IsVec<uint2> { enum {value = 1}; };\r
+    template <> struct IsVec<uint3> { enum {value = 1}; };\r
+    template <> struct IsVec<uint4> { enum {value = 1}; };\r
+    template <> struct IsVec<uint8> { enum {value = 1}; };\r
+    template <> struct IsVec<int1> { enum {value = 1}; };\r
+    template <> struct IsVec<int2> { enum {value = 1}; };\r
+    template <> struct IsVec<int3> { enum {value = 1}; };\r
+    template <> struct IsVec<int4> { enum {value = 1}; };\r
+    template <> struct IsVec<int8> { enum {value = 1}; };\r
+    template <> struct IsVec<float1> { enum {value = 1}; };\r
+    template <> struct IsVec<float2> { enum {value = 1}; };\r
+    template <> struct IsVec<float3> { enum {value = 1}; };\r
+    template <> struct IsVec<float4> { enum {value = 1}; };\r
+    template <> struct IsVec<float8> { enum {value = 1}; };\r
+    template <> struct IsVec<double1> { enum {value = 1}; };\r
+    template <> struct IsVec<double2> { enum {value = 1}; };\r
+    template <> struct IsVec<double3> { enum {value = 1}; };\r
+    template <> struct IsVec<double4> { enum {value = 1}; };\r
+    template <> struct IsVec<double8> { enum {value = 1}; };\r
  \r
-        template <class U> struct AddParameterType { typedef const U& type; };\r
-        template <class U> struct AddParameterType<U&> { typedef U& type; };\r
-        template <> struct AddParameterType<void> { typedef void type; };\r
+    template <class U> struct AddParameterType { typedef const U& type; };\r
+    template <class U> struct AddParameterType<U&> { typedef U& type; };\r
+    template <> struct AddParameterType<void> { typedef void type; };\r
+\r
+    template <class U> struct ReferenceTraits \r
+    {\r
+        enum { value = false };\r
+        typedef U type;\r
+    };        \r
+    template <class U> struct ReferenceTraits<U&>\r
+    {\r
+        enum { value = true };\r
+        typedef U type;\r
+    };\r
+           \r
+    template <class U> struct PointerTraits\r
+    {\r
+        enum { value = false };\r
+        typedef void type;\r
+    };        \r
+    template <class U> struct PointerTraits<U*>\r
+    {\r
+        enum { value = true };\r
+        typedef U type;\r
+    };        \r
+    template <class U> struct PointerTraits<U*&>\r
+    {\r
+        enum { value = true };\r
+        typedef U type;\r
+    };\r
+     \r
+    template <class U> struct UnConst\r
+    {\r
+        typedef U type;\r
+        enum { value = 0 };\r
+    };        \r
+    template <class U> struct UnConst<const U>\r
+    {\r
+        typedef U type;\r
+        enum { value = 1 };\r
+    };\r
+    template <class U> struct UnConst<const U&>\r
+    {\r
+        typedef U& type;\r
+        enum { value = 1 };\r
+    };\r
  \r
-        template <class U> struct ReferenceTraits \r
-        {\r
-            enum { value = false };\r
-            typedef U type;\r
-        };        \r
-        template <class U> struct ReferenceTraits<U&>\r
-        {\r
-            enum { value = true };\r
-            typedef U type;\r
-        };\r
-               \r
-        template <class U> struct PointerTraits\r
-        {\r
-            enum { value = false };\r
-            typedef void type;\r
-        };        \r
-        template <class U> struct PointerTraits<U*>\r
-        {\r
-            enum { value = true };\r
-            typedef U type;\r
-        };        \r
-        template <class U> struct PointerTraits<U*&>\r
-        {\r
-            enum { value = true };\r
-            typedef U type;\r
-        };\r
-         \r
-        template <class U> struct UnConst\r
-        {\r
-            typedef U type;\r
-            enum { value = 0 };\r
-        };        \r
-        template <class U> struct UnConst<const U>\r
-        {\r
-            typedef U type;\r
-            enum { value = 1 };\r
-        };\r
-        template <class U> struct UnConst<const U&>\r
-        {\r
-            typedef U& type;\r
-            enum { value = 1 };\r
-        };\r
+    template <class U> struct UnVolatile\r
+    {\r
+        typedef U type;\r
+        enum { value = 0 };\r
+    };       \r
+    template <class U> struct UnVolatile<volatile U>\r
+    {\r
+        typedef U type;\r
+        enum { value = 1 };\r
+    };\r
+    template <class U> struct UnVolatile<volatile U&>\r
+    {\r
+        typedef U& type;\r
+        enum { value = 1 };\r
+    };\r
+}\r
  \r
-        template <class U> struct UnVolatile\r
-        {\r
-            typedef U type;\r
-            enum { value = 0 };\r
-        };       \r
-        template <class U> struct UnVolatile<volatile U>\r
-        {\r
-            typedef U type;\r
-            enum { value = 1 };\r
-        };\r
-        template <class U> struct UnVolatile<volatile U&>\r
-        {\r
-            typedef U& type;\r
-            enum { value = 1 };\r
-        };\r
-    }\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp

index e766662..2818c28 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
@@ -43,225 +43,462 @@
  #ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__\r
  #define __OPENCV_GPU_UTILITY_DETAIL_HPP__\r
  \r
-namespace cv { namespace gpu { namespace device\r
+#include "internal_shared.hpp"\r
+\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace detail\r
  {\r
-    namespace detail\r
-    {\r
-        ///////////////////////////////////////////////////////////////////////////////\r
-        // Reduction\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // Reduction\r
  \r
-        template <int n> struct WarpReductor\r
+    template <int n> struct WarpReductor\r
+    {\r
+        template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
          {\r
-            template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
-            {\r
-                if (tid < n)\r
-                    data[tid] = partial_reduction;                \r
-                if (n > 32) __syncthreads();\r
+            if (tid < n)\r
+                data[tid] = partial_reduction;                \r
+            if (n > 32) __syncthreads();\r
  \r
-                if (n > 32)\r
+            if (n > 32)\r
+            {\r
+                if (tid < n - 32) \r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
+                if (tid < 16)\r
                  {\r
-                    if (tid < n - 32) \r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
-                    if (tid < 16)\r
-                    {\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
-                    }\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
                  }\r
-                else if (n > 16)\r
+            }\r
+            else if (n > 16)\r
+            {\r
+                if (tid < n - 16) \r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+                if (tid < 8)\r
                  {\r
-                    if (tid < n - 16) \r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
-                    if (tid < 8)\r
-                    {\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
-                    }\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
                  }\r
-                else if (n > 8)\r
+            }\r
+            else if (n > 8)\r
+            {\r
+                if (tid < n - 8) \r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);\r
+                if (tid < 4)\r
                  {\r
-                    if (tid < n - 8) \r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);\r
-                    if (tid < 4)\r
-                    {\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
-                    }\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
                  }\r
-                else if (n > 4)\r
+            }\r
+            else if (n > 4)\r
+            {\r
+                if (tid < n - 4) \r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
+                if (tid < 2)\r
                  {\r
-                    if (tid < n - 4) \r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
-                    if (tid < 2)\r
-                    {\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
-                    }\r
-                }   \r
-                else if (n > 2)\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
+                }\r
+            }   \r
+            else if (n > 2)\r
+            {\r
+                if (tid < n - 2) \r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
+                if (tid < 2)\r
                  {\r
-                    if (tid < n - 2) \r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
-                    if (tid < 2)\r
-                    {\r
-                        data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
-                    }\r
-                }      \r
+                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
+                }\r
+            }      \r
+        }\r
+    };\r
+    template <> struct WarpReductor<64>\r
+    {\r
+        template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+        {\r
+            data[tid] = partial_reduction;\r
+            __syncthreads();\r
+            \r
+            if (tid < 32) \r
+            {\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
              }\r
-        };\r
-        template <> struct WarpReductor<64>\r
+        }\r
+    };\r
+    template <> struct WarpReductor<32>\r
+    {\r
+        template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
          {\r
-            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+            data[tid] = partial_reduction;\r
+            \r
+            if (tid < 16) \r
              {\r
-                data[tid] = partial_reduction;\r
-                __syncthreads();\r
-                \r
-                if (tid < 32) \r
-                {\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
-                }\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
              }\r
-        };\r
-        template <> struct WarpReductor<32>\r
+        }\r
+    };\r
+    template <> struct WarpReductor<16>\r
+    {\r
+        template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
          {\r
-            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+            data[tid] = partial_reduction;\r
+            \r
+            if (tid < 8) \r
              {\r
-                data[tid] = partial_reduction;\r
-                \r
-                if (tid < 16) \r
-                {\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
-                }\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
              }\r
-        };\r
-        template <> struct WarpReductor<16>\r
+        }\r
+    };\r
+    template <> struct WarpReductor<8>\r
+    {\r
+        template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
          {\r
-            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+            data[tid] = partial_reduction;\r
+            \r
+            if (tid < 4) \r
              {\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
+            }\r
+        }\r
+    };\r
+\r
+    template <bool warp> struct ReductionDispatcher;\r
+    template <> struct ReductionDispatcher<true>\r
+    {\r
+        template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+        {\r
+            WarpReductor<n>::reduce(data, partial_reduction, tid, op);\r
+        }\r
+    };\r
+    template <> struct ReductionDispatcher<false>\r
+    {\r
+        template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+        {\r
+            if (tid < n)\r
                  data[tid] = partial_reduction;\r
-                \r
-                if (tid < 8) \r
+            __syncthreads();\r
+\r
+\r
+            if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }\r
+            if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }\r
+            if (n >= 128) { if (tid <  64) { data[tid] = partial_reduction = op(partial_reduction, data[tid +  64]); } __syncthreads(); }\r
+\r
+            if (tid < 32)\r
+            {\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
+            }\r
+        }\r
+    };\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // PredValWarpReductor\r
+    \r
+    template <int n> struct PredValWarpReductor;\r
+    template <> struct PredValWarpReductor<64>\r
+    {\r
+        template <typename T, typename V, typename Pred> \r
+        static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+        {\r
+            if (tid < 32)\r
+            {\r
+                myData = sdata[tid];\r
+                myVal = sval[tid];\r
+\r
+                T reg = sdata[tid + 32];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 32];\r
+                }\r
+\r
+                reg = sdata[tid + 16];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 16];\r
+                }\r
+\r
+                reg = sdata[tid + 8];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 8];\r
+                }\r
+\r
+                reg = sdata[tid + 4];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 4];\r
+                }\r
+            \r
+                reg = sdata[tid + 2];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 2];\r
+                }\r
+            \r
+                reg = sdata[tid + 1];\r
+                if (pred(reg, myData))\r
                  {\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 1];\r
                  }\r
              }\r
-        };\r
-        template <> struct WarpReductor<8>\r
+        }\r
+    };\r
+    template <> struct PredValWarpReductor<32>\r
+    {\r
+        template <typename T, typename V, typename Pred> \r
+        static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
          {\r
-            template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+            if (tid < 16)\r
              {\r
-                data[tid] = partial_reduction;\r
-                \r
-                if (tid < 4) \r
+                myData = sdata[tid];\r
+                myVal = sval[tid];\r
+\r
+                T reg = sdata[tid + 16];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 16];\r
+                }\r
+\r
+                reg = sdata[tid + 8];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 8];\r
+                }\r
+\r
+                reg = sdata[tid + 4];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 4];\r
+                }\r
+            \r
+                reg = sdata[tid + 2];\r
+                if (pred(reg, myData))\r
                  {\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); \r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 2];\r
+                }\r
+            \r
+                reg = sdata[tid + 1];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 1];\r
                  }\r
              }\r
-        };\r
+        }\r
+    };\r
  \r
-        template <bool warp> struct ReductionDispatcher;\r
-        template <> struct ReductionDispatcher<true>\r
+    template <> struct PredValWarpReductor<16>\r
+    {\r
+        template <typename T, typename V, typename Pred> \r
+        static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
          {\r
-            template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+            if (tid < 8)\r
              {\r
-                WarpReductor<n>::reduce(data, partial_reduction, tid, op);\r
+                myData = sdata[tid];\r
+                myVal = sval[tid];\r
+\r
+                T reg = reg = sdata[tid + 8];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 8];\r
+                }\r
+\r
+                reg = sdata[tid + 4];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 4];\r
+                }\r
+            \r
+                reg = sdata[tid + 2];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 2];\r
+                }\r
+            \r
+                reg = sdata[tid + 1];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 1];\r
+                }\r
              }\r
-        };\r
-        template <> struct ReductionDispatcher<false>\r
+        }\r
+    };\r
+    template <> struct PredValWarpReductor<8>\r
+    {\r
+        template <typename T, typename V, typename Pred> \r
+        static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
          {\r
-            template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+            if (tid < 4)\r
              {\r
-                if (tid < n)\r
-                    data[tid] = partial_reduction;\r
-                __syncthreads();\r
+                myData = sdata[tid];\r
+                myVal = sval[tid];\r
  \r
+                T reg = reg = sdata[tid + 4];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 4];\r
+                }\r
+            \r
+                reg = sdata[tid + 2];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 2];\r
+                }\r
+            \r
+                reg = sdata[tid + 1];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 1];\r
+                }\r
+            }\r
+        }\r
+    };\r
+\r
+    template <bool warp> struct PredValReductionDispatcher;\r
+    template <> struct PredValReductionDispatcher<true>\r
+    {\r
+        template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+        {\r
+            PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);\r
+        }\r
+    };\r
+    template <> struct PredValReductionDispatcher<false>\r
+    {\r
+        template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+        {\r
+            myData = sdata[tid];\r
+            myVal = sval[tid];\r
  \r
-                if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }\r
-                if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }\r
-                if (n >= 128) { if (tid <  64) { data[tid] = partial_reduction = op(partial_reduction, data[tid +  64]); } __syncthreads(); }\r
+            if (n >= 512 && tid < 256) \r
+            {\r
+                T reg = sdata[tid + 256];\r
  \r
-                if (tid < 32)\r
+                if (pred(reg, myData))\r
                  {\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);\r
-                    data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 256];\r
                  }\r
+                __syncthreads(); \r
              }\r
-        };\r
+            if (n >= 256 && tid < 128) \r
+            {\r
+                T reg = sdata[tid + 128];\r
  \r
-        ///////////////////////////////////////////////////////////////////////////////\r
-        // PredValWarpReductor\r
-        \r
-        template <int n> struct PredValWarpReductor;\r
-        template <> struct PredValWarpReductor<64>\r
-        {\r
-            template <typename T, typename V, typename Pred> \r
-            static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 128];\r
+                }\r
+                __syncthreads(); \r
+            }\r
+            if (n >= 128 && tid < 64) \r
              {\r
-                if (tid < 32)\r
+                T reg = sdata[tid + 64];\r
+\r
+                if (pred(reg, myData))\r
                  {\r
-                    myData = sdata[tid];\r
-                    myVal = sval[tid];\r
+                    sdata[tid] = myData = reg;\r
+                    sval[tid] = myVal = sval[tid + 64];\r
+                }\r
+                __syncthreads(); \r
+            }        \r
  \r
+            if (tid < 32)\r
+            {\r
+                if (n >= 64) \r
+                { \r
                      T reg = sdata[tid + 32];\r
+\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval[tid] = myVal = sval[tid + 32];\r
                      }\r
+                }\r
+                if (n >= 32) \r
+                { \r
+                    T reg = sdata[tid + 16];\r
  \r
-                    reg = sdata[tid + 16];\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval[tid] = myVal = sval[tid + 16];\r
                      }\r
+                }\r
+                if (n >= 16) \r
+                { \r
+                    T reg = sdata[tid + 8];\r
  \r
-                    reg = sdata[tid + 8];\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval[tid] = myVal = sval[tid + 8];\r
                      }\r
+                }\r
+                if (n >= 8) \r
+                { \r
+                    T reg = sdata[tid + 4];\r
  \r
-                    reg = sdata[tid + 4];\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval[tid] = myVal = sval[tid + 4];\r
                      }\r
-                \r
-                    reg = sdata[tid + 2];\r
+                }\r
+                if (n >= 4) \r
+                { \r
+                    T reg = sdata[tid + 2];\r
+\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval[tid] = myVal = sval[tid + 2];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 1];\r
+                    } \r
+                }\r
+                if (n >= 2) \r
+                { \r
+                    T reg = sdata[tid + 1];\r
+\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
@@ -269,436 +506,327 @@ namespace cv { namespace gpu { namespace device
                      }\r
                  }\r
              }\r
-        };\r
-        template <> struct PredValWarpReductor<32>\r
+        }\r
+    };\r
+\r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // PredVal2WarpReductor\r
+\r
+    template <int n> struct PredVal2WarpReductor;\r
+    template <> struct PredVal2WarpReductor<64>\r
+    {\r
+        template <typename T, typename V1, typename V2, typename Pred> \r
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
          {\r
-            template <typename T, typename V, typename Pred> \r
-            static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+            if (tid < 32)\r
              {\r
-                if (tid < 16)\r
+                myData = sdata[tid];\r
+                myVal1 = sval1[tid];\r
+                myVal2 = sval2[tid];\r
+\r
+                T reg = sdata[tid + 32];\r
+                if (pred(reg, myData))\r
                  {\r
-                    myData = sdata[tid];\r
-                    myVal = sval[tid];\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 32];\r
+                    sval2[tid] = myVal2 = sval2[tid + 32];\r
+                }\r
  \r
-                    T reg = sdata[tid + 16];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 16];\r
-                    }\r
+                reg = sdata[tid + 16];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 16];\r
+                    sval2[tid] = myVal2 = sval2[tid + 16];\r
+                }\r
  \r
-                    reg = sdata[tid + 8];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 8];\r
-                    }\r
+                reg = sdata[tid + 8];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 8];\r
+                    sval2[tid] = myVal2 = sval2[tid + 8];\r
+                }\r
  \r
-                    reg = sdata[tid + 4];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 4];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 2];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 2];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 1];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 1];\r
-                    }\r
+                reg = sdata[tid + 4];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 4];\r
+                    sval2[tid] = myVal2 = sval2[tid + 4];\r
+                }\r
+            \r
+                reg = sdata[tid + 2];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 2];\r
+                    sval2[tid] = myVal2 = sval2[tid + 2];\r
+                }\r
+            \r
+                reg = sdata[tid + 1];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 1];\r
+                    sval2[tid] = myVal2 = sval2[tid + 1];\r
                  }\r
              }\r
-        };\r
-\r
-        template <> struct PredValWarpReductor<16>\r
+        }\r
+    };\r
+    template <> struct PredVal2WarpReductor<32>\r
+    {\r
+        template <typename T, typename V1, typename V2, typename Pred> \r
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
          {\r
-            template <typename T, typename V, typename Pred> \r
-            static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+            if (tid < 16)\r
              {\r
-                if (tid < 8)\r
+                myData = sdata[tid];\r
+                myVal1 = sval1[tid];\r
+                myVal2 = sval2[tid];\r
+\r
+                T reg = sdata[tid + 16];\r
+                if (pred(reg, myData))\r
                  {\r
-                    myData = sdata[tid];\r
-                    myVal = sval[tid];\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 16];\r
+                    sval2[tid] = myVal2 = sval2[tid + 16];\r
+                }\r
  \r
-                    T reg = reg = sdata[tid + 8];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 8];\r
-                    }\r
+                reg = sdata[tid + 8];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 8];\r
+                    sval2[tid] = myVal2 = sval2[tid + 8];\r
+                }\r
  \r
-                    reg = sdata[tid + 4];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 4];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 2];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 2];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 1];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 1];\r
-                    }\r
+                reg = sdata[tid + 4];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 4];\r
+                    sval2[tid] = myVal2 = sval2[tid + 4];\r
                  }\r
-            }\r
-        };\r
-        template <> struct PredValWarpReductor<8>\r
-        {\r
-            template <typename T, typename V, typename Pred> \r
-            static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
-            {\r
-                if (tid < 4)\r
+            \r
+                reg = sdata[tid + 2];\r
+                if (pred(reg, myData))\r
                  {\r
-                    myData = sdata[tid];\r
-                    myVal = sval[tid];\r
-\r
-                    T reg = reg = sdata[tid + 4];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 4];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 2];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 2];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 1];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 1];\r
-                    }\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 2];\r
+                    sval2[tid] = myVal2 = sval2[tid + 2];\r
+                }\r
+            \r
+                reg = sdata[tid + 1];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 1];\r
+                    sval2[tid] = myVal2 = sval2[tid + 1];\r
                  }\r
              }\r
-        };\r
+        }\r
+    };\r
  \r
-        template <bool warp> struct PredValReductionDispatcher;\r
-        template <> struct PredValReductionDispatcher<true>\r
+    template <> struct PredVal2WarpReductor<16>\r
+    {\r
+        template <typename T, typename V1, typename V2, typename Pred> \r
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
          {\r
-            template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+            if (tid < 8)\r
              {\r
-                PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);\r
+                myData = sdata[tid];\r
+                myVal1 = sval1[tid];\r
+                myVal2 = sval2[tid];\r
+\r
+                T reg = reg = sdata[tid + 8];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 8];\r
+                    sval2[tid] = myVal2 = sval2[tid + 8];\r
+                }\r
+\r
+                reg = sdata[tid + 4];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 4];\r
+                    sval2[tid] = myVal2 = sval2[tid + 4];\r
+                }\r
+            \r
+                reg = sdata[tid + 2];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 2];\r
+                    sval2[tid] = myVal2 = sval2[tid + 2];\r
+                }\r
+            \r
+                reg = sdata[tid + 1];\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 1];\r
+                    sval2[tid] = myVal2 = sval2[tid + 1];\r
+                }\r
              }\r
-        };\r
-        template <> struct PredValReductionDispatcher<false>\r
+        }\r
+    };\r
+    template <> struct PredVal2WarpReductor<8>\r
+    {\r
+        template <typename T, typename V1, typename V2, typename Pred> \r
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
          {\r
-            template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)\r
+            if (tid < 4)\r
              {\r
                  myData = sdata[tid];\r
-                myVal = sval[tid];\r
+                myVal1 = sval1[tid];\r
+                myVal2 = sval2[tid];\r
  \r
-                if (n >= 512 && tid < 256) \r
+                T reg = reg = sdata[tid + 4];\r
+                if (pred(reg, myData))\r
                  {\r
-                    T reg = sdata[tid + 256];\r
-\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 256];\r
-                    }\r
-                    __syncthreads(); \r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 4];\r
+                    sval2[tid] = myVal2 = sval2[tid + 4];\r
                  }\r
-                if (n >= 256 && tid < 128) \r
+            \r
+                reg = sdata[tid + 2];\r
+                if (pred(reg, myData))\r
                  {\r
-                    T reg = sdata[tid + 128];\r
-\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 128];\r
-                    }\r
-                    __syncthreads(); \r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 2];\r
+                    sval2[tid] = myVal2 = sval2[tid + 2];\r
                  }\r
-                if (n >= 128 && tid < 64) \r
+            \r
+                reg = sdata[tid + 1];\r
+                if (pred(reg, myData))\r
                  {\r
-                    T reg = sdata[tid + 64];\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 1];\r
+                    sval2[tid] = myVal2 = sval2[tid + 1];\r
+                }\r
+            }\r
+        }\r
+    };\r
  \r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval[tid] = myVal = sval[tid + 64];\r
-                    }\r
-                    __syncthreads(); \r
-                }        \r
+    template <bool warp> struct PredVal2ReductionDispatcher;\r
+    template <> struct PredVal2ReductionDispatcher<true>\r
+    {\r
+        template <int n, typename T, typename V1, typename V2, typename Pred> \r
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+        {\r
+            PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
+        }\r
+    };\r
+    template <> struct PredVal2ReductionDispatcher<false>\r
+    {\r
+        template <int n, typename T, typename V1, typename V2, typename Pred> \r
+        static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+        {\r
+            myData = sdata[tid];\r
+            myVal1 = sval1[tid];\r
+            myVal2 = sval2[tid];\r
  \r
-                if (tid < 32)\r
-                {\r
-                    if (n >= 64) \r
-                    { \r
-                        T reg = sdata[tid + 32];\r
+            if (n >= 512 && tid < 256) \r
+            {\r
+                T reg = sdata[tid + 256];\r
  \r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval[tid] = myVal = sval[tid + 32];\r
-                        }\r
-                    }\r
-                    if (n >= 32) \r
-                    { \r
-                        T reg = sdata[tid + 16];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval[tid] = myVal = sval[tid + 16];\r
-                        }\r
-                    }\r
-                    if (n >= 16) \r
-                    { \r
-                        T reg = sdata[tid + 8];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval[tid] = myVal = sval[tid + 8];\r
-                        }\r
-                    }\r
-                    if (n >= 8) \r
-                    { \r
-                        T reg = sdata[tid + 4];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval[tid] = myVal = sval[tid + 4];\r
-                        }\r
-                    }\r
-                    if (n >= 4) \r
-                    { \r
-                        T reg = sdata[tid + 2];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval[tid] = myVal = sval[tid + 2];\r
-                        } \r
-                    }\r
-                    if (n >= 2) \r
-                    { \r
-                        T reg = sdata[tid + 1];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval[tid] = myVal = sval[tid + 1];\r
-                        }\r
-                    }\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 256];\r
+                    sval2[tid] = myVal2 = sval2[tid + 256];\r
                  }\r
+                __syncthreads(); \r
              }\r
-        };\r
-\r
-        ///////////////////////////////////////////////////////////////////////////////\r
-        // PredVal2WarpReductor\r
+            if (n >= 256 && tid < 128) \r
+            {\r
+                T reg = sdata[tid + 128];\r
  \r
-        template <int n> struct PredVal2WarpReductor;\r
-        template <> struct PredVal2WarpReductor<64>\r
-        {\r
-            template <typename T, typename V1, typename V2, typename Pred> \r
-            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
+                if (pred(reg, myData))\r
+                {\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 128];\r
+                    sval2[tid] = myVal2 = sval2[tid + 128];\r
+                }\r
+                __syncthreads(); \r
+            }\r
+            if (n >= 128 && tid < 64) \r
              {\r
-                if (tid < 32)\r
+                T reg = sdata[tid + 64];\r
+\r
+                if (pred(reg, myData))\r
                  {\r
-                    myData = sdata[tid];\r
-                    myVal1 = sval1[tid];\r
-                    myVal2 = sval2[tid];\r
+                    sdata[tid] = myData = reg;\r
+                    sval1[tid] = myVal1 = sval1[tid + 64];\r
+                    sval2[tid] = myVal2 = sval2[tid + 64];\r
+                }\r
+                __syncthreads(); \r
+            }        \r
  \r
+            if (tid < 32)\r
+            {\r
+                if (n >= 64) \r
+                { \r
                      T reg = sdata[tid + 32];\r
+\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval1[tid] = myVal1 = sval1[tid + 32];\r
                          sval2[tid] = myVal2 = sval2[tid + 32];\r
                      }\r
-\r
-                    reg = sdata[tid + 16];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 16];\r
-                        sval2[tid] = myVal2 = sval2[tid + 16];\r
-                    }\r
-\r
-                    reg = sdata[tid + 8];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 8];\r
-                        sval2[tid] = myVal2 = sval2[tid + 8];\r
-                    }\r
-\r
-                    reg = sdata[tid + 4];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 4];\r
-                        sval2[tid] = myVal2 = sval2[tid + 4];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 2];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 2];\r
-                        sval2[tid] = myVal2 = sval2[tid + 2];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 1];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 1];\r
-                        sval2[tid] = myVal2 = sval2[tid + 1];\r
-                    }\r
                  }\r
-            }\r
-        };\r
-        template <> struct PredVal2WarpReductor<32>\r
-        {\r
-            template <typename T, typename V1, typename V2, typename Pred> \r
-            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
-            {\r
-                if (tid < 16)\r
-                {\r
-                    myData = sdata[tid];\r
-                    myVal1 = sval1[tid];\r
-                    myVal2 = sval2[tid];\r
-\r
+                if (n >= 32) \r
+                { \r
                      T reg = sdata[tid + 16];\r
+\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval1[tid] = myVal1 = sval1[tid + 16];\r
                          sval2[tid] = myVal2 = sval2[tid + 16];\r
                      }\r
+                }\r
+                if (n >= 16) \r
+                { \r
+                    T reg = sdata[tid + 8];\r
  \r
-                    reg = sdata[tid + 8];\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval1[tid] = myVal1 = sval1[tid + 8];\r
                          sval2[tid] = myVal2 = sval2[tid + 8];\r
                      }\r
+                }\r
+                if (n >= 8) \r
+                { \r
+                    T reg = sdata[tid + 4];\r
  \r
-                    reg = sdata[tid + 4];\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval1[tid] = myVal1 = sval1[tid + 4];\r
                          sval2[tid] = myVal2 = sval2[tid + 4];\r
                      }\r
-                \r
-                    reg = sdata[tid + 2];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 2];\r
-                        sval2[tid] = myVal2 = sval2[tid + 2];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 1];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 1];\r
-                        sval2[tid] = myVal2 = sval2[tid + 1];\r
-                    }\r
                  }\r
-            }\r
-        };\r
-\r
-        template <> struct PredVal2WarpReductor<16>\r
-        {\r
-            template <typename T, typename V1, typename V2, typename Pred> \r
-            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
-            {\r
-                if (tid < 8)\r
-                {\r
-                    myData = sdata[tid];\r
-                    myVal1 = sval1[tid];\r
-                    myVal2 = sval2[tid];\r
-\r
-                    T reg = reg = sdata[tid + 8];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 8];\r
-                        sval2[tid] = myVal2 = sval2[tid + 8];\r
-                    }\r
+                if (n >= 4) \r
+                { \r
+                    T reg = sdata[tid + 2];\r
  \r
-                    reg = sdata[tid + 4];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 4];\r
-                        sval2[tid] = myVal2 = sval2[tid + 4];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 2];\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
                          sval1[tid] = myVal1 = sval1[tid + 2];\r
                          sval2[tid] = myVal2 = sval2[tid + 2];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 1];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 1];\r
-                        sval2[tid] = myVal2 = sval2[tid + 1];\r
-                    }\r
+                    } \r
                  }\r
-            }\r
-        };\r
-        template <> struct PredVal2WarpReductor<8>\r
-        {\r
-            template <typename T, typename V1, typename V2, typename Pred> \r
-            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
-            {\r
-                if (tid < 4)\r
-                {\r
-                    myData = sdata[tid];\r
-                    myVal1 = sval1[tid];\r
-                    myVal2 = sval2[tid];\r
+                if (n >= 2) \r
+                { \r
+                    T reg = sdata[tid + 1];\r
  \r
-                    T reg = reg = sdata[tid + 4];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 4];\r
-                        sval2[tid] = myVal2 = sval2[tid + 4];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 2];\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 2];\r
-                        sval2[tid] = myVal2 = sval2[tid + 2];\r
-                    }\r
-                \r
-                    reg = sdata[tid + 1];\r
                      if (pred(reg, myData))\r
                      {\r
                          sdata[tid] = myData = reg;\r
@@ -707,135 +835,10 @@ namespace cv { namespace gpu { namespace device
                      }\r
                  }\r
              }\r
-        };\r
-\r
-        template <bool warp> struct PredVal2ReductionDispatcher;\r
-        template <> struct PredVal2ReductionDispatcher<true>\r
-        {\r
-            template <int n, typename T, typename V1, typename V2, typename Pred> \r
-            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
-            {\r
-                PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
-            }\r
-        };\r
-        template <> struct PredVal2ReductionDispatcher<false>\r
-        {\r
-            template <int n, typename T, typename V1, typename V2, typename Pred> \r
-            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)\r
-            {\r
-                myData = sdata[tid];\r
-                myVal1 = sval1[tid];\r
-                myVal2 = sval2[tid];\r
-\r
-                if (n >= 512 && tid < 256) \r
-                {\r
-                    T reg = sdata[tid + 256];\r
-\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 256];\r
-                        sval2[tid] = myVal2 = sval2[tid + 256];\r
-                    }\r
-                    __syncthreads(); \r
-                }\r
-                if (n >= 256 && tid < 128) \r
-                {\r
-                    T reg = sdata[tid + 128];\r
-\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 128];\r
-                        sval2[tid] = myVal2 = sval2[tid + 128];\r
-                    }\r
-                    __syncthreads(); \r
-                }\r
-                if (n >= 128 && tid < 64) \r
-                {\r
-                    T reg = sdata[tid + 64];\r
-\r
-                    if (pred(reg, myData))\r
-                    {\r
-                        sdata[tid] = myData = reg;\r
-                        sval1[tid] = myVal1 = sval1[tid + 64];\r
-                        sval2[tid] = myVal2 = sval2[tid + 64];\r
-                    }\r
-                    __syncthreads(); \r
-                }        \r
+        }\r
+    };\r
+}\r
  \r
-                if (tid < 32)\r
-                {\r
-                    if (n >= 64) \r
-                    { \r
-                        T reg = sdata[tid + 32];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval1[tid] = myVal1 = sval1[tid + 32];\r
-                            sval2[tid] = myVal2 = sval2[tid + 32];\r
-                        }\r
-                    }\r
-                    if (n >= 32) \r
-                    { \r
-                        T reg = sdata[tid + 16];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval1[tid] = myVal1 = sval1[tid + 16];\r
-                            sval2[tid] = myVal2 = sval2[tid + 16];\r
-                        }\r
-                    }\r
-                    if (n >= 16) \r
-                    { \r
-                        T reg = sdata[tid + 8];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval1[tid] = myVal1 = sval1[tid + 8];\r
-                            sval2[tid] = myVal2 = sval2[tid + 8];\r
-                        }\r
-                    }\r
-                    if (n >= 8) \r
-                    { \r
-                        T reg = sdata[tid + 4];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval1[tid] = myVal1 = sval1[tid + 4];\r
-                            sval2[tid] = myVal2 = sval2[tid + 4];\r
-                        }\r
-                    }\r
-                    if (n >= 4) \r
-                    { \r
-                        T reg = sdata[tid + 2];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval1[tid] = myVal1 = sval1[tid + 2];\r
-                            sval2[tid] = myVal2 = sval2[tid + 2];\r
-                        } \r
-                    }\r
-                    if (n >= 2) \r
-                    { \r
-                        T reg = sdata[tid + 1];\r
-\r
-                        if (pred(reg, myData))\r
-                        {\r
-                            sdata[tid] = myData = reg;\r
-                            sval1[tid] = myVal1 = sval1[tid + 1];\r
-                            sval2[tid] = myVal2 = sval2[tid + 1];\r
-                        }\r
-                    }\r
-                }\r
-            }\r
-        };\r
-    }\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/vec_distance_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/vec_distance_detail.hpp

index 5171654..b27fd75 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/detail/vec_distance_detail.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/vec_distance_detail.hpp
@@ -43,75 +43,77 @@
  #ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__\r
  #define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__\r
  \r
+#include "internal_shared.hpp"\r
  #include "../datamov_utils.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace detail\r
  {\r
-    namespace detail\r
+    template <int THREAD_DIM, int N> struct UnrollVecDiffCached\r
      {\r
-        template <int THREAD_DIM, int N> struct UnrollVecDiffCached\r
+        template <typename Dist, typename T1, typename T2>\r
+        static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)\r
          {\r
-            template <typename Dist, typename T1, typename T2>\r
-            static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)\r
+            if (ind < len)\r
              {\r
-                if (ind < len)\r
-                {\r
-                    T1 val1 = *vecCached++;\r
+                T1 val1 = *vecCached++;\r
  \r
-                    T2 val2;\r
-                    ForceGlob<T2>::Load(vecGlob, ind, val2);\r
+                T2 val2;\r
+                ForceGlob<T2>::Load(vecGlob, ind, val2);\r
  \r
-                    dist.reduceIter(val1, val2);\r
+                dist.reduceIter(val1, val2);\r
  \r
-                    UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);\r
-                }\r
+                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);\r
              }\r
+        }\r
  \r
-            template <typename Dist, typename T1, typename T2>\r
-            static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)\r
-            {\r
-                T1 val1 = *vecCached++;\r
+        template <typename Dist, typename T1, typename T2>\r
+        static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)\r
+        {\r
+            T1 val1 = *vecCached++;\r
  \r
-                T2 val2;\r
-                ForceGlob<T2>::Load(vecGlob, 0, val2);\r
-                vecGlob += THREAD_DIM;\r
+            T2 val2;\r
+            ForceGlob<T2>::Load(vecGlob, 0, val2);\r
+            vecGlob += THREAD_DIM;\r
  \r
-                dist.reduceIter(val1, val2);\r
+            dist.reduceIter(val1, val2);\r
  \r
-                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);\r
-            }\r
-        };\r
-        template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>\r
+            UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);\r
+        }\r
+    };\r
+    template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>\r
+    {\r
+        template <typename Dist, typename T1, typename T2>\r
+        static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)\r
          {\r
-            template <typename Dist, typename T1, typename T2>\r
-            static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)\r
-            {\r
-            }\r
+        }\r
  \r
-            template <typename Dist, typename T1, typename T2>\r
-            static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)\r
-            {\r
-            }\r
-        };\r
+        template <typename Dist, typename T1, typename T2>\r
+        static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)\r
+        {\r
+        }\r
+    };\r
  \r
-        template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;\r
-        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>\r
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;\r
+    template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>\r
+    {\r
+        template <typename Dist, typename T1, typename T2>\r
+        static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)\r
          {\r
-            template <typename Dist, typename T1, typename T2>\r
-            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)\r
-            {\r
-                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);\r
-            }\r
-        };\r
-        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>\r
+            UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);\r
+        }\r
+    };\r
+    template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>\r
+    {\r
+        template <typename Dist, typename T1, typename T2>\r
+        static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)\r
          {\r
-            template <typename Dist, typename T1, typename T2>\r
-            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)\r
-            {\r
-                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);\r
-            }\r
-        };\r
-    }\r
-}}}\r
+            UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);\r
+        }\r
+    };\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp

index 7ce6994..5d1308a 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp
@@ -43,38 +43,41 @@
  #ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__\r
  #define __OPENCV_GPU_DYNAMIC_SMEM_HPP__\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{   \r
-    template<class T> struct DynamicSharedMem\r
+#include "internal_shared.hpp"\r
+\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+   \r
+template<class T> struct DynamicSharedMem\r
+{\r
+    __device__ __forceinline__ operator T*()\r
+    {\r
+        extern __shared__ int __smem[];\r
+        return (T*)__smem;\r
+    }\r
+\r
+    __device__ __forceinline__ operator const T*() const\r
      {\r
-        __device__ __forceinline__ operator T*()\r
-        {\r
-            extern __shared__ int __smem[];\r
-            return (T*)__smem;\r
-        }\r
+        extern __shared__ int __smem[];\r
+        return (T*)__smem;\r
+    }\r
+};\r
  \r
-        __device__ __forceinline__ operator const T*() const\r
-        {\r
-            extern __shared__ int __smem[];\r
-            return (T*)__smem;\r
-        }\r
-    };\r
+// specialize for double to avoid unaligned memory access compile errors\r
+template<> struct DynamicSharedMem<double>\r
+{\r
+    __device__ __forceinline__ operator double*()\r
+    {\r
+        extern __shared__ double __smem_d[];\r
+        return (double*)__smem_d;\r
+    }\r
  \r
-    // specialize for double to avoid unaligned memory access compile errors\r
-    template<> struct DynamicSharedMem<double>\r
+    __device__ __forceinline__ operator const double*() const\r
      {\r
-        __device__ __forceinline__ operator double*()\r
-        {\r
-            extern __shared__ double __smem_d[];\r
-            return (double*)__smem_d;\r
-        }\r
+        extern __shared__ double __smem_d[];\r
+        return (double*)__smem_d;\r
+    }\r
+};\r
  \r
-        __device__ __forceinline__ operator const double*() const\r
-        {\r
-            extern __shared__ double __smem_d[];\r
-            return (double*)__smem_d;\r
-        }\r
-    };\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp

index f9c8d81..7220c81 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
@@ -43,27 +43,26 @@
  #ifndef OPENCV_GPU_EMULATION_HPP_\r
  #define OPENCV_GPU_EMULATION_HPP_\r
  \r
-#include "opencv2/gpu/device/warp_reduce.hpp"\r
+#include "internal_shared.hpp"\r
+#include "warp_reduce.hpp"\r
  \r
-namespace cv\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+struct Emulation\r
  {\r
-       namespace device\r
+       static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)\r
         {\r
-               struct Emulation\r
-               {\r
-                       static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)\r
-                       {\r
  #if __CUDA_ARCH__ >= 200\r
-                               (void)cta_buffer;\r
-                               return __ballot(predicate);\r
+               (void)cta_buffer;\r
+               return __ballot(predicate);\r
  #else\r
-                               int tid = threadIdx.x;                          \r
-                               cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;\r
-                               return warp_reduce(cta_buffer);\r
+               int tid = threadIdx.x;                          \r
+               cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;\r
+               return warp_reduce(cta_buffer);\r
  #endif\r
-                       }\r
-               };\r
         }\r
-}\r
+};\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif /* OPENCV_GPU_EMULATION_HPP_ */
 \ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/filters.hpp b/modules/gpu/src/opencv2/gpu/device/filters.hpp

index 2f8d012..5c54bd9 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/filters.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/filters.hpp
@@ -43,93 +43,95 @@
  #ifndef __OPENCV_GPU_FILTERS_HPP__\r
  #define __OPENCV_GPU_FILTERS_HPP__\r
  \r
+#include "internal_shared.hpp"\r
  #include "saturate_cast.hpp"\r
  #include "vec_traits.hpp"\r
  #include "vec_math.hpp"\r
  \r
-namespace cv {  namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename Ptr2D> struct PointFilter\r
  {\r
-    template <typename Ptr2D> struct PointFilter\r
+    typedef typename Ptr2D::elem_type elem_type;\r
+    typedef float index_type;\r
+\r
+    explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}\r
+     \r
+    __device__ __forceinline__ elem_type operator ()(float y, float x) const\r
      {\r
-        typedef typename Ptr2D::elem_type elem_type;\r
-        typedef float index_type;\r
+        return src(__float2int_rn(y), __float2int_rn(x));\r
+    }\r
+\r
+    const Ptr2D src;\r
+};\r
  \r
-        explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}\r
-         \r
-        __device__ __forceinline__ elem_type operator ()(float y, float x) const\r
-        {\r
-            return src(__float2int_rn(y), __float2int_rn(x));\r
-        }\r
+template <typename Ptr2D> struct LinearFilter\r
+{\r
+    typedef typename Ptr2D::elem_type elem_type;\r
+    typedef float index_type;\r
  \r
-        const Ptr2D src;\r
-    };\r
+    explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}\r
  \r
-    template <typename Ptr2D> struct LinearFilter\r
+    __device__ __forceinline__ elem_type operator ()(float y, float x) const\r
      {\r
-        typedef typename Ptr2D::elem_type elem_type;\r
-        typedef float index_type;\r
+        typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;\r
  \r
-        explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}\r
+        work_type out = VecTraits<work_type>::all(0);\r
  \r
-        __device__ __forceinline__ elem_type operator ()(float y, float x) const\r
-        {\r
-            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;\r
+        const int x1 = __float2int_rd(x);\r
+        const int y1 = __float2int_rd(y);\r
+        const int x2 = x1 + 1;\r
+        const int y2 = y1 + 1;\r
  \r
-            work_type out = VecTraits<work_type>::all(0);\r
+        elem_type src_reg = src(y1, x1);\r
+        out = out + src_reg * ((x2 - x) * (y2 - y));\r
  \r
-            const int x1 = __float2int_rd(x);\r
-            const int y1 = __float2int_rd(y);\r
-            const int x2 = x1 + 1;\r
-            const int y2 = y1 + 1;\r
+        src_reg = src(y1, x2);\r
+        out = out + src_reg * ((x - x1) * (y2 - y));\r
  \r
-            elem_type src_reg = src(y1, x1);\r
-            out = out + src_reg * ((x2 - x) * (y2 - y));\r
+        src_reg = src(y2, x1);\r
+        out = out + src_reg * ((x2 - x) * (y - y1));\r
  \r
-            src_reg = src(y1, x2);\r
-            out = out + src_reg * ((x - x1) * (y2 - y));\r
+        src_reg = src(y2, x2);\r
+        out = out + src_reg * ((x - x1) * (y - y1));\r
  \r
-            src_reg = src(y2, x1);\r
-            out = out + src_reg * ((x2 - x) * (y - y1));\r
+        return saturate_cast<elem_type>(out);\r
+    }\r
  \r
-            src_reg = src(y2, x2);\r
-            out = out + src_reg * ((x - x1) * (y - y1));\r
+    const Ptr2D src;\r
+};\r
  \r
-            return saturate_cast<elem_type>(out);\r
-        }\r
+template <typename Ptr2D> struct CubicFilter\r
+{\r
+    typedef typename Ptr2D::elem_type elem_type;\r
+    typedef float index_type;\r
+    typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;\r
  \r
-        const Ptr2D src;\r
-    };\r
+    explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}\r
      \r
-    template <typename Ptr2D> struct CubicFilter\r
+    static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x) \r
      {\r
-        typedef typename Ptr2D::elem_type elem_type;\r
-        typedef float index_type;\r
-        typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;\r
+        return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));\r
+    }\r
  \r
-        explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}\r
+    __device__ elem_type operator ()(float y, float x) const\r
+    {\r
+        const int xi = __float2int_rn(x);\r
+        const int yi = __float2int_rn(y);\r
+        \r
+        work_type arr[4];\r
          \r
-        static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x) \r
-        {\r
-               return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));\r
-        }\r
-\r
-        __device__ elem_type operator ()(float y, float x) const\r
-        {\r
-            const int xi = __float2int_rn(x);\r
-            const int yi = __float2int_rn(y);\r
-            \r
-               work_type arr[4];\r
-               \r
-               arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), x - xi);\r
-               arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi    , xi - 1)), saturate_cast<work_type>(src(yi    , xi)), saturate_cast<work_type>(src(yi    , xi + 1)), saturate_cast<work_type>(src(yi    , xi + 2)), x - xi);\r
-               arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), x - xi);\r
-               arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), x - xi);\r
-               \r
-               return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], y - yi));\r
-        }\r
-\r
-        const Ptr2D src;\r
-    };\r
-}}}\r
+        arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), x - xi);\r
+        arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi    , xi - 1)), saturate_cast<work_type>(src(yi    , xi)), saturate_cast<work_type>(src(yi    , xi + 1)), saturate_cast<work_type>(src(yi    , xi + 2)), x - xi);\r
+        arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), x - xi);\r
+        arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), x - xi);\r
+        \r
+        return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], y - yi));\r
+    }\r
+\r
+    const Ptr2D src;\r
+};\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_FILTERS_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp b/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp

index a3fe1c2..49eda6f 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp
@@ -44,35 +44,31 @@
  #ifndef __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_\r
  #define __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_\r
  \r
-#include<cstdio>\r
+#include <cstdio>\r
+#include "internal_shared.hpp"\r
  \r
-namespace cv\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template<class Func> \r
+void printFuncAttrib(Func& func)\r
  {\r
-    namespace gpu\r
-    {\r
-        namespace device\r
-        {\r
-            template<class Func> \r
-            void printFuncAttrib(Func& func)\r
-            {\r
  \r
-                cudaFuncAttributes attrs;\r
-                cudaFuncGetAttributes(&attrs, func);  \r
+    cudaFuncAttributes attrs;\r
+    cudaFuncGetAttributes(&attrs, func);  \r
  \r
-                printf("=== Function stats ===\n");\r
-                printf("Name: \n");\r
-                printf("sharedSizeBytes    = %d\n", attrs.sharedSizeBytes);\r
-                printf("constSizeBytes     = %d\n", attrs.constSizeBytes);\r
-                printf("localSizeBytes     = %d\n", attrs.localSizeBytes);\r
-                printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);\r
-                printf("numRegs            = %d\n", attrs.numRegs);\r
-                printf("ptxVersion         = %d\n", attrs.ptxVersion);\r
-                printf("binaryVersion      = %d\n", attrs.binaryVersion);\r
-                printf("\n");\r
-                fflush(stdout); \r
-            }\r
-        }\r
-    }\r
+    printf("=== Function stats ===\n");\r
+    printf("Name: \n");\r
+    printf("sharedSizeBytes    = %lu\n", static_cast<unsigned long>(attrs.sharedSizeBytes)); /* size_t field: %d would be a format mismatch */\r
+    printf("constSizeBytes     = %lu\n", static_cast<unsigned long>(attrs.constSizeBytes));  /* size_t field */\r
+    printf("localSizeBytes     = %lu\n", static_cast<unsigned long>(attrs.localSizeBytes));  /* size_t field */\r
+    printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);\r
+    printf("numRegs            = %d\n", attrs.numRegs);\r
+    printf("ptxVersion         = %d\n", attrs.ptxVersion);\r
+    printf("binaryVersion      = %d\n", attrs.binaryVersion);\r
+    printf("\n");\r
+    fflush(stdout); \r
  }\r
  \r
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
  #endif  /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */
 \ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp

index 58af91d..5f29c0d 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -49,182 +49,182 @@
  #include "vec_traits.hpp"\r
  #include "type_traits.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    // Function Objects\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+// Function Objects\r
  \r
-    using thrust::unary_function;\r
-    using thrust::binary_function;\r
+using thrust::unary_function;\r
+using thrust::binary_function;\r
  \r
-    // Arithmetic Operations\r
+// Arithmetic Operations\r
  \r
-    template <typename T> struct plus : binary_function<T, T, T>\r
+template <typename T> struct plus : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a + b;\r
-        }\r
-    };\r
-    template <typename T> struct minus : binary_function<T, T, T>\r
+        return a + b;\r
+    }\r
+};\r
+template <typename T> struct minus : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a - b;\r
-        }\r
-    };\r
-    template <typename T> struct multiplies : binary_function<T, T, T>\r
+        return a - b;\r
+    }\r
+};\r
+template <typename T> struct multiplies : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a * b;\r
-        }\r
-    };\r
-    template <typename T> struct divides : binary_function<T, T, T>\r
+        return a * b;\r
+    }\r
+};\r
+template <typename T> struct divides : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a / b;\r
-        }\r
-    };\r
-    template <typename T> struct modulus : binary_function<T, T, T>\r
+        return a / b;\r
+    }\r
+};\r
+template <typename T> struct modulus : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a % b;\r
-        }\r
-    };\r
-    template <typename T> struct negate : unary_function<T, T>\r
+        return a % b;\r
+    }\r
+};\r
+template <typename T> struct negate : unary_function<T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const\r
-        {\r
-            return -a;\r
-        }\r
-    };\r
+        return -a;\r
+    }\r
+};\r
  \r
-    // Comparison Operations\r
-    \r
-    template <typename T> struct equal_to : binary_function<T, T, bool>\r
+// Comparison Operations\r
+\r
+template <typename T> struct equal_to : binary_function<T, T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a == b;\r
-        }\r
-    };\r
-    template <typename T> struct not_equal_to : binary_function<T, T, bool>\r
+        return a == b;\r
+    }\r
+};\r
+template <typename T> struct not_equal_to : binary_function<T, T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a != b;\r
-        }\r
-    };\r
-    template <typename T> struct greater : binary_function<T, T, bool>\r
+        return a != b;\r
+    }\r
+};\r
+template <typename T> struct greater : binary_function<T, T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a > b;\r
-        }\r
-    };\r
-    template <typename T> struct less : binary_function<T, T, bool>\r
+        return a > b;\r
+    }\r
+};\r
+template <typename T> struct less : binary_function<T, T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a < b;\r
-        }\r
-    };\r
-    template <typename T> struct greater_equal : binary_function<T, T, bool>\r
+        return a < b;\r
+    }\r
+};\r
+template <typename T> struct greater_equal : binary_function<T, T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a >= b;\r
-        }\r
-    };\r
-    template <typename T> struct less_equal : binary_function<T, T, bool>\r
+        return a >= b;\r
+    }\r
+};\r
+template <typename T> struct less_equal : binary_function<T, T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a <= b;\r
-        }\r
-    };\r
+        return a <= b;\r
+    }\r
+};\r
  \r
-    // Logical Operations\r
-    \r
-    template <typename T> struct logical_and : binary_function<T, T, bool>\r
+// Logical Operations\r
+\r
+template <typename T> struct logical_and : binary_function<T, T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a && b;\r
-        }\r
-    };\r
-    template <typename T> struct logical_or : binary_function<T, T, bool>\r
+        return a && b;\r
+    }\r
+};\r
+template <typename T> struct logical_or : binary_function<T, T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a || b;\r
-        }\r
-    };\r
-    template <typename T> struct logical_not : unary_function<T, bool>\r
+        return a || b;\r
+    }\r
+};\r
+template <typename T> struct logical_not : unary_function<T, bool>\r
+{\r
+    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const\r
      {\r
-        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const\r
-        {\r
-            return !a;\r
-        }\r
-    };\r
+        return !a;\r
+    }\r
+};\r
  \r
-    // Bitwise Operations\r
+// Bitwise Operations\r
  \r
-    template <typename T> struct bit_and : binary_function<T, T, T>\r
+template <typename T> struct bit_and : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a & b;\r
-        }\r
-    };\r
-    template <typename T> struct bit_or : binary_function<T, T, T>\r
+        return a & b;\r
+    }\r
+};\r
+template <typename T> struct bit_or : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a | b;\r
-        }\r
-    };\r
-    template <typename T> struct bit_xor : binary_function<T, T, T>\r
+        return a | b;\r
+    }\r
+};\r
+template <typename T> struct bit_xor : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const\r
-        {\r
-            return a ^ b;\r
-        }\r
-    };\r
-    template <typename T> struct bit_not : unary_function<T, T>\r
+        return a ^ b;\r
+    }\r
+};\r
+template <typename T> struct bit_not : unary_function<T, T>\r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const \r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const \r
-        {\r
-            return ~v;\r
-        }\r
-    };\r
+        return ~v;\r
+    }\r
+};\r
  \r
-    // Generalized Identity Operations\r
+// Generalized Identity Operations\r
  \r
-    template <typename T> struct identity : unary_function<T, T>\r
+template <typename T> struct identity : unary_function<T, T>\r
+{\r
+    __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const \r
      {\r
-        __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const \r
-        {\r
-            return x;\r
-        }\r
-    };\r
+        return x;\r
+    }\r
+};\r
  \r
-    template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>\r
+template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>\r
+{\r
+    __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
      {\r
-        __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
-        {\r
-            return lhs;\r
-        }\r
-    };\r
-    template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>\r
+        return lhs;\r
+    }\r
+};\r
+template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>\r
+{\r
+    __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
      {\r
-        __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const \r
-        {\r
-            return rhs;\r
-        }\r
-    };\r
+        return rhs;\r
+    }\r
+};\r
  \r
      // Min/Max Operations\r
  \r
@@ -234,39 +234,39 @@ namespace cv { namespace gpu { namespace device
          __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \\r
      };\r
  \r
-    template <typename T> struct maximum : binary_function<T, T, T>\r
+template <typename T> struct maximum : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
      {\r
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
-        {\r
-            return lhs < rhs ? rhs : lhs;\r
-        }\r
-    };\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, max)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, max)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, max)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, max)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, max)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, max)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, max)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, fmax)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, fmax)\r
-\r
-    template <typename T> struct minimum : binary_function<T, T, T>\r
-    {\r
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
-        {\r
-            return lhs < rhs ? lhs : rhs;\r
-        }\r
-    };\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, min)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, min)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, min)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, min)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, min)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, min)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, min)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, fmin)\r
-    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, fmin)\r
+        return lhs < rhs ? rhs : lhs;\r
+    }\r
+};\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, ::max)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, ::max)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, ::max)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, ::max)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, ::max)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, ::max)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, ::fmax)\r
+\r
+template <typename T> struct minimum : binary_function<T, T, T>\r
+{\r
+    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const \r
+    {\r
+        return lhs < rhs ? lhs : rhs;\r
+    }\r
+};\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, ::min)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, ::min)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, ::min)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, ::min)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, ::min)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, ::min)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, ::fmin)\r
+OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_MINMAX\r
  \r
@@ -277,14 +277,14 @@ namespace cv { namespace gpu { namespace device
      { \\r
          __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \\r
          { \\r
-            return func ## f(v); \\r
+            return ::func ## f(v); \\r
          } \\r
      }; \\r
      template <> struct func ## _func<double> : unary_function<double, double> \\r
      { \\r
          __device__ __forceinline__ double operator ()(double v) const \\r
          { \\r
-            return func(v); \\r
+            return ::func(v); \\r
          } \\r
      };\r
  #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \\r
@@ -292,266 +292,270 @@ namespace cv { namespace gpu { namespace device
      { \\r
          __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \\r
          { \\r
-            return func ## f(v1, v2); \\r
+            return ::func ## f(v1, v2); \\r
          } \\r
      }; \\r
      template <> struct func ## _func<double> : binary_function<double, double, double> \\r
      { \\r
          __device__ __forceinline__ double operator ()(double v1, double v2) const \\r
          { \\r
-            return func(v1, v2); \\r
+            return ::func(v1, v2); \\r
          } \\r
      };\r
  \r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh)\r
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh)\r
-\r
-    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot)\r
-    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2)\r
-    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh)\r
+OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh)\r
+\r
+OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot)\r
+OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2)\r
+OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR\r
  #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR\r
  \r
-    template<typename T> struct hypot_sqr_func : binary_function<T, T, float> \r
+template<typename T> struct hypot_sqr_func : binary_function<T, T, float> \r
+{\r
+    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const\r
      {\r
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const\r
-        {\r
-            return src1 * src1 + src2 * src2;\r
-        }\r
-    };\r
+        return src1 * src1 + src2 * src2;\r
+    }\r
+};\r
  \r
-    // Saturate Cast Functor\r
+// Saturate Cast Functor\r
  \r
-    template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>\r
+template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>\r
+{\r
+    __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const\r
      {\r
-        __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const\r
-        {\r
-            return saturate_cast<D>(v);\r
-        }\r
-    };\r
+        return saturate_cast<D>(v);\r
+    }\r
+};\r
+\r
+// Threshold Functors\r
  \r
-    // Threshold Functors\r
+template <typename T> struct thresh_binary_func : unary_function<T, T>\r
+{\r
+    __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
  \r
-    template <typename T> struct thresh_binary_func : unary_function<T, T>\r
+    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
      {\r
-        __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
+        return (src > thresh) * maxVal;\r
+    }\r
  \r
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
-        {\r
-            return (src > thresh) * maxVal;\r
-        }\r
+    const T thresh;\r
+    const T maxVal;\r
+};\r
+template <typename T> struct thresh_binary_inv_func : unary_function<T, T>\r
+{\r
+    __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
  \r
-        const T thresh;\r
-        const T maxVal;\r
-    };\r
-    template <typename T> struct thresh_binary_inv_func : unary_function<T, T>\r
+    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
      {\r
-        __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}\r
+        return (src <= thresh) * maxVal;\r
+    }\r
  \r
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
-        {\r
-            return (src <= thresh) * maxVal;\r
-        }\r
+    const T thresh;\r
+    const T maxVal;\r
+};\r
+template <typename T> struct thresh_trunc_func : unary_function<T, T>\r
+{\r
+    explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
  \r
-        const T thresh;\r
-        const T maxVal;\r
-    };\r
-    template <typename T> struct thresh_trunc_func : unary_function<T, T>\r
+    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
      {\r
-        explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
+        return minimum<T>()(src, thresh);\r
+    }\r
  \r
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
-        {\r
-            return minimum<T>()(src, thresh);\r
-        }\r
+    const T thresh;\r
+};\r
+template <typename T> struct thresh_to_zero_func : unary_function<T, T>\r
+{\r
+    explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
  \r
-        const T thresh;\r
-    };\r
-    template <typename T> struct thresh_to_zero_func : unary_function<T, T>\r
+    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
      {\r
-        explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
+        return (src > thresh) * src;\r
+    }\r
  \r
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
-        {\r
-            return (src > thresh) * src;\r
-        }\r
+    const T thresh;\r
+};\r
+template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>\r
+{\r
+    explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
  \r
-        const T thresh;\r
-    };\r
-    template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>\r
+    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
      {\r
-        explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}\r
+        return (src <= thresh) * src;\r
+    }\r
  \r
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const\r
-        {\r
-            return (src <= thresh) * src;\r
-        }\r
+    const T thresh;\r
+};\r
  \r
-        const T thresh;\r
-    };    \r
+// Function Object Adaptors\r
  \r
-    // Function Object Adaptors\r
+template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>\r
+{\r
+  explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}\r
  \r
-    template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>\r
-    {\r
-      explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}\r
+  __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const\r
+  { \r
+      return !pred(x); \r
+  }\r
  \r
-      __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const\r
-      { \r
-          return !pred(x); \r
-      }\r
+  const Predicate pred;\r
+};\r
+template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)\r
+{\r
+    return unary_negate<Predicate>(pred);\r
+}\r
  \r
-      const Predicate pred;\r
-    };\r
-    template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)\r
-    {\r
-        return unary_negate<Predicate>(pred);\r
+template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>\r
+{\r
+    explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}\r
+\r
+    __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const\r
+    { \r
+        return !pred(x,y); \r
      }\r
  \r
-    template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>\r
-    {\r
-        explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}\r
+    const Predicate pred;\r
+};\r
+template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)\r
+{\r
+    return binary_negate<BinaryPredicate>(pred);\r
+}\r
  \r
-        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const\r
-        { \r
-            return !pred(x,y); \r
-        }\r
+template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> \r
+{\r
+    __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}\r
  \r
-        const Predicate pred;\r
-    };\r
-    template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)\r
+    __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const\r
      {\r
-        return binary_negate<BinaryPredicate>(pred);\r
+        return op(arg1, a);\r
      }\r
  \r
-    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> \r
-    {\r
-        __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}\r
+    const Op op;\r
+    const typename Op::first_argument_type arg1;\r
+};\r
+template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)\r
+{\r
+    return binder1st<Op>(op, typename Op::first_argument_type(x));\r
+}\r
  \r
-        __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const\r
-        {\r
-            return op(arg1, a);\r
-        }\r
+template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type> \r
+{\r
+    __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}\r
  \r
-        const Op op;\r
-        const typename Op::first_argument_type arg1;\r
-    };\r
-    template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)\r
+    __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const\r
      {\r
-        return binder1st<Op>(op, typename Op::first_argument_type(x));\r
+        return op(a, arg2);\r
      }\r
  \r
-    template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type> \r
-    {\r
-        __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}\r
+    const Op op;\r
+    const typename Op::second_argument_type arg2;\r
+};\r
+template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)\r
+{ /* builds a binder2nd<Op>; x is first converted to Op::second_argument_type */\r
+    return binder2nd<Op>(op, typename Op::second_argument_type(x));\r
+}\r
  \r
-        __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const\r
-        {\r
-            return op(a, arg2);\r
-        }\r
+// Functor Traits\r
  \r
-        const Op op;\r
-        const typename Op::second_argument_type arg2;\r
-    };\r
-    template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)\r
-    {\r
-        return binder2nd<Op>(op, typename Op::second_argument_type(x));\r
-    }\r
+template <typename F> struct IsUnaryFunction\r
+{\r
+    typedef char Yes;\r
+    struct No {Yes a[2];};\r
  \r
-    // Functor Traits\r
+    template <typename T, typename D> static Yes check(unary_function<T, D>);\r
+    static No check(...);\r
  \r
-    template <typename F> struct IsUnaryFunction\r
-    {\r
-        typedef char Yes;\r
-        struct No {Yes a[2];};\r
+    static F makeF();\r
  \r
-        template <typename T, typename D> static Yes check(unary_function<T, D>);\r
-        static No check(...);\r
+    enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };\r
+};\r
  \r
-        static F makeF();\r
+template <typename F> struct IsBinaryFunction\r
+{\r
+    typedef char Yes;\r
+    struct No {Yes a[2];};\r
  \r
-        enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };\r
-    };\r
+    template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);\r
+    static No check(...);\r
  \r
-    template <typename F> struct IsBinaryFunction\r
-    {\r
-        typedef char Yes;\r
-        struct No {Yes a[2];};\r
+    static F makeF();\r
  \r
-        template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);\r
-        static No check(...);\r
+    enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };\r
+};\r
  \r
-        static F makeF();\r
+namespace detail\r
+{\r
+    template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; }; /* fallback: shift = 1 */\r
+    template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; }; /* dst_elem_size == 1 */\r
+    template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; }; /* dst_elem_size == 2 */\r
  \r
-        enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };\r
+    template <typename T, typename D> struct DefaultUnaryShift\r
+    {\r
+        enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };\r
      };\r
+    \r
+    template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; }; /* fallback: shift = 1 */\r
+    template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; }; /* dst_elem_size == 1 */\r
+    template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; }; /* dst_elem_size == 2 */\r
  \r
-    namespace detail\r
+    template <typename T1, typename T2, typename D> struct DefaultBinaryShift\r
      {\r
-        template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };\r
-        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };\r
-        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };\r
+        enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };\r
+    };\r
  \r
-        template <typename T, typename D> struct DefaultUnaryShift\r
-        {\r
-            enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };\r
-        };\r
-        \r
-        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };\r
-        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };\r
-        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };\r
+    template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher; /* second argument defaults to IsUnaryFunction<Func>::value */\r
+    template <typename Func> struct ShiftDispatcher<Func, true> /* unary case: shift from argument_type and result_type */\r
+    {\r
+        enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };\r
+    };\r
+    template <typename Func> struct ShiftDispatcher<Func, false> /* non-unary case: Func must expose the binary_function typedefs */\r
+    {\r
+        enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };\r
+    };\r
+}\r
  \r
-        template <typename T1, typename T2, typename D> struct DefaultBinaryShift\r
-        {\r
-            enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };\r
-        };\r
+template <typename Func> struct DefaultTransformShift /* exposes the shift selected by detail::ShiftDispatcher<Func> */\r
+{\r
+    enum { shift = detail::ShiftDispatcher<Func>::shift };\r
+};\r
  \r
-        template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;\r
-        template <typename Func> struct ShiftDispatcher<Func, true>\r
-        {\r
-            enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };\r
-        };\r
-        template <typename Func> struct ShiftDispatcher<Func, false>\r
-        {\r
-            enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };\r
-        };\r
-    }\r
+template <typename Func> struct DefaultTransformFunctorTraits\r
+{\r
+    enum { simple_block_dim_x = 16 };\r
+    enum { simple_block_dim_y = 16 };\r
  \r
-    template <typename Func> struct DefaultTransformShift\r
-    {\r
-        enum { shift = detail::ShiftDispatcher<Func>::shift };\r
-    };\r
+    enum { smart_block_dim_x = 16 };\r
+    enum { smart_block_dim_y = 16 };\r
+    enum { smart_shift = DefaultTransformShift<Func>::shift };\r
+};\r
  \r
-    template <typename Func> struct DefaultTransformFunctorTraits\r
-    {\r
-        enum { simple_block_dim_x = 16 };\r
-        enum { simple_block_dim_y = 16 };\r
+template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {}; /* primary template: inherits every default unchanged */\r
  \r
-        enum { smart_block_dim_x = 16 };\r
-        enum { smart_block_dim_y = 16 };\r
-        enum { smart_shift = DefaultTransformShift<Func>::shift };\r
-    };\r
+#define DEFINE_TRANSFORM_FUNCTOR_TRAITS(type) \\r
+    template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >\r
  \r
-    template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_FUNCTIONAL_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/limits.hpp b/modules/gpu/src/opencv2/gpu/device/limits.hpp

index b0b73f2..b2e53e1 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/limits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/limits.hpp
@@ -43,190 +43,193 @@
  #ifndef __OPENCV_GPU_LIMITS_GPU_HPP__\r
  #define __OPENCV_GPU_LIMITS_GPU_HPP__\r
  \r
-namespace cv { namespace gpu { namespace device\r
+#include "internal_shared.hpp"\r
+\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template<class T> struct numeric_limits /* primary template: every query returns a value-initialized T */\r
+{\r
+    typedef T type;\r
+    __device__ __forceinline__ static type min()  { return type(); };\r
+    __device__ __forceinline__ static type max() { return type(); };\r
+    __device__ __forceinline__ static type epsilon() { return type(); }\r
+    __device__ __forceinline__ static type round_error() { return type(); }\r
+    __device__ __forceinline__ static type denorm_min()  { return type(); }\r
+    __device__ __forceinline__ static type infinity() { return type(); }\r
+    __device__ __forceinline__ static type quiet_NaN() { return type(); }\r
+    __device__ __forceinline__ static type signaling_NaN() { return T(); }\r
+    static const bool is_signed; /* declared without an initializer in the primary template */\r
+};\r
+\r
+template<> struct numeric_limits<bool> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef bool type;\r
+    __device__ __forceinline__ static type min() { return false; };\r
+    __device__ __forceinline__ static type max() { return true;  };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = false;\r
+};\r
+\r
+template<> struct numeric_limits<char> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef char type;\r
+    __device__ __forceinline__ static type min() { return CHAR_MIN; };\r
+    __device__ __forceinline__ static type max() { return CHAR_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = (char)-1 == -1; /* true only when plain char is a signed type on the target */\r
+};\r
+\r
+template<> struct numeric_limits<signed char> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef signed char type; /* must be signed char: plain char may be unsigned on some targets */\r
+    __device__ __forceinline__ static type min() { return SCHAR_MIN; }; /* SCHAR_*, not CHAR_*, gives the signed char range */\r
+    __device__ __forceinline__ static type max() { return SCHAR_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = (signed char)-1 == -1;\r
+};\r
+\r
+template<> struct numeric_limits<unsigned char> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef unsigned char type;\r
+    __device__ __forceinline__ static type min() { return 0; };\r
+    __device__ __forceinline__ static type max() { return UCHAR_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = false;\r
+};\r
+\r
+template<> struct numeric_limits<short> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef short type;\r
+    __device__ __forceinline__ static type min() { return SHRT_MIN; };\r
+    __device__ __forceinline__ static type max() { return SHRT_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = true;\r
+};\r
+\r
+template<> struct numeric_limits<unsigned short> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef unsigned short type;\r
+    __device__ __forceinline__ static type min() { return 0; };\r
+    __device__ __forceinline__ static type max() { return USHRT_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = false;\r
+};\r
+\r
+template<> struct numeric_limits<int> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef int type;\r
+    __device__ __forceinline__ static type min() { return INT_MIN; };\r
+    __device__ __forceinline__ static type max() { return INT_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = true;\r
+};\r
+\r
+\r
+template<> struct numeric_limits<unsigned int> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef unsigned int type;\r
+    __device__ __forceinline__ static type min() { return 0; };\r
+    __device__ __forceinline__ static type max() { return UINT_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = false;\r
+};\r
+\r
+template<> struct numeric_limits<long> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef long type;\r
+    __device__ __forceinline__ static type min() { return LONG_MIN; };\r
+    __device__ __forceinline__ static type max() { return LONG_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = true;\r
+};\r
+\r
+template<> struct numeric_limits<unsigned long> /* only min()/max() have bodies here; the other queries are declarations */\r
+{\r
+    typedef unsigned long type;\r
+    __device__ __forceinline__ static type min() { return 0; };\r
+    __device__ __forceinline__ static type max() { return ULONG_MAX; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = false;\r
+};\r
+\r
+template<> struct numeric_limits<float> /* min()/max()/epsilon() have bodies here; the other queries are declarations */\r
+{\r
+    typedef float type;\r
+    __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };\r
+    __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };\r
+    __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = true;\r
+};\r
+\r
+template<> struct numeric_limits<double>\r
  {\r
-    template<class T> struct numeric_limits\r
-    {\r
-        typedef T type;\r
-        __device__ __forceinline__ static type min()  { return type(); };\r
-        __device__ __forceinline__ static type max() { return type(); };\r
-        __device__ __forceinline__ static type epsilon() { return type(); }\r
-        __device__ __forceinline__ static type round_error() { return type(); }\r
-        __device__ __forceinline__ static type denorm_min()  { return type(); }\r
-        __device__ __forceinline__ static type infinity() { return type(); }\r
-        __device__ __forceinline__ static type quiet_NaN() { return type(); }\r
-        __device__ __forceinline__ static type signaling_NaN() { return T(); }\r
-        static const bool is_signed;\r
-    };\r
-\r
-    template<> struct numeric_limits<bool>\r
-    {\r
-        typedef bool type;\r
-        __device__ __forceinline__ static type min() { return false; };\r
-        __device__ __forceinline__ static type max() { return true;  };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = false;\r
-    };\r
-\r
-    template<> struct numeric_limits<char>\r
-    {\r
-        typedef char type;\r
-        __device__ __forceinline__ static type min() { return CHAR_MIN; };\r
-        __device__ __forceinline__ static type max() { return CHAR_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = (char)-1 == -1;\r
-    };\r
-\r
-     template<> struct numeric_limits<signed char>\r
-    {\r
-        typedef char type;\r
-        __device__ __forceinline__ static type min() { return CHAR_MIN; };\r
-        __device__ __forceinline__ static type max() { return CHAR_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = (signed char)-1 == -1;\r
-    };\r
-\r
-    template<> struct numeric_limits<unsigned char>\r
-    {\r
-        typedef unsigned char type;\r
-        __device__ __forceinline__ static type min() { return 0; };\r
-        __device__ __forceinline__ static type max() { return UCHAR_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = false;\r
-    };\r
-\r
-    template<> struct numeric_limits<short>\r
-    {\r
-        typedef short type;\r
-        __device__ __forceinline__ static type min() { return SHRT_MIN; };\r
-        __device__ __forceinline__ static type max() { return SHRT_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = true;\r
-    };\r
-\r
-    template<> struct numeric_limits<unsigned short>\r
-    {\r
-        typedef unsigned short type;\r
-        __device__ __forceinline__ static type min() { return 0; };\r
-        __device__ __forceinline__ static type max() { return USHRT_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = false;\r
-    };\r
-\r
-    template<> struct numeric_limits<int>\r
-    {\r
-        typedef int type;\r
-        __device__ __forceinline__ static type min() { return INT_MIN; };\r
-        __device__ __forceinline__ static type max() { return INT_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = true;\r
-    };\r
-\r
-\r
-    template<> struct numeric_limits<unsigned int>\r
-    {\r
-        typedef unsigned int type;\r
-        __device__ __forceinline__ static type min() { return 0; };\r
-        __device__ __forceinline__ static type max() { return UINT_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = false;\r
-    };\r
-\r
-    template<> struct numeric_limits<long>\r
-    {\r
-        typedef long type;\r
-        __device__ __forceinline__ static type min() { return LONG_MIN; };\r
-        __device__ __forceinline__ static type max() { return LONG_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = true;\r
-    };\r
-\r
-    template<> struct numeric_limits<unsigned long>\r
-    {\r
-        typedef unsigned long type;\r
-        __device__ __forceinline__ static type min() { return 0; };\r
-        __device__ __forceinline__ static type max() { return ULONG_MAX; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = false;\r
-    };\r
-\r
-    template<> struct numeric_limits<float>\r
-    {\r
-        typedef float type;\r
-        __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };\r
-        __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };\r
-        __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = true;\r
-    };\r
-\r
-    template<> struct numeric_limits<double>\r
-    {\r
-        typedef double type;\r
-        __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };\r
-        __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };\r
-        __device__ __forceinline__ static type epsilon();\r
-        __device__ __forceinline__ static type round_error();\r
-        __device__ __forceinline__ static type denorm_min();\r
-        __device__ __forceinline__ static type infinity();\r
-        __device__ __forceinline__ static type quiet_NaN();\r
-        __device__ __forceinline__ static type signaling_NaN();\r
-        static const bool is_signed = true;\r
-    };\r
-}}}\r
+    typedef double type;\r
+    __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };\r
+    __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };\r
+    __device__ __forceinline__ static type epsilon();\r
+    __device__ __forceinline__ static type round_error();\r
+    __device__ __forceinline__ static type denorm_min();\r
+    __device__ __forceinline__ static type infinity();\r
+    __device__ __forceinline__ static type quiet_NaN();\r
+    __device__ __forceinline__ static type signaling_NaN();\r
+    static const bool is_signed = true;\r
+};\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_LIMITS_GPU_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp

index ca6159f..34cbbbf 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
@@ -45,122 +45,173 @@
  \r
  #include "internal_shared.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }\r
-    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }\r
-    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }\r
-    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }\r
-    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }\r
-    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }\r
-    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }\r
-    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } /* primary templates: one overload per source type, each a plain _Tp(v) conversion with no clamping */\r
+template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }\r
+template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }\r
+template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }\r
+template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }\r
+template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }\r
+template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }\r
+template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }\r
  \r
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)\r
-    { return (uchar)max((int)v, 0); }\r
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)\r
-    { return (uchar)min((uint)v, (uint)UCHAR_MAX); }\r
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)\r
-    { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }\r
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)\r
-    { return (uchar)min(v, (uint)UCHAR_MAX); }\r
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)\r
-    { return saturate_cast<uchar>((uint)v); }\r
+template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)\r
+{ /* schar -> uchar: negative inputs become 0 */\r
+    return (uchar) ::max((int)v, 0); \r
+}\r
+template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)\r
+{ /* ushort -> uchar: cap at UCHAR_MAX */\r
+    return (uchar) ::min((uint)v, (uint)UCHAR_MAX); \r
+}\r
+template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)\r
+{ /* int -> uchar: the unsigned compare holds only for 0 <= v <= UCHAR_MAX; out-of-range values pick the bound by sign */\r
+    return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); \r
+}\r
+template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)\r
+{ /* uint -> uchar: cap at UCHAR_MAX */\r
+    return (uchar) ::min(v, (uint)UCHAR_MAX); \r
+}\r
+template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)\r
+{ /* short -> uchar: widen to int (not uint) so negative inputs clamp to 0 instead of wrapping and saturating to UCHAR_MAX */\r
+    return saturate_cast<uchar>((int)v); \r
+}\r
+\r
+template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)\r
+{ /* float -> uchar: convert to int with __float2int_rn, then clamp through the int specialization */\r
+    int iv = __float2int_rn(v); \r
+    return saturate_cast<uchar>(iv); \r
+}\r
+template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)\r
+{ /* double -> uchar: uses __double2int_rn when __CUDA_ARCH__ >= 130, otherwise narrows to float and reuses the float path */\r
+#if __CUDA_ARCH__ >= 130\r
+    int iv = __double2int_rn(v); \r
+    return saturate_cast<uchar>(iv);\r
+#else\r
+    return saturate_cast<uchar>((float)v);\r
+#endif\r
+}\r
  \r
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)\r
-    { int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }\r
-    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)\r
-    {\r
-    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130\r
-        int iv = __double2int_rn(v); return saturate_cast<uchar>(iv);\r
-    #else\r
-        return saturate_cast<uchar>((float)v);\r
-    #endif\r
-    }\r
+template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)\r
+{ /* uchar -> schar: cap at SCHAR_MAX */\r
+    return (schar) ::min((int)v, SCHAR_MAX); \r
+}\r
+template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)\r
+{ /* ushort -> schar: cap at SCHAR_MAX */\r
+    return (schar) ::min((uint)v, (uint)SCHAR_MAX); \r
+}\r
+template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)\r
+{ /* int -> schar: (v - SCHAR_MIN) shifts the valid range onto [0, UCHAR_MAX], so one unsigned compare checks both bounds */\r
+    return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);\r
+}\r
+template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)\r
+{ /* short -> schar: widen to int and reuse the int specialization */\r
+    return saturate_cast<schar>((int)v); \r
+}\r
+template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)\r
+{ /* uint -> schar: cap at SCHAR_MAX */\r
+    return (schar) ::min(v, (uint)SCHAR_MAX); \r
+}\r
  \r
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)\r
-    { return (schar)min((int)v, SCHAR_MAX); }\r
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)\r
-    { return (schar)min((uint)v, (uint)SCHAR_MAX); }\r
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)\r
-    {\r
-        return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ?\r
-                    v : v > 0 ? SCHAR_MAX : SCHAR_MIN);\r
-    }\r
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)\r
-    { return saturate_cast<schar>((int)v); }\r
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)\r
-    { return (schar)min(v, (uint)SCHAR_MAX); }\r
+template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)\r
+{ /* float -> schar: convert to int with __float2int_rn, then clamp through the int specialization */\r
+    int iv = __float2int_rn(v); \r
+    return saturate_cast<schar>(iv); \r
+}\r
+template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)\r
+{ /* double -> schar: uses __double2int_rn when __CUDA_ARCH__ >= 130, otherwise narrows to float and reuses the float path */\r
+#if __CUDA_ARCH__ >= 130\r
+    int iv = __double2int_rn(v); \r
+    return saturate_cast<schar>(iv);\r
+#else\r
+    return saturate_cast<schar>((float)v);\r
+#endif\r
+}\r
  \r
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)\r
-    { int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }\r
-    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)\r
-    {             \r
-    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130\r
-        int iv = __double2int_rn(v); return saturate_cast<schar>(iv);\r
-    #else\r
-        return saturate_cast<schar>((float)v);\r
-    #endif\r
-    }\r
+template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)\r
+{ /* schar -> ushort: negative inputs become 0 */\r
+    return (ushort) ::max((int)v, 0); \r
+}\r
+template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)\r
+{ /* short -> ushort: negative inputs become 0 */\r
+    return (ushort) ::max((int)v, 0); \r
+}\r
+template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)\r
+{ /* int -> ushort: the unsigned compare holds only for 0 <= v <= USHRT_MAX; out-of-range values pick the bound by sign */\r
+    return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); \r
+}\r
+template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)\r
+{ /* uint -> ushort: cap at USHRT_MAX */\r
+    return (ushort) ::min(v, (uint)USHRT_MAX); \r
+}\r
+template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)\r
+{ /* float -> ushort: convert to int with __float2int_rn, then clamp through the int specialization */\r
+    int iv = __float2int_rn(v); \r
+    return saturate_cast<ushort>(iv); \r
+}\r
+template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)\r
+{ /* double -> ushort: uses __double2int_rn when __CUDA_ARCH__ >= 130, otherwise narrows to float and reuses the float path */\r
+#if __CUDA_ARCH__ >= 130\r
+    int iv = __double2int_rn(v); \r
+    return saturate_cast<ushort>(iv);\r
+#else\r
+    return saturate_cast<ushort>((float)v);\r
+#endif\r
+}\r
  \r
-    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)\r
-    { return (ushort)max((int)v, 0); }\r
-    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)\r
-    { return (ushort)max((int)v, 0); }\r
-    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)\r
-    { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }\r
-    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)\r
-    { return (ushort)min(v, (uint)USHRT_MAX); }\r
-    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)\r
-    { int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }\r
-    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)\r
-    {             \r
-    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130\r
-        int iv = __double2int_rn(v); return saturate_cast<ushort>(iv);\r
-    #else\r
-        return saturate_cast<ushort>((float)v);\r
-    #endif\r
-    }\r
+template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)\r
+{ /* ushort -> short: cap at SHRT_MAX */\r
+    return (short) ::min((int)v, SHRT_MAX); \r
+}\r
+template<> __device__ __forceinline__ short saturate_cast<short>(int v)\r
+{ /* int -> short: (v - SHRT_MIN) shifts the valid range onto [0, USHRT_MAX], so one unsigned compare checks both bounds */\r
+    return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);\r
+}\r
+template<> __device__ __forceinline__ short saturate_cast<short>(uint v)\r
+{ /* uint -> short: cap at SHRT_MAX */\r
+    return (short) ::min(v, (uint)SHRT_MAX); \r
+}\r
+template<> __device__ __forceinline__ short saturate_cast<short>(float v)\r
+{ /* float -> short: convert to int with __float2int_rn, then clamp through the int specialization */\r
+    int iv = __float2int_rn(v); \r
+    return saturate_cast<short>(iv); \r
+}\r
+template<> __device__ __forceinline__ short saturate_cast<short>(double v)\r
+{ /* double -> short: uses __double2int_rn when __CUDA_ARCH__ >= 130, otherwise narrows to float and reuses the float path */\r
+#if __CUDA_ARCH__ >= 130\r
+    int iv = __double2int_rn(v); \r
+    return saturate_cast<short>(iv);\r
+#else\r
+    return saturate_cast<short>((float)v);\r
+#endif\r
+}\r
  \r
-    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)\r
-    { return (short)min((int)v, SHRT_MAX); }\r
-    template<> __device__ __forceinline__ short saturate_cast<short>(int v)\r
-    {\r
-        return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ?\r
-                v : v > 0 ? SHRT_MAX : SHRT_MIN);\r
-    }\r
-    template<> __device__ __forceinline__ short saturate_cast<short>(uint v)\r
-    { return (short)min(v, (uint)SHRT_MAX); }\r
-    template<> __device__ __forceinline__ short saturate_cast<short>(float v)\r
-    { int iv = __float2int_rn(v); return saturate_cast<short>(iv); }\r
-    template<> __device__ __forceinline__ short saturate_cast<short>(double v)\r
-    {            \r
-    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130\r
-        int iv = __double2int_rn(v); return saturate_cast<short>(iv);\r
-    #else\r
-        return saturate_cast<short>((float)v);\r
-    #endif\r
-    }\r
+template<> __device__ __forceinline__ int saturate_cast<int>(float v) \r
+{ /* float -> int: delegated entirely to the __float2int_rn intrinsic */\r
+    return __float2int_rn(v); \r
+}\r
+template<> __device__ __forceinline__ int saturate_cast<int>(double v) \r
+{ /* double -> int: uses __double2int_rn when __CUDA_ARCH__ >= 130, otherwise narrows to float and reuses the float path */\r
+#if __CUDA_ARCH__ >= 130 \r
+    return __double2int_rn(v);\r
+#else\r
+    return saturate_cast<int>((float)v);\r
+#endif\r
+}\r
  \r
-    template<> __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); }\r
-    template<> __device__ __forceinline__ int saturate_cast<int>(double v) \r
-    {\r
-    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 \r
-        return __double2int_rn(v);\r
-    #else\r
-        return saturate_cast<int>((float)v);\r
-    #endif\r
-    }\r
+template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)\r
+{ /* float -> uint: delegated entirely to the __float2uint_rn intrinsic */\r
+    return __float2uint_rn(v); \r
+}\r
+template<> __device__ __forceinline__ uint saturate_cast<uint>(double v) \r
+{ /* double -> uint: uses __double2uint_rn when __CUDA_ARCH__ >= 130, otherwise narrows to float and reuses the float path */\r
+#if __CUDA_ARCH__ >= 130\r
+    return __double2uint_rn(v);\r
+#else\r
+    return saturate_cast<uint>((float)v);\r
+#endif\r
+}\r
  \r
-    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }\r
-    template<> __device__ __forceinline__ uint saturate_cast<uint>(double v) \r
-    {            \r
-    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130\r
-        return __double2uint_rn(v);\r
-    #else\r
-        return saturate_cast<uint>((float)v);\r
-    #endif\r
-    }\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
 \ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp

index 4787292..81f427d 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@@ -43,32 +43,34 @@
  #ifndef __OPENCV_GPU_TRANSFORM_HPP__\r
  #define __OPENCV_GPU_TRANSFORM_HPP__\r
  \r
-#include "detail/transform_detail.hpp"\r
+#include "internal_shared.hpp"\r
  #include "utility.hpp"\r
+#include "detail/transform_detail.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T, typename D, typename UnOp>\r
+void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)\r
+{ /* one-source overload without a mask: forwards to detail::transform_caller with a WithOutMask() predicate */\r
+    detail::transform_caller(src, dst, op, WithOutMask(), stream);\r
+}\r
+template <typename T, typename D, typename UnOp>\r
+void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0)\r
+{ /* one-source overload with a mask: wraps mask in SingleMask and forwards to detail::transform_caller */\r
+    detail::transform_caller(src, dst, op, SingleMask(mask), stream);\r
+}\r
+\r
+template <typename T1, typename T2, typename D, typename BinOp>\r
+void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)\r
+{ /* two-source overload without a mask: forwards to detail::transform_caller with a WithOutMask() predicate */\r
+    detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);\r
+}\r
+template <typename T1, typename T2, typename D, typename BinOp>\r
+void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0)\r
  {\r
-    template <typename T, typename D, typename UnOp>\r
-    void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)\r
-    {\r
-        detail::transform_caller(src, dst, op, WithOutMask(), stream);\r
-    }\r
-    template <typename T, typename D, typename UnOp>\r
-    void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0)\r
-    {\r
-        detail::transform_caller(src, dst, op, SingleMask(mask), stream);\r
-    }\r
+    detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);\r
+}\r
  \r
-    template <typename T1, typename T2, typename D, typename BinOp>\r
-    void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)\r
-    {\r
-        detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);\r
-    }\r
-    template <typename T1, typename T2, typename D, typename BinOp>\r
-    void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0)\r
-    {\r
-        detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);\r
-    }\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_TRANSFORM_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp

index 24f02ef..9553d22 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
@@ -43,38 +43,40 @@
  #ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__\r
  #define __OPENCV_GPU_TYPE_TRAITS_HPP__\r
  \r
+#include "internal_shared.hpp"\r
  #include "detail/type_traits_detail.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T> struct IsSimpleParameter\r
+{ /* value is nonzero when detail::IsIntegral or detail::IsFloat accepts T, or detail::PointerTraits accepts detail::ReferenceTraits<T>::type; presumably marks types cheap to pass by value - confirm against type_traits_detail.hpp */\r
+    enum {value = detail::IsIntegral<T>::value || detail::IsFloat<T>::value || detail::PointerTraits<typename detail::ReferenceTraits<T>::type>::value};\r
+};\r
+\r
+template <typename T> struct TypeTraits\r
  {\r
-    template <typename T> struct IsSimpleParameter\r
-    {\r
-        enum {value = detail::IsIntegral<T>::value || detail::IsFloat<T>::value || detail::PointerTraits<typename detail::ReferenceTraits<T>::type>::value};\r
-    };\r
+    typedef typename detail::UnConst<T>::type                                       NonConstType;\r
+    typedef typename detail::UnVolatile<T>::type                                    NonVolatileType;\r
+    typedef typename detail::UnVolatile<typename detail::UnConst<T>::type>::type    UnqualifiedType;\r
+    typedef typename detail::PointerTraits<UnqualifiedType>::type                   PointeeType;\r
+    typedef typename detail::ReferenceTraits<T>::type                               ReferredType;\r
  \r
-    template <typename T> struct TypeTraits\r
-    {\r
-        typedef typename detail::UnConst<T>::type                                       NonConstType;\r
-        typedef typename detail::UnVolatile<T>::type                                    NonVolatileType;\r
-        typedef typename detail::UnVolatile<typename detail::UnConst<T>::type>::type    UnqualifiedType;\r
-        typedef typename detail::PointerTraits<UnqualifiedType>::type                   PointeeType;\r
-        typedef typename detail::ReferenceTraits<T>::type                               ReferredType;\r
+    enum { isConst          = detail::UnConst<T>::value };\r
+    enum { isVolatile       = detail::UnVolatile<T>::value };\r
  \r
-        enum { isConst          = detail::UnConst<T>::value };\r
-        enum { isVolatile       = detail::UnVolatile<T>::value };\r
+    enum { isReference      = detail::ReferenceTraits<UnqualifiedType>::value };\r
+    enum { isPointer        = detail::PointerTraits<typename detail::ReferenceTraits<UnqualifiedType>::type>::value };        \r
  \r
-        enum { isReference      = detail::ReferenceTraits<UnqualifiedType>::value };\r
-        enum { isPointer        = detail::PointerTraits<typename detail::ReferenceTraits<UnqualifiedType>::type>::value };        \r
+    enum { isUnsignedInt = detail::IsUnsignedIntegral<UnqualifiedType>::value };\r
+    enum { isSignedInt   = detail::IsSignedIntergral<UnqualifiedType>::value };\r
+    enum { isIntegral    = detail::IsIntegral<UnqualifiedType>::value };\r
+    enum { isFloat       = detail::IsFloat<UnqualifiedType>::value  };\r
+    enum { isArith       = isIntegral || isFloat };\r
+    enum { isVec         = detail::IsVec<UnqualifiedType>::value  };\r
+    \r
+    typedef typename detail::Select<IsSimpleParameter<UnqualifiedType>::value, T, typename detail::AddParameterType<T>::type>::type ParameterType;\r
+};\r
  \r
-        enum { isUnsignedInt = detail::IsUnsignedIntegral<UnqualifiedType>::value };\r
-        enum { isSignedInt   = detail::IsSignedIntergral<UnqualifiedType>::value };\r
-        enum { isIntegral    = detail::IsIntegral<UnqualifiedType>::value };\r
-        enum { isFloat       = detail::IsFloat<UnqualifiedType>::value  };\r
-        enum { isArith       = isIntegral || isFloat };\r
-        enum { isVec         = detail::IsVec<UnqualifiedType>::value  };\r
-        \r
-        typedef typename detail::Select<IsSimpleParameter<UnqualifiedType>::value, T, typename detail::AddParameterType<T>::type>::type ParameterType;\r
-    };\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_TYPE_TRAITS_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp

index aaa8052..0acdd78 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -48,152 +48,168 @@
  #include "datamov_utils.hpp"\r
  #include "detail/utility_detail.hpp"\r
  \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
  #define OPENCV_GPU_LOG_WARP_SIZE           (5)\r
  #define OPENCV_GPU_WARP_SIZE           (1 << OPENCV_GPU_LOG_WARP_SIZE)\r
  #define OPENCV_GPU_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla\r
  #define OPENCV_GPU_MEM_BANKS            (1 << OPENCV_GPU_LOG_MEM_BANKS)\r
  \r
-namespace cv {  namespace gpu { namespace device\r
+///////////////////////////////////////////////////////////////////////////////\r
+// swap\r
+\r
+template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b) \r
  {\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // swap\r
+    const T temp = a;\r
+    a = b;\r
+    b = temp;\r
+}\r
  \r
-    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b) \r
-    {\r
-        const T temp = a;\r
-        a = b;\r
-        b = temp;\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Mask Reader\r
+\r
+struct SingleMask\r
+{\r
+    explicit __host__ __device__ __forceinline__ SingleMask(const PtrStepb& mask_) : mask(mask_) {}\r
+    \r
+    __device__ __forceinline__ bool operator()(int y, int x) const\r
+    {            \r
+        return mask.ptr(y)[x] != 0;\r
      }\r
  \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Mask Reader\r
+    const PtrStepb mask;\r
+};\r
+\r
+struct MaskCollection\r
+{\r
+    explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_) : maskCollection(maskCollection_) {}\r
  \r
-    struct SingleMask\r
+    __device__ __forceinline__ void next()\r
+    {\r
+        curMask = *maskCollection++;\r
+    }\r
+    __device__ __forceinline__ void setMask(int z)\r
      {\r
-        explicit __host__ __device__ __forceinline__ SingleMask(const PtrStepb& mask_) : mask(mask_) {}\r
-        \r
-        __device__ __forceinline__ bool operator()(int y, int x) const\r
-        {            \r
-            return mask.ptr(y)[x] != 0;\r
-        }\r
+        curMask = maskCollection[z];\r
+    }\r
+    \r
+    __device__ __forceinline__ bool operator()(int y, int x) const\r
+    {\r
+        uchar val;\r
+        return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));\r
+    }\r
  \r
-        const PtrStepb mask;\r
-    };\r
+    const PtrStepb* maskCollection;\r
+    PtrStepb curMask;\r
+};\r
  \r
-    struct MaskCollection\r
+struct WithOutMask\r
+{\r
+    __device__ __forceinline__ void next() const\r
      {\r
-        explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_) : maskCollection(maskCollection_) {}\r
-\r
-        __device__ __forceinline__ void next()\r
-        {\r
-            curMask = *maskCollection++;\r
-        }\r
-        __device__ __forceinline__ void setMask(int z)\r
-        {\r
-            curMask = maskCollection[z];\r
-        }\r
-        \r
-        __device__ __forceinline__ bool operator()(int y, int x) const\r
-        {\r
-            uchar val;\r
-            return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));\r
-        }\r
-\r
-        const PtrStepb* maskCollection;\r
-        PtrStepb curMask;\r
-    };\r
-\r
-    struct WithOutMask\r
+    }\r
+    __device__ __forceinline__ void setMask(int) const\r
      {\r
-        __device__ __forceinline__ void next() const\r
-        {\r
-        }\r
-        __device__ __forceinline__ void setMask(int) const\r
-        {\r
-        }\r
-\r
-        __device__ __forceinline__ bool operator()(int, int) const\r
-        {\r
-            return true;\r
-        }\r
-    };\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Reduction\r
-\r
-    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+    }\r
+\r
+    __device__ __forceinline__ bool operator()(int, int) const\r
      {\r
-        StaticAssert<n >= 8 && n <= 512>::check();\r
-        detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);\r
+        return true;\r
      }\r
  \r
-    template <int n, typename T, typename V, typename Pred> \r
-    __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)\r
+    __device__ __forceinline__ bool operator()(int, int, int) const\r
      {\r
-        StaticAssert<n >= 8 && n <= 512>::check();\r
-        detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);\r
+        return true;\r
      }\r
  \r
-    template <int n, typename T, typename V1, typename V2, typename Pred> \r
-    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)\r
+    static __device__ __forceinline__ bool check(int, int)\r
      {\r
-        StaticAssert<n >= 8 && n <= 512>::check();\r
-        detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
+        return true;\r
      }\r
-    \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Solve linear system\r
  \r
-    // solve 2x2 linear system Ax=b\r
-    template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])\r
+    static __device__ __forceinline__ bool check(int, int, int)\r
      {\r
-        T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];\r
+        return true;\r
+    }\r
+};\r
  \r
-        if (det != 0)\r
-        {\r
-            double invdet = 1.0 / det;\r
+///////////////////////////////////////////////////////////////////////////////\r
+// Reduction\r
  \r
-            x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));\r
+template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)\r
+{\r
+    StaticAssert<n >= 8 && n <= 512>::check();\r
+    detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);\r
+}\r
  \r
-            x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));\r
+template <int n, typename T, typename V, typename Pred> \r
+__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)\r
+{\r
+    StaticAssert<n >= 8 && n <= 512>::check();\r
+    detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);\r
+}\r
  \r
-            return true;\r
-        }\r
+template <int n, typename T, typename V1, typename V2, typename Pred> \r
+__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)\r
+{\r
+    StaticAssert<n >= 8 && n <= 512>::check();\r
+    detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);\r
+}\r
+    \r
+///////////////////////////////////////////////////////////////////////////////\r
+// Solve linear system\r
  \r
-        return false;\r
-    }\r
+// solve 2x2 linear system Ax=b\r
+template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])\r
+{\r
+    T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];\r
  \r
-    // solve 3x3 linear system Ax=b\r
-    template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])\r
+    if (det != 0)\r
      {\r
-        T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])\r
-              - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])\r
-              + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);\r
+        double invdet = 1.0 / det;\r
  \r
-        if (det != 0)\r
-        {\r
-            double invdet = 1.0 / det;\r
+        x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));\r
  \r
-            x[0] = saturate_cast<T>(invdet * \r
-                (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -\r
-                 A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +\r
-                 A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));\r
+        x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));\r
  \r
-            x[1] = saturate_cast<T>(invdet * \r
-                (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -\r
-                 b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +\r
-                 A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));\r
+        return true;\r
+    }\r
  \r
-            x[2] = saturate_cast<T>(invdet * \r
-                (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -\r
-                 A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +\r
-                 b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));\r
+    return false;\r
+}\r
  \r
-            return true;\r
-        }\r
+// solve 3x3 linear system Ax=b\r
+template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])\r
+{\r
+    T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])\r
+          - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])\r
+          + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);\r
+\r
+    if (det != 0)\r
+    {\r
+        double invdet = 1.0 / det;\r
+\r
+        x[0] = saturate_cast<T>(invdet * \r
+            (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -\r
+             A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +\r
+             A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));\r
  \r
-        return false;\r
+        x[1] = saturate_cast<T>(invdet * \r
+            (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -\r
+             b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +\r
+             A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));\r
+\r
+        x[2] = saturate_cast<T>(invdet * \r
+            (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -\r
+             A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +\r
+             b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));\r
+\r
+        return true;\r
      }\r
-}}}\r
+\r
+    return false;\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_UTILITY_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp

index 5831ae5..3b84fa5 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
@@ -48,179 +48,179 @@
  #include "functional.hpp"\r
  #include "detail/vec_distance_detail.hpp"\r
  \r
-namespace cv {  namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+template <typename T> struct L1Dist\r
  {\r
+    typedef int value_type;\r
+    typedef int result_type;\r
+\r
+    __device__ __forceinline__ L1Dist() : mySum(0) {}\r
  \r
-    template <typename T> struct L1Dist\r
+    __device__ __forceinline__ void reduceIter(int val1, int val2)\r
      {\r
-        typedef int value_type;\r
-        typedef int result_type;\r
+        mySum = __sad(val1, val2, mySum);\r
+    }\r
  \r
-        __device__ __forceinline__ L1Dist() : mySum(0) {}\r
+    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)\r
+    {\r
+        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());\r
+    }\r
  \r
-        __device__ __forceinline__ void reduceIter(int val1, int val2)\r
-        {\r
-            mySum = __sad(val1, val2, mySum);\r
-        }\r
+    __device__ __forceinline__ operator int() const\r
+    {\r
+        return mySum;\r
+    }\r
  \r
-        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)\r
-        {\r
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());\r
-        }\r
+    int mySum;\r
+};\r
+template <> struct L1Dist<float>\r
+{\r
+    typedef float value_type;\r
+    typedef float result_type;\r
  \r
-        __device__ __forceinline__ operator int() const\r
-        {\r
-            return mySum;\r
-        }\r
+    __device__ __forceinline__ L1Dist() : mySum(0.0f) {}\r
  \r
-        int mySum;\r
-    };\r
-    template <> struct L1Dist<float>\r
+    __device__ __forceinline__ void reduceIter(float val1, float val2)\r
      {\r
-        typedef float value_type;\r
-        typedef float result_type;\r
+        mySum += ::fabs(val1 - val2);\r
+    }\r
  \r
-        __device__ __forceinline__ L1Dist() : mySum(0.0f) {}\r
+    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)\r
+    {\r
+        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());\r
+    }\r
  \r
-        __device__ __forceinline__ void reduceIter(float val1, float val2)\r
-        {\r
-            mySum += ::fabs(val1 - val2);\r
-        }\r
+    __device__ __forceinline__ operator float() const\r
+    {\r
+        return mySum;\r
+    }\r
  \r
-        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)\r
-        {\r
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());\r
-        }\r
+    float mySum;\r
+};\r
  \r
-        __device__ __forceinline__ operator float() const\r
-        {\r
-            return mySum;\r
-        }\r
+struct L2Dist\r
+{\r
+    typedef float value_type;\r
+    typedef float result_type;\r
  \r
-        float mySum;\r
-    };\r
+    __device__ __forceinline__ L2Dist() : mySum(0.0f) {}\r
  \r
-    struct L2Dist\r
+    __device__ __forceinline__ void reduceIter(float val1, float val2)\r
      {\r
-        typedef float value_type;\r
-        typedef float result_type;\r
+        float reg = val1 - val2;\r
+        mySum += reg * reg;\r
+    }\r
  \r
-        __device__ __forceinline__ L2Dist() : mySum(0.0f) {}\r
+    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)\r
+    {\r
+        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());\r
+    }\r
  \r
-        __device__ __forceinline__ void reduceIter(float val1, float val2)\r
-        {\r
-            float reg = val1 - val2;\r
-            mySum += reg * reg;\r
-        }\r
+    __device__ __forceinline__ operator float() const\r
+    {\r
+        return sqrtf(mySum);\r
+    }\r
  \r
-        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)\r
-        {\r
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());\r
-        }\r
+    float mySum;\r
+};\r
  \r
-        __device__ __forceinline__ operator float() const\r
-        {\r
-            return sqrtf(mySum);\r
-        }\r
+struct HammingDist\r
+{\r
+    typedef int value_type;\r
+    typedef int result_type;\r
  \r
-        float mySum;\r
-    };\r
+    __device__ __forceinline__ HammingDist() : mySum(0) {}\r
  \r
-    struct HammingDist\r
+    __device__ __forceinline__ void reduceIter(int val1, int val2)\r
      {\r
-        typedef int value_type;\r
-        typedef int result_type;\r
-\r
-        __device__ __forceinline__ HammingDist() : mySum(0) {}\r
-\r
-        __device__ __forceinline__ void reduceIter(int val1, int val2)\r
-        {\r
-            mySum += __popc(val1 ^ val2);\r
-        }\r
+        mySum += __popc(val1 ^ val2);\r
+    }\r
  \r
-        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)\r
-        {\r
-            reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());\r
-        }\r
+    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)\r
+    {\r
+        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());\r
+    }\r
  \r
-        __device__ __forceinline__ operator int() const\r
-        {\r
-            return mySum;\r
-        }\r
+    __device__ __forceinline__ operator int() const\r
+    {\r
+        return mySum;\r
+    }\r
  \r
-        int mySum;\r
-    };\r
+    int mySum;\r
+};\r
  \r
-    // calc distance between two vectors in global memory\r
-    template <int THREAD_DIM, typename Dist, typename T1, typename T2> \r
-    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
+// calc distance between two vectors in global memory\r
+template <int THREAD_DIM, typename Dist, typename T1, typename T2> \r
+__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
+{\r
+    for (int i = tid; i < len; i += THREAD_DIM)\r
      {\r
-        for (int i = tid; i < len; i += THREAD_DIM)\r
-        {\r
-            T1 val1;\r
-            ForceGlob<T1>::Load(vec1, i, val1);\r
+        T1 val1;\r
+        ForceGlob<T1>::Load(vec1, i, val1);\r
  \r
-            T2 val2;\r
-            ForceGlob<T2>::Load(vec2, i, val2);\r
+        T2 val2;\r
+        ForceGlob<T2>::Load(vec2, i, val2);\r
  \r
-            dist.reduceIter(val1, val2);\r
-        }\r
-\r
-        dist.reduceAll<THREAD_DIM>(smem, tid);\r
+        dist.reduceIter(val1, val2);\r
      }\r
  \r
-    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory\r
-    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>\r
-    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
-    {        \r
-        detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);\r
-        \r
-        dist.reduceAll<THREAD_DIM>(smem, tid);\r
-    }\r
+    dist.reduceAll<THREAD_DIM>(smem, tid);\r
+}\r
  \r
-    // calc distance between two vectors in global memory\r
-    template <int THREAD_DIM, typename T1> struct VecDiffGlobal\r
+// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory\r
+template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>\r
+__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)\r
+{        \r
+    detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);\r
+    \r
+    dist.reduceAll<THREAD_DIM>(smem, tid);\r
+}\r
+\r
+// calc distance between two vectors in global memory\r
+template <int THREAD_DIM, typename T1> struct VecDiffGlobal\r
+{\r
+    explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)\r
      {\r
-        explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)\r
-        {\r
-            vec1 = vec1_;\r
-        }\r
+        vec1 = vec1_;\r
+    }\r
  \r
-        template <typename T2, typename Dist>\r
-        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const\r
-        {\r
-            calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);\r
-        }\r
+    template <typename T2, typename Dist>\r
+    __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const\r
+    {\r
+        calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);\r
+    }\r
  \r
-        const T1* vec1;\r
-    };\r
+    const T1* vec1;\r
+};\r
  \r
-    // calc distance between two vectors, first vector is cached in register memory, second vector is in global memory\r
-    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister\r
+// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory\r
+template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister\r
+{\r
+    template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)\r
      {\r
-        template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)\r
-        {\r
-            if (glob_tid < len)\r
-                smem[glob_tid] = vec1[glob_tid];\r
-            __syncthreads();\r
+        if (glob_tid < len)\r
+            smem[glob_tid] = vec1[glob_tid];\r
+        __syncthreads();\r
+\r
+        U* vec1ValsPtr = vec1Vals;\r
  \r
-            U* vec1ValsPtr = vec1Vals;\r
+        #pragma unroll\r
+        for (int i = tid; i < MAX_LEN; i += THREAD_DIM)\r
+            *vec1ValsPtr++ = smem[i];\r
  \r
-            #pragma unroll\r
-            for (int i = tid; i < MAX_LEN; i += THREAD_DIM)\r
-                *vec1ValsPtr++ = smem[i];\r
+        __syncthreads();\r
+    }\r
  \r
-            __syncthreads();\r
-        }\r
+    template <typename T2, typename Dist>\r
+    __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const\r
+    {\r
+        calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);\r
+    }\r
  \r
-        template <typename T2, typename Dist>\r
-        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const\r
-        {\r
-            calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);\r
-        }\r
+    U vec1Vals[MAX_LEN / THREAD_DIM];\r
+};\r
  \r
-        U vec1Vals[MAX_LEN / THREAD_DIM];\r
-    };\r
-}}}\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_VEC_DISTANCE_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp

index 48aa62f..8fda041 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
@@ -48,85 +48,85 @@
  #include "vec_traits.hpp"\r
  #include "functional.hpp"\r
  \r
-namespace cv {  namespace gpu { namespace device\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace detail\r
  {\r
-    namespace detail\r
+    template <int cn, typename VecD> struct SatCastHelper;\r
+    template <typename VecD> struct SatCastHelper<1, VecD>\r
      {\r
-        template <int cn, typename VecD> struct SatCastHelper;\r
-        template <typename VecD> struct SatCastHelper<1, VecD>\r
-        {\r
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
-            {\r
-                typedef typename VecTraits<VecD>::elem_type D;\r
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x));\r
-            }\r
-        };\r
-        template <typename VecD> struct SatCastHelper<2, VecD>\r
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
          {\r
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
-            {\r
-                typedef typename VecTraits<VecD>::elem_type D;\r
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));\r
-            }\r
-        };\r
-        template <typename VecD> struct SatCastHelper<3, VecD>\r
+            typedef typename VecTraits<VecD>::elem_type D;\r
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x));\r
+        }\r
+    };\r
+    template <typename VecD> struct SatCastHelper<2, VecD>\r
+    {\r
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
          {\r
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
-            {\r
-                typedef typename VecTraits<VecD>::elem_type D;\r
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));\r
-            }\r
-        };\r
-        template <typename VecD> struct SatCastHelper<4, VecD>\r
+            typedef typename VecTraits<VecD>::elem_type D;\r
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));\r
+        }\r
+    };\r
+    template <typename VecD> struct SatCastHelper<3, VecD>\r
+    {\r
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
          {\r
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
-            {\r
-                typedef typename VecTraits<VecD>::elem_type D;\r
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));\r
-            }\r
-        };\r
-\r
-        template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)\r
+            typedef typename VecTraits<VecD>::elem_type D;\r
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));\r
+        }\r
+    };\r
+    template <typename VecD> struct SatCastHelper<4, VecD>\r
+    {\r
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)\r
          {\r
-            return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);\r
+            typedef typename VecTraits<VecD>::elem_type D;\r
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));\r
          }\r
+    };\r
+\r
+    template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)\r
+    {\r
+        return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);\r
      }\r
+}\r
  \r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
  \r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
  \r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
  \r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
+template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \\r
      __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \\r
@@ -150,49 +150,49 @@ namespace cv {  namespace gpu { namespace device
          return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \\r
      }\r
  \r
-    namespace detail\r
-    {    \r
-        template <typename T1, typename T2> struct BinOpTraits\r
-        {\r
-            typedef int argument_type;\r
-        };\r
-        template <typename T> struct BinOpTraits<T, T>\r
-        {\r
-            typedef T argument_type;\r
-        };\r
-        template <typename T> struct BinOpTraits<T, double>\r
-        {\r
-            typedef double argument_type;\r
-        };\r
-        template <typename T> struct BinOpTraits<double, T>\r
-        {\r
-            typedef double argument_type;\r
-        };\r
-        template <> struct BinOpTraits<double, double>\r
-        {\r
-            typedef double argument_type;\r
-        };\r
-        template <typename T> struct BinOpTraits<T, float>\r
-        {\r
-            typedef float argument_type;\r
-        };\r
-        template <typename T> struct BinOpTraits<float, T>\r
-        {\r
-            typedef float argument_type;\r
-        };\r
-        template <> struct BinOpTraits<float, float>\r
-        {\r
-            typedef float argument_type;\r
-        };\r
-        template <> struct BinOpTraits<double, float>\r
-        {\r
-            typedef double argument_type;\r
-        };\r
-        template <> struct BinOpTraits<float, double>\r
-        {\r
-            typedef double argument_type;\r
-        };\r
-    }\r
+namespace detail\r
+{    \r
+    template <typename T1, typename T2> struct BinOpTraits\r
+    {\r
+        typedef int argument_type;\r
+    };\r
+    template <typename T> struct BinOpTraits<T, T>\r
+    {\r
+        typedef T argument_type;\r
+    };\r
+    template <typename T> struct BinOpTraits<T, double>\r
+    {\r
+        typedef double argument_type;\r
+    };\r
+    template <typename T> struct BinOpTraits<double, T>\r
+    {\r
+        typedef double argument_type;\r
+    };\r
+    template <> struct BinOpTraits<double, double>\r
+    {\r
+        typedef double argument_type;\r
+    };\r
+    template <typename T> struct BinOpTraits<T, float>\r
+    {\r
+        typedef float argument_type;\r
+    };\r
+    template <typename T> struct BinOpTraits<float, T>\r
+    {\r
+        typedef float argument_type;\r
+    };\r
+    template <> struct BinOpTraits<float, float>\r
+    {\r
+        typedef float argument_type;\r
+    };\r
+    template <> struct BinOpTraits<double, float>\r
+    {\r
+        typedef double argument_type;\r
+    };\r
+    template <> struct BinOpTraits<float, double>\r
+    {\r
+        typedef double argument_type;\r
+    };\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \\r
      __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \\r
@@ -313,19 +313,20 @@ namespace cv {  namespace gpu { namespace device
      OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \\r
      OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)\r
  \r
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)\r
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)\r
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)\r
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)\r
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)\r
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)\r
-    OPENCV_GPU_IMPLEMENT_VEC_OP(float)\r
-    OPENCV_GPU_IMPLEMENT_VEC_OP(double)\r
+OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)\r
+OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)\r
+OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)\r
+OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)\r
+OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)\r
+OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)\r
+OPENCV_GPU_IMPLEMENT_VEC_OP(float)\r
+OPENCV_GPU_IMPLEMENT_VEC_OP(double)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_VEC_UNOP\r
  #undef OPENCV_GPU_IMPLEMENT_VEC_BINOP\r
  #undef OPENCV_GPU_IMPLEMENT_VEC_OP\r
  #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP\r
-}}}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
          \r
  #endif // __OPENCV_GPU_VECMATH_HPP__
 \ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp

index 52bba13..979da2a 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
@@ -45,82 +45,82 @@
  \r
  #include "internal_shared.hpp"\r
  \r
-namespace cv { namespace gpu { namespace device\r
-{\r
-    template<typename T, int N> struct TypeVec;\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
  \r
-    struct __align__(8) uchar8\r
-    {\r
-        uchar a0, a1, a2, a3, a4, a5, a6, a7;\r
-    };\r
-    static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)\r
-    {\r
-        uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
-        return val;\r
-    }\r
-    struct __align__(8) char8\r
-    {\r
-        schar a0, a1, a2, a3, a4, a5, a6, a7;\r
-    };\r
-    static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)\r
-    {\r
-        char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
-        return val;\r
-    }\r
-    struct __align__(16) ushort8\r
-    {\r
-        ushort a0, a1, a2, a3, a4, a5, a6, a7;\r
-    };\r
-    static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)\r
-    {\r
-        ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
-        return val;\r
-    }\r
-    struct __align__(16) short8\r
-    {\r
-        short a0, a1, a2, a3, a4, a5, a6, a7;\r
-    };\r
-    static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)\r
-    {\r
-        short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
-        return val;\r
-    }\r
-    struct __align__(32) uint8\r
-    {\r
-        uint a0, a1, a2, a3, a4, a5, a6, a7;\r
-    };\r
-    static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)\r
-    {\r
-        uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
-        return val;\r
-    }\r
-    struct __align__(32) int8\r
-    {\r
-        int a0, a1, a2, a3, a4, a5, a6, a7;\r
-    };\r
-    static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)\r
-    {\r
-        int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
-        return val;\r
-    }\r
-    struct __align__(32) float8\r
-    {\r
-        float a0, a1, a2, a3, a4, a5, a6, a7;\r
-    };\r
-    static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)\r
-    {\r
-        float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
-        return val;\r
-    }\r
-    struct double8\r
-    {\r
-        double a0, a1, a2, a3, a4, a5, a6, a7;\r
-    };\r
-    static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)\r
-    {\r
-        double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
-        return val;\r
-    }\r
+template<typename T, int N> struct TypeVec;\r
+\r
+struct __align__(8) uchar8\r
+{\r
+    uchar a0, a1, a2, a3, a4, a5, a6, a7;\r
+};\r
+static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)\r
+{\r
+    uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+    return val;\r
+}\r
+struct __align__(8) char8\r
+{\r
+    schar a0, a1, a2, a3, a4, a5, a6, a7;\r
+};\r
+static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)\r
+{\r
+    char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+    return val;\r
+}\r
+struct __align__(16) ushort8\r
+{\r
+    ushort a0, a1, a2, a3, a4, a5, a6, a7;\r
+};\r
+static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)\r
+{\r
+    ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+    return val;\r
+}\r
+struct __align__(16) short8\r
+{\r
+    short a0, a1, a2, a3, a4, a5, a6, a7;\r
+};\r
+static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)\r
+{\r
+    short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+    return val;\r
+}\r
+struct __align__(32) uint8\r
+{\r
+    uint a0, a1, a2, a3, a4, a5, a6, a7;\r
+};\r
+static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)\r
+{\r
+    uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+    return val;\r
+}\r
+struct __align__(32) int8\r
+{\r
+    int a0, a1, a2, a3, a4, a5, a6, a7;\r
+};\r
+static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)\r
+{\r
+    int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+    return val;\r
+}\r
+struct __align__(32) float8\r
+{\r
+    float a0, a1, a2, a3, a4, a5, a6, a7;\r
+};\r
+static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)\r
+{\r
+    float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+    return val;\r
+}\r
+struct double8\r
+{\r
+    double a0, a1, a2, a3, a4, a5, a6, a7;\r
+};\r
+static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)\r
+{\r
+    double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};\r
+    return val;\r
+}\r
  \r
  #define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \\r
      template<> struct TypeVec<type, 1> { typedef type vec_type; }; \\r
@@ -134,28 +134,28 @@ namespace cv { namespace gpu { namespace device
      template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \\r
      template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };\r
  \r
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)\r
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)\r
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)\r
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)\r
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)\r
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)\r
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)\r
-    OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)\r
+OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)\r
+OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)\r
+OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)\r
+OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)\r
+OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)\r
+OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)\r
+OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)\r
+OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_TYPE_VEC\r
  \r
-    template<> struct TypeVec<schar, 1> { typedef schar vec_type; };\r
-    template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };\r
-    template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };\r
-    template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };\r
-    template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };\r
+template<> struct TypeVec<schar, 1> { typedef schar vec_type; };\r
+template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };\r
+template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };\r
+template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };\r
+template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };\r
  \r
-    template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };\r
-    template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };\r
-    template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };\r
-    template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };\r
-    template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };\r
+template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };\r
+template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };\r
+template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };\r
+template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };\r
+template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };\r
  \r
      template<typename T> struct VecTraits;\r
  \r
@@ -209,72 +209,73 @@ namespace cv { namespace gpu { namespace device
          static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \\r
      };\r
  \r
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)\r
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)\r
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)\r
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)\r
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)\r
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)\r
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)\r
+OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)\r
+OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)\r
+OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)\r
+OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)\r
+OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)\r
+OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)\r
+OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)\r
  \r
  #undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS\r
  \r
-    template<> struct VecTraits<char> \r
-    { \r
-        typedef char elem_type; \r
-        enum {cn=1}; \r
-        static __device__ __host__ __forceinline__ char all(char v) {return v;}\r
-        static __device__ __host__ __forceinline__ char make(char x) {return x;}\r
-        static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}\r
-    };\r
-    template<> struct VecTraits<schar> \r
-    { \r
-        typedef schar elem_type; \r
-        enum {cn=1}; \r
-        static __device__ __host__ __forceinline__ schar all(schar v) {return v;}\r
-        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}\r
-        static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}\r
-    };\r
-    template<> struct VecTraits<char1>\r
-    {\r
-        typedef schar elem_type;\r
-        enum {cn=1};\r
-        static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}\r
-        static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}\r
-        static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}\r
-    };\r
-    template<> struct VecTraits<char2>\r
-    {\r
-        typedef schar elem_type;\r
-        enum {cn=2};\r
-        static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}\r
-        static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}\r
-        static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}\r
-    };\r
-    template<> struct VecTraits<char3>\r
-    {\r
-        typedef schar elem_type;\r
-        enum {cn=3};\r
-        static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}\r
-        static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}\r
-        static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}\r
-    };\r
-    template<> struct VecTraits<char4>\r
-    {\r
-        typedef schar elem_type;\r
-        enum {cn=4};\r
-        static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}\r
-        static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}\r
-        static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}\r
-    };\r
-    template<> struct VecTraits<char8>\r
-    {\r
-        typedef schar elem_type;\r
-        enum {cn=8};\r
-        static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}\r
-        static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}\r
-        static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}\r
-    };\r
-}}}\r
+template<> struct VecTraits<char> \r
+{ \r
+    typedef char elem_type; \r
+    enum {cn=1}; \r
+    static __device__ __host__ __forceinline__ char all(char v) {return v;}\r
+    static __device__ __host__ __forceinline__ char make(char x) {return x;}\r
+    static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}\r
+};\r
+template<> struct VecTraits<schar> \r
+{ \r
+    typedef schar elem_type; \r
+    enum {cn=1}; \r
+    static __device__ __host__ __forceinline__ schar all(schar v) {return v;}\r
+    static __device__ __host__ __forceinline__ schar make(schar x) {return x;}\r
+    static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}\r
+};\r
+template<> struct VecTraits<char1>\r
+{\r
+    typedef schar elem_type;\r
+    enum {cn=1};\r
+    static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}\r
+    static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}\r
+    static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}\r
+};\r
+template<> struct VecTraits<char2>\r
+{\r
+    typedef schar elem_type;\r
+    enum {cn=2};\r
+    static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}\r
+    static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}\r
+    static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}\r
+};\r
+template<> struct VecTraits<char3>\r
+{\r
+    typedef schar elem_type;\r
+    enum {cn=3};\r
+    static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}\r
+    static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}\r
+    static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}\r
+};\r
+template<> struct VecTraits<char4>\r
+{\r
+    typedef schar elem_type;\r
+    enum {cn=4};\r
+    static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}\r
+    static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}\r
+    static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}\r
+};\r
+template<> struct VecTraits<char8>\r
+{\r
+    typedef schar elem_type;\r
+    enum {cn=8};\r
+    static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}\r
+    static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}\r
+    static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}\r
+};\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  #endif // __OPENCV_GPU_VEC_TRAITS_HPP__\r
diff --git a/modules/gpu/src/opencv2/gpu/device/warp.hpp b/modules/gpu/src/opencv2/gpu/device/warp.hpp

index cd9baf2..9abe6e4 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/warp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/warp.hpp
@@ -40,79 +40,76 @@
  //\r
  //M*/\r
  \r
-#ifndef __OPENCV_GPU_DEVICE_WARP_HPP_\r
-#define __OPENCV_GPU_DEVICE_WARP_HPP_\r
+#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__\r
+#define __OPENCV_GPU_DEVICE_WARP_HPP__\r
  \r
-namespace cv\r
+#include "internal_shared.hpp"\r
+\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+struct Warp\r
  {\r
-    namespace gpu\r
+    enum\r
      {\r
-        namespace device\r
-        {\r
-            struct Warp\r
-            {\r
-                enum\r
-                {\r
-                    LOG_WARP_SIZE = 5,\r
-                    WARP_SIZE     = 1 << LOG_WARP_SIZE,\r
-                    STRIDE        = WARP_SIZE\r
-                };\r
+        LOG_WARP_SIZE = 5,\r
+        WARP_SIZE     = 1 << LOG_WARP_SIZE,\r
+        STRIDE        = WARP_SIZE\r
+    };\r
  \r
-                /** \brief Returns the warp lane ID of the calling thread. */\r
-                static __device__ __forceinline__ unsigned int laneId()\r
-                {\r
-                    unsigned int ret;\r
-                    asm("mov.u32 %0, %laneid;" : "=r"(ret) );\r
-                    return ret;\r
-                }\r
+    /** \brief Returns the warp lane ID of the calling thread. */\r
+    static __device__ __forceinline__ unsigned int laneId()\r
+    {\r
+        unsigned int ret;\r
+        asm("mov.u32 %0, %laneid;" : "=r"(ret) );\r
+        return ret;\r
+    }\r
  \r
-                template<typename It, typename T>\r
-                static __device__ __forceinline__ void fill(It beg, It end, const T& value)\r
-                {                \r
-                    for(It t = beg + laneId(); t < end; t += STRIDE)\r
-                        *t = value;\r
-                }            \r
+    template<typename It, typename T>\r
+    static __device__ __forceinline__ void fill(It beg, It end, const T& value)\r
+    {                \r
+        for(It t = beg + laneId(); t < end; t += STRIDE)\r
+            *t = value;\r
+    }            \r
  \r
-                template<typename InIt, typename OutIt>\r
-                static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)\r
-                {                \r
-                    for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
-                        *out = *t;\r
-                    return out;\r
-                }            \r
+    template<typename InIt, typename OutIt>\r
+    static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)\r
+    {                \r
+        for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
+            *out = *t;\r
+        return out;\r
+    }            \r
  \r
-                template<typename InIt, typename OutIt, class UnOp>\r
-                static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)\r
-                {\r
-                    for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
-                        *out = op(*t);\r
-                    return out;\r
-                }\r
+    template<typename InIt, typename OutIt, class UnOp>\r
+    static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)\r
+    {\r
+        for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)\r
+            *out = op(*t);\r
+        return out;\r
+    }\r
  \r
-                template<typename InIt1, typename InIt2, typename OutIt, class BinOp>\r
-                static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)\r
-                {\r
-                    unsigned int lane = laneId();\r
+    template<typename InIt1, typename InIt2, typename OutIt, class BinOp>\r
+    static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)\r
+    {\r
+        unsigned int lane = laneId();\r
  \r
-                    InIt1 t1 = beg1 + lane; \r
-                    InIt2 t2 = beg2 + lane;\r
-                    for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)\r
-                        *out = op(*t1, *t2);\r
-                    return out;\r
-                }\r
+        InIt1 t1 = beg1 + lane; \r
+        InIt2 t2 = beg2 + lane;\r
+        for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)\r
+            *out = op(*t1, *t2);\r
+        return out;\r
+    }\r
  \r
-                template<typename OutIt, typename T>\r
-                static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)\r
-                {\r
-                    unsigned int lane = laneId();                \r
-                    value += lane;\r
+    template<typename OutIt, typename T>\r
+    static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)\r
+    {\r
+        unsigned int lane = laneId();                \r
+        value += lane;\r
  \r
-                    for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)\r
-                        *t = value;\r
-                }\r
-            };\r
-        }\r
+        for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)\r
+            *t = value;\r
      }\r
-}\r
+};\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
-#endif /* __OPENCV_GPU_DEVICE_WARP_HPP_ */
-\ No newline at end of file
+#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
+\ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/warp_reduce.hpp b/modules/gpu/src/opencv2/gpu/device/warp_reduce.hpp

index 2368676..91602df 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/warp_reduce.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/warp_reduce.hpp
@@ -41,33 +41,32 @@
  //M*/\r
  \r
  \r
-#ifndef OPENCV_GPU_WARP_REDUCE_HPP_\r
-#define OPENCV_GPU_WARP_REDUCE_HPP_\r
+#ifndef OPENCV_GPU_WARP_REDUCE_HPP__\r
+#define OPENCV_GPU_WARP_REDUCE_HPP__\r
  \r
+#include "internal_shared.hpp"\r
  \r
-namespace cv\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+               \r
+template <class T> \r
+__device__ __forceinline__ T warp_reduce ( volatile T *ptr , const unsigned int tid = threadIdx.x )\r
  {\r
-       namespace device\r
-       {               \r
-                template <class T> \r
-        __device__ __forceinline__ T warp_reduce ( volatile T *ptr , const unsigned int tid = threadIdx.x )\r
-        {\r
-            const unsigned int lane = tid & 31; // index of thread in warp (0..31)\r
-                                               \r
-                       if (lane < 16)\r
-                       {                               \r
-                               T partial = ptr[tid];\r
+    const unsigned int lane = tid & 31; // index of thread in warp (0..31)\r
+                               \r
+       if (lane < 16)\r
+       {                               \r
+               T partial = ptr[tid];\r
  \r
-                               ptr[tid] = partial = partial + ptr[tid + 16];\r
-                               ptr[tid] = partial = partial + ptr[tid + 8];\r
-                               ptr[tid] = partial = partial + ptr[tid + 4];\r
-                               ptr[tid] = partial = partial + ptr[tid + 2];\r
-                               ptr[tid] = partial = partial + ptr[tid + 1];            \r
-                       }\r
-                       return ptr[tid - lane];\r
-\r
-        }\r
+               ptr[tid] = partial = partial + ptr[tid + 16];\r
+               ptr[tid] = partial = partial + ptr[tid + 8];\r
+               ptr[tid] = partial = partial + ptr[tid + 4];\r
+               ptr[tid] = partial = partial + ptr[tid + 2];\r
+               ptr[tid] = partial = partial + ptr[tid + 1];            \r
         }\r
+\r
+       return ptr[tid - lane];\r
  }\r
  \r
-#endif /* OPENCV_GPU_WARP_REDUCE_HPP_ */
-\ No newline at end of file
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
+#endif /* OPENCV_GPU_WARP_REDUCE_HPP__ */
+\ No newline at end of file
diff --git a/modules/gpu/src/split_merge.cpp b/modules/gpu/src/split_merge.cpp

index a791862..4d05ee9 100644 (file)
--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
@@ -42,6 +42,8 @@
  \r
  #include "precomp.hpp"\r
  \r
+using namespace cv;\r
+using namespace cv::gpu;\r
  using namespace std;\r
  \r
  #if !defined (HAVE_CUDA)\r
@@ -53,24 +55,24 @@ void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, Stream& /*st
  \r
  #else /* !defined (HAVE_CUDA) */\r
  \r
-namespace cv { namespace gpu { namespace split_merge \r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace split_merge \r
  {    \r
-    extern "C" void merge_caller(const DevMem2Db* src, DevMem2Db& dst, \r
-                                 int total_channels, size_t elem_size, \r
-                                 const cudaStream_t& stream);\r
+    void merge_caller(const DevMem2Db* src, DevMem2Db& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);\r
+    void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);\r
+}\r
  \r
-    extern "C" void split_caller(const DevMem2Db& src, DevMem2Db* dst, \r
-                                 int num_channels, size_t elem_size1, \r
-                                 const cudaStream_t& stream);\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
+namespace\r
+{\r
      void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream) \r
      {\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;\r
+\r
          CV_Assert(src);\r
          CV_Assert(n > 0);\r
-       \r
-        bool double_ok = TargetArchs::builtWith(NATIVE_DOUBLE) && \r
-                         DeviceInfo().supports(NATIVE_DOUBLE);\r
-        CV_Assert(src[0].depth() != CV_64F || double_ok);\r
  \r
          int depth = src[0].depth();\r
          Size size = src[0].size();\r
@@ -100,20 +102,15 @@ namespace cv { namespace gpu { namespace split_merge
                  src_as_devmem[i] = src[i];\r
  \r
              DevMem2Db dst_as_devmem(dst);\r
-            split_merge::merge_caller(src_as_devmem, dst_as_devmem,\r
-                                      total_channels, CV_ELEM_SIZE(depth),\r
-                                      stream);\r
+            merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);\r
          }   \r
      }\r
  \r
-\r
      void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream) \r
      {\r
-        CV_Assert(dst);\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;\r
  \r
-        bool double_ok = TargetArchs::builtWith(NATIVE_DOUBLE) && \r
-                         DeviceInfo().supports(NATIVE_DOUBLE);\r
-        CV_Assert(src.depth() != CV_64F || double_ok);\r
+        CV_Assert(dst);\r
  \r
          int depth = src.depth();\r
          int num_channels = src.channels();\r
@@ -135,38 +132,31 @@ namespace cv { namespace gpu { namespace split_merge
              dst_as_devmem[i] = dst[i];\r
  \r
          DevMem2Db src_as_devmem(src);\r
-        split_merge::split_caller(src_as_devmem, dst_as_devmem,\r
-                                  num_channels, src.elemSize1(), \r
-                                  stream);\r
+        split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);\r
      }\r
-\r
-\r
-}}}\r
-\r
+}\r
  \r
  void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream) \r
  { \r
-    split_merge::merge(src, n, dst, StreamAccessor::getStream(stream));\r
+    ::merge(src, n, dst, StreamAccessor::getStream(stream));\r
  }\r
  \r
  \r
  void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream) \r
  {\r
-    split_merge::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));\r
+    ::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));\r
  }\r
  \r
-\r
  void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream) \r
  {\r
-    split_merge::split(src, dst, StreamAccessor::getStream(stream));\r
+    ::split(src, dst, StreamAccessor::getStream(stream));\r
  }\r
  \r
-\r
  void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream) \r
  {\r
      dst.resize(src.channels());\r
      if(src.channels() > 0)\r
-        split_merge::split(src, &dst[0], StreamAccessor::getStream(stream));\r
+        ::split(src, &dst[0], StreamAccessor::getStream(stream));\r
  }\r
  \r
  #endif /* !defined (HAVE_CUDA) */\r
diff --git a/modules/gpu/src/stereobm.cpp b/modules/gpu/src/stereobm.cpp

index 8ab2d29..974b3dc 100644 (file)
--- a/modules/gpu/src/stereobm.cpp
+++ b/modules/gpu/src/stereobm.cpp
@@ -55,21 +55,23 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&,
  \r
  #else /* !defined (HAVE_CUDA) */\r
  \r
-namespace cv { namespace gpu\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace stereobm\r
  {\r
-    namespace bm\r
-    {\r
-        //extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf);\r
-        extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, cudaStream_t & stream);\r
-        extern "C" void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);\r
-        extern "C" void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);\r
-    }\r
-}}\r
+    void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, cudaStream_t & stream);\r
+    void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);\r
+    void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
  const float defaultAvgTexThreshold = 3;\r
  \r
  cv::gpu::StereoBM_GPU::StereoBM_GPU()\r
-    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold)  {}\r
+    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold)  \r
+{\r
+}\r
  \r
  cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_, int winSize_)\r
      : preset(preset_), ndisp(ndisparities_), winSize(winSize_), avergeTexThreshold(defaultAvgTexThreshold)\r
@@ -93,39 +95,44 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
      return false;\r
  }\r
  \r
-static void stereo_bm_gpu_operator ( GpuMat& minSSD,  GpuMat& leBuf, GpuMat&  riBuf,  int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)\r
+namespace\r
  {\r
-    CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);\r
-    CV_DbgAssert(left.type() == CV_8UC1);\r
-    CV_DbgAssert(right.type() == CV_8UC1);\r
+    void stereo_bm_gpu_operator( GpuMat& minSSD,  GpuMat& leBuf, GpuMat&  riBuf,  int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)\r
+    {\r
+        using namespace OPENCV_DEVICE_NAMESPACE_ stereobm;\r
  \r
-    disparity.create(left.size(), CV_8U);\r
-    minSSD.create(left.size(), CV_32S);\r
+        CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);\r
+        CV_DbgAssert(left.type() == CV_8UC1);\r
+        CV_DbgAssert(right.type() == CV_8UC1);\r
  \r
-    GpuMat le_for_bm =  left;\r
-    GpuMat ri_for_bm = right;\r
+        disparity.create(left.size(), CV_8U);\r
+        minSSD.create(left.size(), CV_32S);\r
  \r
-    if (preset == StereoBM_GPU::PREFILTER_XSOBEL)\r
-    {\r
-        leBuf.create( left.size(),  left.type());\r
-        riBuf.create(right.size(), right.type());\r
+        GpuMat le_for_bm =  left;\r
+        GpuMat ri_for_bm = right;\r
  \r
-               bm::prefilter_xsobel( left, leBuf, 31, stream);\r
-        bm::prefilter_xsobel(right, riBuf, 31, stream);\r
+        if (preset == StereoBM_GPU::PREFILTER_XSOBEL)\r
+        {\r
+            leBuf.create( left.size(),  left.type());\r
+            riBuf.create(right.size(), right.type());\r
  \r
-        le_for_bm = leBuf;\r
-        ri_for_bm = riBuf;\r
-    }\r
+                   prefilter_xsobel( left, leBuf, 31, stream);\r
+            prefilter_xsobel(right, riBuf, 31, stream);\r
+\r
+            le_for_bm = leBuf;\r
+            ri_for_bm = riBuf;\r
+        }\r
  \r
-    bm::stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD, stream);\r
+        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD, stream);\r
  \r
-    if (avergeTexThreshold)\r
-        bm::postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);\r
+        if (avergeTexThreshold)\r
+            postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);\r
+    }\r
  }\r
  \r
  void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream)\r
  {\r
-    ::stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));\r
+    stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));\r
  }\r
  \r
  #endif /* !defined (HAVE_CUDA) */\r
diff --git a/modules/gpu/src/stereobp.cpp b/modules/gpu/src/stereobp.cpp

index 288e2f8..7cc960a 100644 (file)
--- a/modules/gpu/src/stereobp.cpp
+++ b/modules/gpu/src/stereobp.cpp
@@ -59,7 +59,9 @@ void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream
  \r
  #else /* !defined (HAVE_CUDA) */\r
  \r
-namespace cv { namespace gpu { namespace bp\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace stereobp\r
  {\r
      void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);\r
      template<typename T, typename D>\r
@@ -74,7 +76,11 @@ namespace cv { namespace gpu { namespace bp
      template <typename T>\r
      void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, \r
          const DevMem2D_<short>& disp, cudaStream_t stream);\r
-}}}\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
+using namespace OPENCV_DEVICE_NAMESPACE_ stereobp;\r
  \r
  namespace\r
  {\r
@@ -84,7 +90,6 @@ namespace
      const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;\r
  }\r
  \r
-\r
  void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)\r
  {\r
      ndisp = width / 4;\r
@@ -136,8 +141,8 @@ namespace
              typedef void (*comp_data_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);\r
              static const comp_data_t comp_data_callers[2][5] = \r
              {\r
-                {0, bp::comp_data_gpu<unsigned char, short>, 0, bp::comp_data_gpu<uchar3, short>, bp::comp_data_gpu<uchar4, short>},\r
-                {0, bp::comp_data_gpu<unsigned char, float>, 0, bp::comp_data_gpu<uchar3, float>, bp::comp_data_gpu<uchar4, float>}\r
+                {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},\r
+                {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}\r
              };\r
  \r
              CV_Assert(left.size() == right.size() && left.type() == right.type());\r
@@ -236,7 +241,7 @@ namespace
                  }\r
              }\r
  \r
-            bp::load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump);\r
+            load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump);\r
  \r
              datas.resize(rthis.levels);\r
  \r
@@ -249,8 +254,6 @@ namespace
  \r
          void calcBP(GpuMat& disp, Stream& stream)\r
          {\r
-            using namespace cv::gpu::bp;\r
-\r
              typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);\r
              static const data_step_down_t data_step_down_callers[2] = \r
              {\r
@@ -354,13 +357,13 @@ namespace
  \r
  void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)\r
  {\r
-    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);\r
+    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);\r
      impl(left, right, disp, stream);\r
  }\r
  \r
  void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream)\r
  {\r
-    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);\r
+    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);\r
      impl(data, disp, stream);\r
  }\r
  \r
diff --git a/modules/gpu/src/stereocsbp.cpp b/modules/gpu/src/stereocsbp.cpp

index 85faaf8..a0c8a7f 100644 (file)
--- a/modules/gpu/src/stereocsbp.cpp
+++ b/modules/gpu/src/stereocsbp.cpp
@@ -57,7 +57,9 @@ void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, Gp
  \r
  #else /* !defined (HAVE_CUDA) */\r
  \r
-namespace cv { namespace gpu { namespace csbp\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace stereocsbp\r
  {\r
      void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,\r
          const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp);\r
@@ -84,8 +86,11 @@ namespace cv { namespace gpu { namespace csbp
      template<class T> \r
      void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,\r
          const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);\r
+}\r
+\r
+END_OPENCV_DEVICE_NAMESPACE\r
  \r
-}}}\r
+using namespace OPENCV_DEVICE_NAMESPACE_ stereocsbp;\r
  \r
  namespace\r
  {\r
@@ -208,8 +213,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
      ////////////////////////////////////////////////////////////////////////////\r
      // Compute\r
  \r
-    csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,\r
-        rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);\r
+    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);\r
  \r
      if (stream)\r
      {\r
@@ -248,28 +252,28 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
      {\r
          if (i == levels - 1)\r
          {\r
-            csbp::init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),\r
+            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),\r
                  step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);\r
          }\r
          else\r
          {\r
-            csbp::compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],\r
+            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],\r
                  left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);\r
  \r
              int new_idx = (cur_idx + 1) & 1;\r
  \r
-            csbp::init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),\r
-                               u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),\r
-                               disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),\r
-                               data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],\r
-                               cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);\r
+            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),\r
+                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),\r
+                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),\r
+                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],\r
+                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);\r
  \r
              cur_idx = new_idx;\r
          }\r
  \r
-        csbp::calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),\r
-                                  data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],\r
-                                  rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);\r
+        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),\r
+                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],\r
+                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);\r
      }\r
  \r
      if (disp.empty())\r
@@ -282,8 +286,8 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
      else\r
          out.setTo(zero);\r
  \r
-    csbp::compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),\r
-                       data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream);\r
+    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),\r
+                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream);\r
  \r
      if (disp.type() != CV_16S)\r
      {\r
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp

index dba9696..3512844 100644 (file)
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -63,8 +63,17 @@ void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }
  \r
  #else /* !defined (HAVE_CUDA) */\r
  \r
-namespace cv { namespace gpu { namespace surf\r
+BEGIN_OPENCV_DEVICE_NAMESPACE\r
+\r
+namespace surf\r
  {\r
+    void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);\r
+    void loadOctaveConstants(int octave, int layer_rows, int layer_cols);\r
+\r
+    void bindImgTex(DevMem2Db img);\r
+    void bindSumTex(DevMem2D_<uint> sum);\r
+    void bindMaskSumTex(DevMem2D_<uint> maskSum);\r
+\r
      void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);\r
  \r
      void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,\r
@@ -78,9 +87,11 @@ namespace cv { namespace gpu { namespace surf
  \r
      void compute_descriptors_gpu(const DevMem2Df& descriptors, \r
          const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);\r
-}}}\r
+}\r
  \r
-using namespace cv::gpu::surf;\r
+END_OPENCV_DEVICE_NAMESPACE\r
+\r
+using namespace OPENCV_DEVICE_NAMESPACE_ surf;\r
  \r
  namespace\r
  {\r
@@ -136,24 +147,18 @@ namespace
              counters.create(1, nOctaves + 1, CV_32SC1);\r
              counters.setTo(Scalar::all(0));\r
  \r
-            uploadConstant("cv::gpu::surf::c_max_candidates",    maxCandidates);\r
-            uploadConstant("cv::gpu::surf::c_max_features",      maxFeatures);\r
-            uploadConstant("cv::gpu::surf::c_img_rows",          img_rows);\r
-            uploadConstant("cv::gpu::surf::c_img_cols",          img_cols);\r
-            uploadConstant("cv::gpu::surf::c_nOctaveLayers",     nOctaveLayers);\r
-            uploadConstant("cv::gpu::surf::c_hessianThreshold",  static_cast<float>(hessianThreshold));\r
+            loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, nOctaveLayers, static_cast<float>(hessianThreshold));\r
  \r
-            imgTex.bind("cv::gpu::surf::imgTex", (DevMem2Db)img);\r
+            bindImgTex(img);\r
  \r
              integralBuffered(img, sum, intBuffer);\r
-            sumTex.bind("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);\r
+            bindSumTex(sum);\r
  \r
              if (use_mask)\r
              {\r
                  min(mask, 1.0, mask1);\r
                  integralBuffered(mask1, maskSum, intBuffer);\r
-\r
-                maskSumTex.bind("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);\r
+                bindMaskSumTex(maskSum);\r
              }\r
          }\r
  \r
@@ -171,9 +176,7 @@ namespace
                  const int layer_rows = img_rows >> octave;\r
                  const int layer_cols = img_cols >> octave;\r
  \r
-                uploadConstant("cv::gpu::surf::c_octave",     octave);\r
-                uploadConstant("cv::gpu::surf::c_layer_rows", layer_rows);\r
-                uploadConstant("cv::gpu::surf::c_layer_cols", layer_cols);\r
+                loadOctaveConstants(octave, layer_rows, layer_cols);\r
  \r
                  icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);\r
  \r
@@ -242,8 +245,6 @@ namespace
          int maxFeatures;\r
  \r
          GpuMat counters;\r
-\r
-        TextureBinder imgTex, sumTex, maskSumTex;\r
      };\r
  }\r
  \r
@@ -336,7 +337,7 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
      {\r
          CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == SF_FEATURE_STRIDE);\r
          \r
-        Mat keypointsCPU = keypointsGPU;\r
+        Mat keypointsCPU(keypointsGPU);\r
          \r
          keypoints.resize(nFeatures);\r
  \r
diff --git a/modules/gpu/test/test_filters.cpp b/modules/gpu/test/test_filters.cpp

index 7392daf..e9b2ef8 100644 (file)
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@@ -549,8 +549,8 @@ TEST_P(MorphEx, Accuracy)
          cv::gpu::GpuMat dev_dst_rgba;\r
          cv::gpu::GpuMat dev_dst_gray;\r
  \r
-        cv::gpu::morphologyEx(cv::gpu::GpuMat(img_rgba), dev_dst_rgba, morphOps[morphOpsIdx], cv::gpu::GpuMat(kernel));\r
-        cv::gpu::morphologyEx(cv::gpu::GpuMat(img_gray), dev_dst_gray, morphOps[morphOpsIdx], cv::gpu::GpuMat(kernel));\r
+        cv::gpu::morphologyEx(cv::gpu::GpuMat(img_rgba), dev_dst_rgba, morphOps[morphOpsIdx], kernel);\r
+        cv::gpu::morphologyEx(cv::gpu::GpuMat(img_gray), dev_dst_gray, morphOps[morphOpsIdx], kernel);\r
  \r
          dev_dst_rgba.download(dst_rgba);\r
          dev_dst_gray.download(dst_gray);\r
diff --git a/modules/gpu/test/test_hog.cpp b/modules/gpu/test/test_hog.cpp

index 548e3e5..c8afa05 100644 (file)
--- a/modules/gpu/test/test_hog.cpp
+++ b/modules/gpu/test/test_hog.cpp
@@ -137,7 +137,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
  #ifdef DUMP\r
          dump(block_hists, locations);\r
  #else\r
-        compare(block_hists, locations);\r
+        compare(cv::Mat(block_hists), locations);\r
  #endif\r
  \r
          // Test detect on smaller image\r
@@ -148,7 +148,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
  #ifdef DUMP\r
          dump(block_hists, locations);\r
  #else\r
-        compare(block_hists, locations);\r
+        compare(cv::Mat(block_hists), locations);\r
  #endif\r
  \r
          // Test detect on greater image\r
@@ -158,7 +158,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
  #ifdef DUMP\r
          dump(block_hists, locations);\r
  #else\r
-        compare(block_hists, locations);\r
+        compare(cv::Mat(block_hists), locations);\r
  #endif\r
      }\r
  \r
@@ -254,31 +254,31 @@ struct CV_GpuHogGetDescriptorsTestRunner : cv::gpu::HOGDescriptor
          ASSERT_TRUE(!img_rgb.empty());\r
          cv::cvtColor(img_rgb, img, CV_BGR2BGRA);\r
          computeBlockHistograms(cv::gpu::GpuMat(img));\r
-        compare_inner_parts(block_hists, descriptors.rowRange(1, 2));\r
+        compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));\r
  \r
          img_rgb = readImage("hog/negative1.png");\r
          ASSERT_TRUE(!img_rgb.empty());\r
          cv::cvtColor(img_rgb, img, CV_BGR2BGRA);\r
          computeBlockHistograms(cv::gpu::GpuMat(img));\r
-        compare_inner_parts(block_hists, descriptors.rowRange(2, 3));\r
+        compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));\r
  \r
          img_rgb = readImage("hog/negative2.png");\r
          ASSERT_TRUE(!img_rgb.empty());\r
          cv::cvtColor(img_rgb, img, CV_BGR2BGRA);\r
          computeBlockHistograms(cv::gpu::GpuMat(img));\r
-        compare_inner_parts(block_hists, descriptors.rowRange(3, 4));\r
+        compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));\r
  \r
          img_rgb = readImage("hog/positive3.png");\r
          ASSERT_TRUE(!img_rgb.empty());\r
          cv::cvtColor(img_rgb, img, CV_BGR2BGRA);\r
          computeBlockHistograms(cv::gpu::GpuMat(img));\r
-        compare_inner_parts(block_hists, descriptors.rowRange(4, 5));\r
+        compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));\r
  \r
          img_rgb = readImage("hog/negative3.png");\r
          ASSERT_TRUE(!img_rgb.empty());\r
          cv::cvtColor(img_rgb, img, CV_BGR2BGRA);\r
          computeBlockHistograms(cv::gpu::GpuMat(img));\r
-        compare_inner_parts(block_hists, descriptors.rowRange(5, 6));\r
+        compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));\r
      }\r
  \r
      // Does not compare border value, as interpolation leads to delta\r
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp

index 3e8abf8..85c5693 100644 (file)
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -3897,7 +3897,7 @@ static void testC2C(const std::string& hint, int cols, int rows, int flags, bool
      EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());\r
      ASSERT_EQ(CV_32F, d_b.depth());\r
      ASSERT_EQ(2, d_b.channels());\r
-    EXPECT_MAT_NEAR(b_gold, d_b, rows * cols * 1e-4);\r
+    EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);\r
  }\r
  \r
  TEST_P(Dft, C2C)\r
diff --git a/samples/gpu/stereo_match.cpp b/samples/gpu/stereo_match.cpp

index 1dce494..39aef1c 100644 (file)
--- a/samples/gpu/stereo_match.cpp
+++ b/samples/gpu/stereo_match.cpp
@@ -206,7 +206,7 @@ void App::run()
          workEnd();\r
  \r
          // Show results\r
-        disp = d_disp;\r
+        d_disp.download(disp);\r
          putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));\r
          imshow("disparity", disp);\r
  \r
diff --git a/samples/gpu/surf_keypoint_matcher.cpp b/samples/gpu/surf_keypoint_matcher.cpp

index b020408..f87fe5f 100644 (file)
--- a/samples/gpu/surf_keypoint_matcher.cpp
+++ b/samples/gpu/surf_keypoint_matcher.cpp
@@ -1,29 +1,29 @@
-#include <iostream>
-
-#include "opencv2/core/core.hpp"
-#include "opencv2/features2d/features2d.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-void help()
-{
-    cout << "\nThis program demonstrates using SURF_GPU features detector, descriptor extractor and BruteForceMatcher_GPU" << endl;
-    cout << "\nUsage:\n\tmatcher_simple_gpu --left <image1> --right <image2>" << endl;
-}
-
-int main(int argc, char* argv[])
-{
-    if (argc != 5)
-    {
-        help();
-        return -1;
-    }
-
-    GpuMat img1, img2;
+#include <iostream>\r
+\r
+#include "opencv2/core/core.hpp"\r
+#include "opencv2/features2d/features2d.hpp"\r
+#include "opencv2/highgui/highgui.hpp"\r
+#include "opencv2/gpu/gpu.hpp"\r
+\r
+using namespace std;\r
+using namespace cv;\r
+using namespace cv::gpu;\r
+\r
+void help()\r
+{\r
+    cout << "\nThis program demonstrates using SURF_GPU features detector, descriptor extractor and BruteForceMatcher_GPU" << endl;\r
+    cout << "\nUsage:\n\tmatcher_simple_gpu --left <image1> --right <image2>" << endl;\r
+}\r
+\r
+int main(int argc, char* argv[])\r
+{\r
+    if (argc != 5)\r
+    {\r
+        help();\r
+        return -1;\r
+    }\r
+\r
+    GpuMat img1, img2;\r
      for (int i = 1; i < argc; ++i)\r
      {\r
          if (string(argv[i]) == "--left")\r
@@ -41,41 +41,41 @@ int main(int argc, char* argv[])
              help();\r
              return -1;\r
          }\r
-    }
-
-    SURF_GPU surf;
-
-    // detecting keypoints & computing descriptors
-    GpuMat keypoints1GPU, keypoints2GPU;
-    GpuMat descriptors1GPU, descriptors2GPU;
-    surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
-    surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
-    
-    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
-    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
-
-    // matching descriptors
-    BruteForceMatcher_GPU< L2<float> > matcher;
-    GpuMat trainIdx, distance;
-    matcher.matchSingle(descriptors1GPU, descriptors2GPU, trainIdx, distance);
-    
-    // downloading results
-    vector<KeyPoint> keypoints1, keypoints2;
-    vector<float> descriptors1, descriptors2;
-    vector<DMatch> matches;
-    surf.downloadKeypoints(keypoints1GPU, keypoints1);
-    surf.downloadKeypoints(keypoints2GPU, keypoints2);
-    surf.downloadDescriptors(descriptors1GPU, descriptors1);
-    surf.downloadDescriptors(descriptors2GPU, descriptors2);
-    BruteForceMatcher_GPU< L2<float> >::matchDownload(trainIdx, distance, matches);
-
-    // drawing the results
-    Mat img_matches;
-    drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
-    
-    namedWindow("matches", 0);
-    imshow("matches", img_matches);
-    waitKey(0);
-
-    return 0;
-}
+    }\r
+\r
+    SURF_GPU surf;\r
+\r
+    // detecting keypoints & computing descriptors\r
+    GpuMat keypoints1GPU, keypoints2GPU;\r
+    GpuMat descriptors1GPU, descriptors2GPU;\r
+    surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);\r
+    surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);\r
+    \r
+    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;\r
+    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;\r
+\r
+    // matching descriptors\r
+    BruteForceMatcher_GPU< L2<float> > matcher;\r
+    GpuMat trainIdx, distance;\r
+    matcher.matchSingle(descriptors1GPU, descriptors2GPU, trainIdx, distance);\r
+    \r
+    // downloading results\r
+    vector<KeyPoint> keypoints1, keypoints2;\r
+    vector<float> descriptors1, descriptors2;\r
+    vector<DMatch> matches;\r
+    surf.downloadKeypoints(keypoints1GPU, keypoints1);\r
+    surf.downloadKeypoints(keypoints2GPU, keypoints2);\r
+    surf.downloadDescriptors(descriptors1GPU, descriptors1);\r
+    surf.downloadDescriptors(descriptors2GPU, descriptors2);\r
+    BruteForceMatcher_GPU< L2<float> >::matchDownload(trainIdx, distance, matches);\r
+\r
+    // drawing the results\r
+    Mat img_matches;\r
+    drawMatches(Mat(img1), keypoints1, Mat(img2), keypoints2, matches, img_matches);\r
+    \r
+    namedWindow("matches", 0);\r
+    imshow("matches", img_matches);\r
+    waitKey(0);\r
+\r
+    return 0;\r
+}\r
author	Vladislav Vinogradov <no@email>
	Wed, 9 Nov 2011 13:13:52 +0000 (13:13 +0000)
committer	Vladislav Vinogradov <no@email>
	Wed, 9 Nov 2011 13:13:52 +0000 (13:13 +0000)
modules/core/include/opencv2/core/core.hpp		patch \| blob \| history
modules/core/include/opencv2/core/devmem2d.hpp	[new file with mode: 0644]	patch \| blob
modules/core/include/opencv2/core/gpumat.hpp	[new file with mode: 0644]	patch \| blob
modules/core/src/gpumat.cpp	[new file with mode: 0644]	patch \| blob
modules/gpu/CMakeLists.txt		patch \| blob \| history
modules/gpu/include/opencv2/gpu/devmem2d.hpp		patch \| blob \| history
modules/gpu/include/opencv2/gpu/gpu.hpp		patch \| blob \| history
modules/gpu/include/opencv2/gpu/gpumat.hpp		patch \| blob \| history
modules/gpu/include/opencv2/gpu/matrix_operations.hpp	[deleted file]	patch \| blob \| history
modules/gpu/perf/perf_arithm.cpp		patch \| blob \| history
modules/gpu/perf/perf_calib3d.cpp		patch \| blob \| history
modules/gpu/perf/perf_filters.cpp		patch \| blob \| history
modules/gpu/perf/perf_imgproc.cpp		patch \| blob \| history
modules/gpu/perf/perf_matop.cpp		patch \| blob \| history
modules/gpu/src/arithm.cpp		patch \| blob \| history
modules/gpu/src/bilateral_filter.cpp		patch \| blob \| history
modules/gpu/src/blend.cpp		patch \| blob \| history
modules/gpu/src/brute_force_matcher.cpp		patch \| blob \| history
modules/gpu/src/calib3d.cpp		patch \| blob \| history
modules/gpu/src/cascadeclassifier.cpp		patch \| blob \| history
modules/gpu/src/color.cpp		patch \| blob \| history
modules/gpu/src/cuda/bf_knnmatch.cu		patch \| blob \| history
modules/gpu/src/cuda/bf_match.cu		patch \| blob \| history
modules/gpu/src/cuda/bf_radius_match.cu		patch \| blob \| history
modules/gpu/src/cuda/bilateral_filter.cu		patch \| blob \| history
modules/gpu/src/cuda/blend.cu		patch \| blob \| history
modules/gpu/src/cuda/calib3d.cu		patch \| blob \| history
modules/gpu/src/cuda/canny.cu		patch \| blob \| history
modules/gpu/src/cuda/color.cu		patch \| blob \| history
modules/gpu/src/cuda/column_filter.cu		patch \| blob \| history
modules/gpu/src/cuda/copy_make_border.cu		patch \| blob \| history
modules/gpu/src/cuda/element_operations.cu		patch \| blob \| history
modules/gpu/src/cuda/hist.cu		patch \| blob \| history
modules/gpu/src/cuda/hog.cu		patch \| blob \| history
modules/gpu/src/cuda/imgproc.cu		patch \| blob \| history
modules/gpu/src/cuda/internal_shared.hpp		patch \| blob \| history
modules/gpu/src/cuda/match_template.cu		patch \| blob \| history
modules/gpu/src/cuda/mathfunc.cu		patch \| blob \| history
modules/gpu/src/cuda/matrix_operations.cu		patch \| blob \| history
modules/gpu/src/cuda/matrix_reductions.cu		patch \| blob \| history
modules/gpu/src/cuda/pyr_down.cu		patch \| blob \| history
modules/gpu/src/cuda/pyr_up.cu		patch \| blob \| history
modules/gpu/src/cuda/remap.cu		patch \| blob \| history
modules/gpu/src/cuda/resize.cu		patch \| blob \| history
modules/gpu/src/cuda/row_filter.cu		patch \| blob \| history
modules/gpu/src/cuda/safe_call.hpp		patch \| blob \| history
modules/gpu/src/cuda/split_merge.cu		patch \| blob \| history
modules/gpu/src/cuda/stereobm.cu		patch \| blob \| history
modules/gpu/src/cuda/stereobp.cu		patch \| blob \| history
modules/gpu/src/cuda/stereocsbp.cu		patch \| blob \| history
modules/gpu/src/cuda/surf.cu		patch \| blob \| history
modules/gpu/src/cudastream.cpp		patch \| blob \| history
modules/gpu/src/element_operations.cpp		patch \| blob \| history
modules/gpu/src/filtering.cpp		patch \| blob \| history
modules/gpu/src/gpumat.cpp	[deleted file]	patch \| blob \| history
modules/gpu/src/hog.cpp		patch \| blob \| history
modules/gpu/src/imgproc.cpp		patch \| blob \| history
modules/gpu/src/initialization.cpp		patch \| blob \| history
modules/gpu/src/match_template.cpp		patch \| blob \| history
modules/gpu/src/matrix_operations.cpp		patch \| blob \| history
modules/gpu/src/matrix_reductions.cpp		patch \| blob \| history
modules/gpu/src/mssegmentation.cpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/color.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/detail/vec_distance_detail.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/emulation.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/filters.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/funcattrib.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/functional.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/limits.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/transform.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/type_traits.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/utility.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/vec_math.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/vec_traits.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/warp.hpp		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/warp_reduce.hpp		patch \| blob \| history
modules/gpu/src/split_merge.cpp		patch \| blob \| history
modules/gpu/src/stereobm.cpp		patch \| blob \| history
modules/gpu/src/stereobp.cpp		patch \| blob \| history
modules/gpu/src/stereocsbp.cpp		patch \| blob \| history
modules/gpu/src/surf.cpp		patch \| blob \| history
modules/gpu/test/test_filters.cpp		patch \| blob \| history
modules/gpu/test/test_hog.cpp		patch \| blob \| history
modules/gpu/test/test_imgproc.cpp		patch \| blob \| history
samples/gpu/stereo_match.cpp		patch \| blob \| history
samples/gpu/surf_keypoint_matcher.cpp		patch \| blob \| history